diff --git a/.gitignore b/.gitignore
index 84245149..29b142b0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,5 @@ node_modules/
 **/model/*.csv
 **/model/*.json
 **/concepts/*.json
+import_solution_metadata/
+solution_metadata/
\ No newline at end of file
diff --git a/agile_operations/network_infrastructure/README.md b/agile_operations/network_infrastructure/README.md
new file mode 100644
index 00000000..50724425
--- /dev/null
+++ b/agile_operations/network_infrastructure/README.md
@@ -0,0 +1,181 @@
+# Network Infrastructure - Cybersecurity (TigerGraph Solution Kit)
+
+Cybersecurity is a crucial aspect of large organizations. Enterprises operate
+their own data centers and network infrastructure involving many different
+devices. Cyberattacks and other incidents can lead to issues such as data
+breaches, corrupted files, and loss of data - resulting in billions of
+dollars lost each year. One key capability for detecting and responding to
+these threats is a deep understanding of your organization's network
+infrastructure and how devices are connected.
+
+TigerGraph allows you to connect data from multiple sources and load data
+at terabyte scale. With TigerGraph visualizations, users can see the
+topology of their network infrastructure, understand how components relate,
+and run graph algorithms at scale to discover related incidents and events
+based on device topology in near real time.
+
+This solution kit provisions a complete graph environment - schema, sample
+data, and GSQL queries - to help you:
+
+- Visualize router-firewall-switch-server topologies.
+- Identify single points of failure and the blast radius of device failures.
+- Trace incidents and alerts through network paths and time.
+- Find unsecured routes and devices with frequent alerts and incidents.
+
+---
+
+## Contents
+
+- [Overview](#overview)
+- [Components](#components)
+- [Prerequisites](#prerequisites)
+- [Setup Instructions](#setup-instructions)
+- [Query Explanations](#query-explanations)
+- [Mock Data](#mock-data)
+- [Insights Application](#insights-application)
+
+---
+
+## Overview
+
+Modern enterprises operate complex on-premises and hybrid data centers with
+routers, firewalls, switches, and servers forming intricate topologies.
+Cyberattacks, misconfigurations, and hardware failures can cascade across
+these networks, causing outages, data loss, and service disruptions.
+
+Graph databases like TigerGraph are well-suited for this type of analysis:
+they can represent topology, events, alerts, and incidents as a connected
+model and traverse it in real time. This solution kit combines:
+
+- **Network devices**
+  A generic `Device` vertex plus specialized device types:
+  `Router`, `Firewall`, `Switch`, and `Server`.
+
+- **Events, alerts, and incidents**
+  `Event`, `Alert`, and `Incident` vertices with associated classification
+  vertices (`Event_Type`, `Alert_Type`, `Incident_Type`) to capture what
+  happened and how it was categorized.
+
+- **Time hierarchy**
+  A time dimension for temporal analysis and visualization:
+  `Time_Year`, `Time_Date_Month`, `Time_Date`, `Time_Date_Hour`,
+  `Time_Date_Minute`.
+
+- **Topology and connectivity**
+  `Connect_To` edges between devices and impact/causal relationships such as
+  `Impacts`, `Linked_With_Alert`, `Linked_With_Incident`, `From_Device`, and
+  `To_Device`, plus relationships along the time hierarchy
+  (`Has_Minute`, `Has_Hour`, `Has_Date`, `Has_Month`, `Has_Year`).
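+
+As a rough illustration of the model above, the core vertex and edge types
+could be declared in GSQL DDL along these lines (a minimal sketch - the
+attribute names here are illustrative, not the kit's actual schema in
+`schema/`):
+
+```gsql
+// Devices and their physical connectivity (illustrative attributes).
+CREATE VERTEX Device (PRIMARY_ID device_id STRING, device_name STRING)
+CREATE VERTEX Event (PRIMARY_ID event_id STRING, event_time DATETIME)
+CREATE UNDIRECTED EDGE Connect_To (FROM Device, TO Device)
+CREATE DIRECTED EDGE Impacts (FROM Event, TO Device)
+```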
+
+You can use the included queries as building blocks for operations,
+security analysis, and incident investigation - or extend the graph with
+your own device types, log sources, or analytics.
+
+> **Graph name:** This kit creates and uses a graph named `Network_Infrastructure`.
+
+---
+
+## Components
+
+This repository includes multiple components:
+
+- `data` - Sample data.
+- `load_jobs` - Scripts for data loading tasks.
+- `meta` - Solution Kit metadata - includes a TG Insights application.
+- `queries` - Collection of GSQL queries.
+- `schema` - Definition of the database schema.
+- `README.md` - This usage guide.
+- `setup.sh` - Automated setup script.
+
+## Prerequisites
+
+Before you run this solution kit, make sure you have:
+
+- **A running TigerGraph instance**
+  - TigerGraph installed and running, or use the prebuilt kit on TigerGraph Cloud.
+  - You must have permission to create graphs and run GSQL commands.
+
+- **GSQL client access**
+  - The `gsql` command-line tool available on the same machine/container where you cloned this repo.
+  - Ability to connect to your TigerGraph service.
+
+- **Network access for sample data**
+  - Outbound internet access from the TigerGraph machine to read the sample
+    CSV files from the public S3 bucket used in `loading_job/load_data.gsql`.
+
+- **Shell environment**
+  - A Unix-like shell (Linux, macOS, or WSL) to run `setup.sh` and `queries/install_queries.sh`.
+  - Executable permissions for the scripts:
+    ```bash
+    chmod +x setup.sh
+    chmod +x queries/install_queries.sh
+    ```
+
+## Setup Instructions
+
+The following instructions assume that the `gsql` command is installed and
+available in the shell where you run these scripts.
+
+If you don't yet have the `gsql` command available, see the TigerGraph documentation:
+
+- **Local GSQL shell on the server**
+  [The GSQL Shell](https://docs.tigergraph.com/tigergraph-server/current/gsql-shell/)
+  (explains how to run `gsql` directly on a TigerGraph server)
+
+- **Remote GSQL client (from your laptop or another machine)**
+  [Using a Remote GSQL Client](https://docs.tigergraph.com/tigergraph-server/current/gsql-shell/using-a-remote-gsql-client)
+  (explains how to download the GSQL client JAR, configure SSL, and create a `gsql` alias)
+
+Ensure that the script is executable:
+```bash
+chmod +x setup.sh
+```
+Then run the automated script:
+```bash
+./setup.sh
+```
+
+The `setup.sh` script streamlines the initial setup by executing the following steps in order:
+
+1. **Schema Creation**: Initiates the schema creation process with the `schema/create_network_infrastructure_graph.gsql` script.
+2. **Data Loading**: Loads data into the schema by running the loading jobs in the `loading_job` folder.
+3. **Query Installation**: Completes the setup by installing the queries through the `queries/install_queries.sh` script.
+
+## Query Explanations
+
+We provide queries for the following tasks (an example call follows this list):
+
+1. **Topology Visualization**: Explore the topology of the devices in the current dataset. There are multiple queries in this group, suitable for different visualizations. The topology can be shown from the routers all the way down to the servers, or downstream from a specific vertex such as a router or switch.
+
+2. **Statistics**: Provide statistics about the current network infrastructure. For instance, find the top devices with the highest number of alerts or incidents.
+
+3. **Device Failure Impact Visualization**: Visualize a potential single point of failure. If one device along a path fails, we want to know whether devices downstream of it can fail as well. A device is flagged as a potential failure if there is no path from it back to a working router.
+
+4. **Unsecured Server Visualization**: Visualize unsecured servers, where it is possible to reach a server from the router without passing through a hardware firewall.
+
+5. **Incident Impact**: Find the impact of an incident based on the device topology within a specified radius.
+
+6. **Events Filtering**: Find events based on filters such as the device impacted by the event and the time range in which the event occurred.
+
+7. **Find Potential Incidents that Cause an Event**: Find the potential source or cause of an event based on the connections between devices. The source or cause is assumed to be an Incident, although this can be changed.
+
+8. **Find Potential Events that Got Affected by an Incident**: Find the events potentially caused by the input incident, based on the connections between devices.
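+
+Once the queries are installed, any of them can be run from the GSQL shell or over the REST++ endpoint. For example (a minimal sketch; it assumes default ports, no authentication, and that the queries are already installed - adjust host and credentials for your deployment):
+
+```bash
+# Top 5 devices by alert count, via the RESTPP query endpoint (port 9000 by default)
+curl -s "http://localhost:9000/query/Network_Infrastructure/top_k_devices_with_most_alerts?k=5"
+```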
+
+## Mock Data
+
+The `data` folder is populated with sample data files. These files are crafted for testing and demonstration purposes.
+
+## Insights Application
+
+We have provided an Insights application called "Network Infrastructure Insights". The application provides users with an intuitive view of the information in the graph.
+
+There are currently four pages in the Insights application:
+- Network Infrastructure Topology Exploration
+- Network Infrastructure Topology Analysis
+- Event Visualizations
+- Event Causes Visualizations
diff --git a/agile_operations/network_infrastructure/queries/device_failure_impact_radius_visualization.gsql b/agile_operations/network_infrastructure/queries/device_failure_impact_radius_visualization.gsql
index c22df26b..ef65600c 100644
--- a/agile_operations/network_infrastructure/queries/device_failure_impact_radius_visualization.gsql
+++ b/agile_operations/network_infrastructure/queries/device_failure_impact_radius_visualization.gsql
@@ -1,6 +1,26 @@
 CREATE OR REPLACE QUERY device_failure_impact_radius_visualization ( VERTEX device ) {
+  /*
+    Query Name:
+      device_failure_impact_radius_visualization
+
+    Purpose:
+      1. Find devices that do NOT have an alternative communication path.
+      2. Determine devices that will be impacted if the input device fails.
+      3. Display impacted devices and their interconnections.
+
+    Concept:
+      - If a device has an alternative path (redundant connectivity), it won't fail.
+      - Devices without alternative paths are marked as 'impacted'.
+
+    Inputs:
+      - device: The device whose failure we are analyzing.
+
+    Outputs:
+      - impacted_devices: Devices that will fail if the input device fails.
+      - @@edges_to_display: Edges between impacted devices for visual impact analysis.
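+
+    Example invocation (the device id and type here are hypothetical; pass
+    any vertex id/type pair that exists in your data):
+      RUN QUERY device_failure_impact_radius_visualization(("SW-0042", "Switch"))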
+  */
 SetAccum<EDGE> @@edges_to_display;
 OrAccum @has_alternative_path;
diff --git a/agile_operations/network_infrastructure/queries/device_failure_impact_radius_visualization_with_subgraph_topology.gsql b/agile_operations/network_infrastructure/queries/device_failure_impact_radius_visualization_with_subgraph_topology.gsql
index 207200de..d77b56fa 100644
--- a/agile_operations/network_infrastructure/queries/device_failure_impact_radius_visualization_with_subgraph_topology.gsql
+++ b/agile_operations/network_infrastructure/queries/device_failure_impact_radius_visualization_with_subgraph_topology.gsql
@@ -1,6 +1,25 @@
 CREATE OR REPLACE QUERY device_failure_impact_radius_visualization_with_subgraph_topology ( VERTEX device ) {
+  /*
+    Query Name:
+      device_failure_impact_radius_visualization_with_subgraph_topology
+
+    Purpose:
+      1. Build the subgraph that contains the input device (based on connectivity).
+      2. Explore and detect devices that have alternative network paths (redundancy).
+      3. Identify devices that will be impacted (fail) if the given device fails.
+      4. Collect and return visualization data: subgraph topology and failure impact edges.
+
+    Input:
+      - device: The starting device whose failure impact we want to analyze.
+
+    Outputs:
+      - all_vertices_in_subgraph: Devices in the connectivity region of the input device.
+      - @@edges_to_display_in_subgraph: Edges representing the overall subgraph structure.
+      - impacted_devices: Devices that do not have an alternative path (will fail).
+      - @@edges_to_display: Edges among impacted devices (failure impact radius).
+  */
 SetAccum<EDGE> @@edges_to_display;
 SetAccum<EDGE> @@edges_to_display_in_subgraph;
 OrAccum @has_alternative_path;
@@ -72,4 +91,4 @@ CREATE OR REPLACE QUERY device_failure_impact_radius_visualization_with_subgraph
 UPDATE DESCRIPTION OF QUERY device_failure_impact_radius_visualization_with_subgraph_topology "This query finds and visualizes the devices that will fail if the provided input device fails, along with the subgraph that the input device is in. Use only for visualization purposes."
-UPDATE DESCRIPTION OF QUERY_PARAM device_failure_impact_radius_visualization_with_subgraph_topology.device "The input device (accepts devices of all types)."
\ No newline at end of file
+UPDATE DESCRIPTION OF QUERY_PARAM device_failure_impact_radius_visualization_with_subgraph_topology.device "The input device (accepts devices of all types)."
diff --git a/agile_operations/network_infrastructure/queries/downstream_device_topology_visualization.gsql b/agile_operations/network_infrastructure/queries/downstream_device_topology_visualization.gsql
index d39e7c98..4a8c8237 100644
--- a/agile_operations/network_infrastructure/queries/downstream_device_topology_visualization.gsql
+++ b/agile_operations/network_infrastructure/queries/downstream_device_topology_visualization.gsql
@@ -2,6 +2,29 @@ CREATE OR REPLACE QUERY downstream_device_topology_visualization (
   VERTEX device,
   UINT k_hop_switch_limit = 3
 ) {
+  /*
+    Query Name:
+      downstream_device_topology_visualization
+
+    Purpose:
+      Visualize the downstream topology from a given device, following the device hierarchy:
+        Router -> Firewall -> Switch -> Server
+      Also explores multiple downstream Switch layers (k-hop depth traversal).
+
+    Key Features:
+      . Identifies all downstream devices, classified by device type.
+      . Follows the device hierarchy dynamically based on the input device type.
+      . Limits multi-hop Switch traversal using k_hop_switch_limit.
+      . Returns devices and connecting edges for visualization.
+
+    Inputs:
+      - device: Starting device.
+      - k_hop_switch_limit: Maximum depth for switch-to-switch iterations (default = 3).
+
+    Outputs:
+      - impacted_devices: All discovered downstream devices.
+      - @@edges_to_display: Edges to visualize the downstream network path.
+  */
 SetAccum<VERTEX> @@impacted_devices;
 SetAccum<EDGE> @@edges_to_display;
diff --git a/agile_operations/network_infrastructure/queries/explore_topology_from_all_router.gsql b/agile_operations/network_infrastructure/queries/explore_topology_from_all_router.gsql
index 4f8862e5..fa22ad03 100644
--- a/agile_operations/network_infrastructure/queries/explore_topology_from_all_router.gsql
+++ b/agile_operations/network_infrastructure/queries/explore_topology_from_all_router.gsql
@@ -1,4 +1,18 @@
 CREATE OR REPLACE QUERY explore_topology_from_all_router () {
+  /*
+    Query Name:
+      explore_topology_from_all_router
+
+    Purpose:
+      Visualize the entire network topology by:
+        - Retrieving all devices in the graph
+        - Exploring all connected edges using Connect_To
+        - Displaying both devices and their connections
+
+    Outputs:
+      - all_devices_with_connections: List of devices connected via Connect_To edges
+      - @@edges_to_display: All edges among connected devices for visualization
+  */
 SetAccum<EDGE> @@edges_to_display;
 all_devices = {Device.*};
@@ -13,4 +27,4 @@ CREATE OR REPLACE QUERY explore_topology_from_all_router () {
   PRINT @@edges_to_display AS edges_to_display;
 }
-UPDATE DESCRIPTION OF QUERY explore_topology_from_all_router "This query visualizes the network topology of all devices in the database. It shows the downstream connections from all routers."
\ No newline at end of file
+UPDATE DESCRIPTION OF QUERY explore_topology_from_all_router "This query visualizes the network topology of all devices in the database. It shows the downstream connections from all routers."
diff --git a/agile_operations/network_infrastructure/queries/explore_topology_from_multiple_routers.gsql b/agile_operations/network_infrastructure/queries/explore_topology_from_multiple_routers.gsql
index 2f7f345a..169defad 100644
--- a/agile_operations/network_infrastructure/queries/explore_topology_from_multiple_routers.gsql
+++ b/agile_operations/network_infrastructure/queries/explore_topology_from_multiple_routers.gsql
@@ -1,4 +1,27 @@
 CREATE OR REPLACE QUERY explore_topology_from_multiple_routers (SET<VERTEX<Router>> starter_router_set) {
+  /*
+    Query Name:
+      explore_topology_from_multiple_routers
+
+    Purpose:
+      Visualizes the downstream network topology starting from one or more routers.
+      Traverses devices in this structured order:
+        Router -> Firewall -> Switch -> Server
+
+    Key Features:
+      Uses BFS to exhaustively discover downstream switches.
+
+    Inputs:
+      starter_router_set - Set of router vertices to start the topology exploration.
+                           If empty, all routers in the graph are selected automatically.
+
+    Outputs:
+      - all_visited_router_devices - Routers connected to input routers via Device_Has_Type
+      - all_visited_firewalls - First downstream stage (Firewalls)
+      - all_visited_switches - All connected Switches (multi-hop via BFS)
+      - all_visited_servers - Downstream Servers
+      - @@edges_to_display - All edges needed to display the full topology
+  */
 OrAccum @visited;
 SetAccum<EDGE> @@edges_to_display;
@@ -85,4 +108,4 @@ CREATE OR REPLACE QUERY explore_topology_from_multiple_routers (SET<VERTEX<Router
diff --git a/agile_operations/network_infrastructure/queries/explore_topology_from_one_router.gsql b/agile_operations/network_infrastructure/queries/explore_topology_from_one_router.gsql
--- a/agile_operations/network_infrastructure/queries/explore_topology_from_one_router.gsql
+++ b/agile_operations/network_infrastructure/queries/explore_topology_from_one_router.gsql
 CREATE OR REPLACE QUERY explore_topology_from_one_router (VERTEX<Router> starter_router) {
+  /*
+    Query Name:
+      explore_topology_from_one_router
+
+    Purpose:
+      Visualizes the downstream network topology starting from a single router.
+      Traverses devices in this structured order:
+        Router -> Firewall -> Switch -> Server
+
+    Key Features:
+      . Discovers all downstream Firewalls and Switches (including bypass paths)
+      . Uses BFS to find all connected Switches (multi-hop exploration)
+      . Captures all edges forming the full topology view
+
+    Input:
+      starter_router - A single Router vertex that acts as the exploration starting point.
+
+    Outputs:
+      - all_visited_router_devices - Devices directly connected to the starter router via Device_Has_Type
+      - all_visited_firewalls - Firewalls downstream of the router
+      - all_visited_switches - All discovered Switches (including BFS expansion)
+      - all_visited_servers - Servers connected downstream of switches
+      - @@edges_to_display - Complete collection of edges forming the explored topology
+  */
 OrAccum @visited;
 SetAccum<EDGE> @@edges_to_display;
@@ -80,4 +104,4 @@ CREATE OR REPLACE QUERY explore_topology_from_one_router (VERTEX<Router> starter
 UPDATE DESCRIPTION OF QUERY explore_topology_from_one_router "This query visualizes the network topology starting from the router 'starter_router'."
-UPDATE DESCRIPTION OF QUERY_PARAM explore_topology_from_one_router.starter_router "The input router device to explore the network topology."
\ No newline at end of file
+UPDATE DESCRIPTION OF QUERY_PARAM explore_topology_from_one_router.starter_router "The input router device to explore the network topology."
diff --git a/agile_operations/network_infrastructure/queries/find_events_by_impacted_device_and_time_range.gsql b/agile_operations/network_infrastructure/queries/find_events_by_impacted_device_and_time_range.gsql
index 924cfcf9..badf7cc1 100644
--- a/agile_operations/network_infrastructure/queries/find_events_by_impacted_device_and_time_range.gsql
+++ b/agile_operations/network_infrastructure/queries/find_events_by_impacted_device_and_time_range.gsql
@@ -4,6 +4,35 @@ CREATE OR REPLACE QUERY find_events_by_impacted_device_and_time_range (
   DATETIME end_time,
   BOOL show_event_types_vis = FALSE
 ) {
+  /*
+    Query Name:
+      find_events_by_impacted_device_and_time_range
+
+    Purpose:
+      Finds and visualizes events linked to a specific device within a given time range.
+      Also traces the time hierarchy (Minute -> Hour -> Date -> Month -> Year) for chronological visualization.
+      Optionally includes detailed event type information (Alert, Incident, and their classifications).
+
+    Key Features:
+      . Filters events by impacted device and time window.
+      . Visualizes the full time-based hierarchy for each event.
+      . Option to include related event type, alert, and incident classifications.
+      . Collects all traversal edges for easy visualization.
+
+    Inputs:
+      input_device - The impacted device to search events for.
+      start_time - Minimum timestamp of events to include.
+      end_time - Maximum timestamp of events to include.
+      show_event_types_vis - (Optional) If TRUE, includes event/alert/incident types.
+
+    Outputs:
+      - linked_events_within_time - Events impacting the input device within the time range.
+      - linked_time_date_minute / hour / date / month / year - Chronologically related time vertices.
+      - linked_event_types - Event classification (if enabled).
+      - linked_alerts_within_time, linked_incidents_within_time - Associated alerts and incidents.
+      - linked_alert_types, linked_incident_types - Alert/Incident categories.
+      - @@edges_to_display - All edges used to visualize event and time relationships.
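+
+    Example invocation (hypothetical device id and time range; adjust to your data):
+      RUN QUERY find_events_by_impacted_device_and_time_range(("SRV-0001", "Server"), "2024-03-01 00:00:00", "2024-03-02 00:00:00", TRUE)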
+  */
 SetAccum<EDGE> @@edges_to_display;
 input_device_set = {input_device};
diff --git a/agile_operations/network_infrastructure/queries/find_events_by_time_range_and_event_type.gsql b/agile_operations/network_infrastructure/queries/find_events_by_time_range_and_event_type.gsql
index 99d6ade8..be815431 100644
--- a/agile_operations/network_infrastructure/queries/find_events_by_time_range_and_event_type.gsql
+++ b/agile_operations/network_infrastructure/queries/find_events_by_time_range_and_event_type.gsql
@@ -3,6 +3,30 @@ CREATE OR REPLACE QUERY find_events_by_time_range_and_event_type (
   DATETIME end_time,
   STRING input_event_type_filter = ""
 ) {
+  /*
+    Query Name:
+      find_events_by_time_range_and_event_type
+
+    Purpose:
+      Retrieves events occurring within a specific time range and optionally filters by event type.
+      Additionally, collects related metadata such as impacted devices, alert types,
+      and incident types for comprehensive event analysis.
+
+    Inputs:
+      start_time - Minimum datetime filter for event retrieval.
+      end_time - Maximum datetime filter for event retrieval.
+      input_event_type_filter - (Optional) Filters by event type. If empty, all types are included.
+
+    Outputs:
+      - selected_events_with_info:
+        . event_id - Event identifier
+        . event_time - Timestamp of the event
+        . event_type - Type classification (Security, System, Network, etc.)
+        . event_alert_type - Enriched alert type data (if any)
+        . event_incident_type - Enriched incident type data (if any)
+        . impacted_devices_list - Devices affected by this event
+  */
 MaxAccum<STRING> @event_type;
 MaxAccum<STRING> @incident_type;
 MaxAccum<STRING> @alert_type;
diff --git a/agile_operations/network_infrastructure/queries/find_potential_incident_source_of_event_by_time.gsql b/agile_operations/network_infrastructure/queries/find_potential_incident_source_of_event_by_time.gsql
index c88da0ca..787d2052 100644
--- a/agile_operations/network_infrastructure/queries/find_potential_incident_source_of_event_by_time.gsql
+++ b/agile_operations/network_infrastructure/queries/find_potential_incident_source_of_event_by_time.gsql
@@ -3,6 +3,31 @@ CREATE OR REPLACE QUERY find_potential_incident_source_of_event_by_time (
   INT max_radius = 3,
   INT num_seconds_before_event_start = 3600
 ) {
+  /*
+    Query Name:
+      find_potential_incident_source_of_event_by_time
+
+    Purpose:
+      Identifies potential root-cause incidents for a given input event by:
+        . Retrieving the devices impacted by the event
+        . Exploring connected devices within a specified hop radius
+        . Scanning for past incident events within a defined timeframe
+        . Linking discovered incidents with their incident types
+
+    Inputs:
+      input_event - The event vertex to investigate potential source incidents for
+      max_radius - Max number of hops allowed for device connectivity exploration (default: 3)
+      num_seconds_before_event_start - Time window (in seconds) before input_event.timestamp to search for related incidents (default: 3600)
+
+    Outputs:
+      . input_event_set - Original event input
+      . related_devices_within_radius - Connected devices marked by event radius
+      . incident_events_from_related_devices - Incident events found in the time range
+      . incidents_from_related_devices - Linked incident entities
+      . incident_types_of_related_devices - Final categorized incident types
+      . @@edges_to_display - All traversal edges for visualization/UI mapping
+  */
 SetAccum<VERTEX> @@related_devices_set;
 SetAccum<EDGE> @@edges_to_display;
 MinAccum<DATETIME> @@end_time_accum;
@@ -129,4 +154,4 @@ UPDATE DESCRIPTION OF QUERY find_potential_incident_source_of_event_by_time "Thi
 UPDATE DESCRIPTION OF QUERY_PARAM find_potential_incident_source_of_event_by_time.input_event "The input event for which to find potential source incidents."
 UPDATE DESCRIPTION OF QUERY_PARAM find_potential_incident_source_of_event_by_time.max_radius "The maximum number of hops extending from the device(s) the input event has an impact on. Defaults to 3."
-UPDATE DESCRIPTION OF QUERY_PARAM find_potential_incident_source_of_event_by_time.num_seconds_before_event_start "The number of seconds before an event starts to search for incidents. Defaults to 3600 seconds."
\ No newline at end of file
+UPDATE DESCRIPTION OF QUERY_PARAM find_potential_incident_source_of_event_by_time.num_seconds_before_event_start "The number of seconds before an event starts to search for incidents. Defaults to 3600 seconds."
diff --git a/agile_operations/network_infrastructure/queries/find_potential_related_events_from_incident_by_time.gsql b/agile_operations/network_infrastructure/queries/find_potential_related_events_from_incident_by_time.gsql
index e27fbee7..aecdab4b 100644
--- a/agile_operations/network_infrastructure/queries/find_potential_related_events_from_incident_by_time.gsql
+++ b/agile_operations/network_infrastructure/queries/find_potential_related_events_from_incident_by_time.gsql
@@ -3,6 +3,35 @@ CREATE OR REPLACE QUERY find_potential_related_events_from_incident_by_time (
   INT max_radius = 3,
   INT num_seconds_from_incident_start = 3600
 ) {
+  /*
+    Query Name:
+      find_potential_related_events_from_incident_by_time
+
+    Purpose:
+      Identifies events that could be related to a given input incident. It does so by:
+        . Discovering all devices impacted by the incident
+        . Expanding outward through connected devices within max_radius hops
+        . Searching for events (Alerts, Incidents) that occurred within a time window
+          starting from the incident occurrence time
+        . Classifying and linking detected related alerts and incidents with their types
+
+    Inputs:
+      input_incident - Starting Incident vertex for event correlation analysis
+      max_radius - Max number of hops to discover connected impacted devices (default: 3)
+      num_seconds_from_incident_start - Time window (in seconds) after the incident start to look for related events (default: 3600)
+
+    Outputs:
+      . input_incident_set - Original input incident
+      . linked_event - Event directly linked to the input incident
+      . impacted_devices_within_radius - All devices reached via radius traversal
+      . alerts_from_impacted_devices - Related Alerts discovered in the time range
+      . incidents_from_impacted_devices - Related Incidents discovered
+      . alert_types_of_impacted_devices - Enriched alert category details
+      . incident_types_of_impacted_devices - Enriched incident category details
+      . @@edges_to_display - All collected edges for UI / graph visualization
+  */
 SetAccum<VERTEX> @@impacted_devices_set;
 SetAccum<EDGE> @@edges_to_display;
 MinAccum<DATETIME> @@start_time_accum;
diff --git a/agile_operations/network_infrastructure/queries/find_unsecured_servers_visualization.gsql b/agile_operations/network_infrastructure/queries/find_unsecured_servers_visualization.gsql
index be31a400..87ac40fe 100644
--- a/agile_operations/network_infrastructure/queries/find_unsecured_servers_visualization.gsql
+++ b/agile_operations/network_infrastructure/queries/find_unsecured_servers_visualization.gsql
@@ -1,4 +1,27 @@
-CREATE OR REPLACE QUERY find_unsecured_servers_visualization (UINT k_hop_switch_limit = 3) {
+CREATE OR REPLACE QUERY find_unsecured_servers_visualization (UINT k_hop_switch_limit = 3) {
+  /*
+    Query: find_unsecured_servers_visualization
+
+    Purpose:
+      Visualizes unsecured network paths from Routers to Servers through Switches.
+      It identifies Servers that are reachable via Switches without passing through
+      security devices (such as Firewalls).
+
+    What It Does:
+      . Finds all Routers, Switches, and Servers.
+      . Traverses paths from Routers -> Switches -> Servers using Connect_To edges.
+      . Expands through Switch-to-Switch connections up to 'k_hop_switch_limit' hops.
+      . Collects all involved vertices and edges for visualization.
+
+    Key Outputs:
+      - routers_to_display -> Starting routers
+      - switches_to_display -> Switches on the unsecured path
+      - servers_to_display -> Potentially unsecured servers
+      - edges_to_display -> All traversal edges for graph visualization
+
+    Parameter:
+      k_hop_switch_limit -> Maximum number of Switch-to-Switch traversal hops (default: 3)
+  */
 SetAccum<VERTEX> @@routers_to_display;
 SetAccum<VERTEX> @@switches_to_display;
diff --git a/agile_operations/network_infrastructure/queries/incident_impact_by_max_radius.gsql b/agile_operations/network_infrastructure/queries/incident_impact_by_max_radius.gsql
index ad6e3a9d..4ccd3777 100644
--- a/agile_operations/network_infrastructure/queries/incident_impact_by_max_radius.gsql
+++ b/agile_operations/network_infrastructure/queries/incident_impact_by_max_radius.gsql
@@ -2,6 +2,31 @@ CREATE OR REPLACE QUERY incident_impact_by_max_radius (
   VERTEX<Incident> input_incident,
   INT max_radius = 4
 ) {
+  /*
+    Query: incident_impact_by_max_radius
+
+    Purpose:
+      Determines how far an incident can propagate through the network.
+      It finds all devices that may be impacted by a given incident,
+      up to a specified hop limit (max_radius).
+
+    What It Does:
+      1. Gets the Event linked to the input Incident.
+      2. Finds devices directly impacted by the incident (radius 0).
+      3. Iteratively explores additional devices connected via Connect_To edges,
+         marking each with its hop distance (incident_radius).
+      4. Collects all impacted devices and edges for visualization.
+
+    Key Outputs:
+      - input_incident_set -> The provided incident
+      - linked_event -> Event associated with the incident
+      - impacted_devices_within_radius -> All potentially impacted devices
+      - @@edges_to_display -> Edges used during traversal
+
+    Parameters:
+      max_radius -> Max number of hops to explore propagation (default: 4)
+  */
 SetAccum<VERTEX> @@impacted_devices_set;
 SetAccum<EDGE> @@edges_to_display;
 OrAccum @visited;
@@ -61,4 +86,4 @@ CREATE OR REPLACE QUERY incident_impact_by_max_radius (
 UPDATE DESCRIPTION OF QUERY incident_impact_by_max_radius "This query finds the devices that can be impacted by an incident within 'max_radius' hops away from the devices directly impacted by the event."
 UPDATE DESCRIPTION OF QUERY_PARAM incident_impact_by_max_radius.input_incident "The input incident to find devices that can be impacted by it."
-UPDATE DESCRIPTION OF QUERY_PARAM incident_impact_by_max_radius.max_radius "The maximum number of hops extending from the device(s) the input incident has an impact on. Defaults to 4."
\ No newline at end of file
+UPDATE DESCRIPTION OF QUERY_PARAM incident_impact_by_max_radius.max_radius "The maximum number of hops extending from the device(s) the input incident has an impact on. Defaults to 4."
diff --git a/agile_operations/network_infrastructure/queries/top_k_devices_with_most_alerts.gsql b/agile_operations/network_infrastructure/queries/top_k_devices_with_most_alerts.gsql
index 16b2a08f..c2214304 100644
--- a/agile_operations/network_infrastructure/queries/top_k_devices_with_most_alerts.gsql
+++ b/agile_operations/network_infrastructure/queries/top_k_devices_with_most_alerts.gsql
@@ -1,4 +1,23 @@
 CREATE OR REPLACE QUERY top_k_devices_with_most_alerts (INT k) {
+  /*
+    Query: top_k_devices_with_most_alerts
+
+    Purpose:
+      Identifies the devices that are associated with the highest number of alerts.
+      Useful for prioritizing monitoring, troubleshooting, and risk assessment.
+
+    What It Does:
+      1. Counts how many alerts are linked to each Event.
+      2. Aggregates alert counts from Events to connected Devices.
+      3. Sorts devices by total alert frequency in descending order.
+      4. Returns the top K most alert-prone devices.
+
+    Key Outputs:
+      - devices -> Top K devices with the highest aggregated alert_count
+
+    Parameter:
+      k -> Number of devices to return (top K)
+  */
 SumAccum<INT> @alert_count;
diff --git a/agile_operations/network_infrastructure/queries/top_k_devices_with_most_incidents.gsql b/agile_operations/network_infrastructure/queries/top_k_devices_with_most_incidents.gsql
index 80721d69..7320c9d3 100644
--- a/agile_operations/network_infrastructure/queries/top_k_devices_with_most_incidents.gsql
+++ b/agile_operations/network_infrastructure/queries/top_k_devices_with_most_incidents.gsql
@@ -1,4 +1,23 @@
 CREATE OR REPLACE QUERY top_k_devices_with_most_incidents (INT k) {
+  /*
+    Query: top_k_devices_with_most_incidents
+
+    Purpose:
+      Identifies devices that are associated with the highest number of incidents.
+      Helps in recognizing critical devices prone to failures, security breaches, or outages.
+
+    What It Does:
+      1. Counts the number of incidents linked to each Event.
+      2. Propagates incident counts from Events to connected Devices.
+      3. Sorts devices by total incident frequency in descending order.
+      4. Returns the top K devices with the most incidents.
+
+    Key Output:
+      - devices -> List of top K devices ranked by incident_count
+
+    Parameter:
+      k -> Number of top devices to return.
+  */
 SumAccum<INT> @incident_count;
diff --git a/agile_operations/network_infrastructure/readme.md b/agile_operations/network_infrastructure/readme.md
deleted file mode 100644
index 4caab760..00000000
--- a/agile_operations/network_infrastructure/readme.md
+++ /dev/null
@@ -1,60 +0,0 @@
-# Narratives
-
-Cybersecurity is a crucial aspect of big organizations. Enterprises have their own data centers and their own network infrastructure involving a lot of devices. Cyberattacks and other incidents can lead to issues such as data breach, corrupted files, and loss of data, resulting in billions of dollars lost each year. One aspect that can help with detecting those attacks and other incidents is to gain a better understanding of the network infrastructure of your organization.
- -TigerGraph allows you to connect data from different sources and load data in scale of terabytes into TigerGraph. With visualizations in TigerGraph, users can gain a better visibility of the platform by seeing different components and the topology of their Network Infrastructure. Different graph algorithms can be run at scale and allows for discovery of related incidents and events based on the device topology close to real-time. - -# Components - -This repository includes multiple components: - -- `data` - Sample data. -- `load_jobs` - Scripts for data loading tasks. -- `meta` - Solution Kit metadata. -- `queries` - Collection of GSQL queries. -- `schema` - Definition of database schema. -- `readme.md` - This usage guide. -- `setup.sh` - Automated setup script. - -# Instructions - -The `setup.sh` script is designed to streamline the initial setup process by sequentially executing the following steps: - -1. **Schema Creation**: Initiates the schema creation process with the `schema/create_network_infrastructure_graph.gsql` script. -2. **Data Loading**: Load data into the schema by running the data loading jobs with the scripts in the `loading_job` folder. -3. **Query Installation**: Completes the setup by installing necessary queries through the `queries/install_queries.sh` script. - -## Query Explanations - -We have different queries to perform the following tasks: - -1. **Topology Visualization**: Explore the topology of the devices in the current dataset. There are multiple queries in this group that are suitable for different visualizations. The topology can be shown starting from router all the way down to the server, or it can be shown downstream, starting from other vertex like a router or switch. - -2. **Statistics**: Provide more statistics into the current network infrastructure. For instance, we can find top devices with highest amount of alerts or incidents. - -3. **Device Failure Impact Visualization**: Provide visualizations for a potential single point of failure. If one device along the path fails, we want to know if it's possible for other devices downstream to fail as well. The criteria for potential failed device is if there is no path from that device back to the working router. - -4. **Unsecure Server Visualization**: Provide visualizations of unsecured servers where it is possible to go from the router to the server without passing through the hardware firewall. - -5. **Incident Impact**: Find the impact of an incident based on the topology of the device within a specified radius. - -6. **Events Filtering**: Find the events based on some filters, like the device that is impacted by the event and the time range that the event occurs. - -7. **Find Potential Incidents that Cause an Event**: Find the potential source or cause of an event based on the connection of the different devices. The source or cause of the event has to be an Incident, although this can be changed. - -8. **Find Potential Events that Got Affected by an Incident**: Find the potential events that are caused by the input incident based on the connections of the different devices. - -## Mock Data - -The `data` folder is populated with sample data files. These files are crafted for testing and demonstration purposes. - - -## Insights Application - -We have provided an insights application called "Network Infrastructure Insights". The applications provided the users with an intuitive view of the information in the graph. 
-
-There are currently 4 pages in the Insights Application :
-- Network Infrastructure Topology Exploration
-- Network Infrastructure Topology Analysis
-- Event Visualizations
-- Event Causes Visualizations
diff --git a/agile_operations/supply_chain_management/README.md b/agile_operations/supply_chain_management/README.md
new file mode 100644
index 00000000..e079a0cd
--- /dev/null
+++ b/agile_operations/supply_chain_management/README.md
@@ -0,0 +1,183 @@
+# Supply Chain Management (TigerGraph Solution Kit)
+
+The Supply Chain Management solution for TigerGraph delivers a complete,
+enterprise-ready graph that models an end-to-end manufacturing and distribution
+supply chain. It is built for manufacturers and distributors and mirrors
+real-world ERP structures such as **Material Masters**, multi-level **Bills of
+Materials (BOMs)**, **Purchase Orders**, **Sales Orders**, **Inventory**,
+**Production Batches (SFC)**, **Plants**, **Shipments**, and full batch
+genealogy.
+
+Unlike traditional relational warehouses, which struggle with deep, recursive
+questions, this graph enables real-time, highly connected analytics, including:
+
+- Impact analysis of **plant** or **supplier** failures.
+- Full **forward/backward traceability** of production batches.
+- Real-time **Available-to-Promise (ATP)**, with optional inventory freshness rules.
+- **Country-of-origin** and domestic vs. international sourcing analysis.
+- **Demand-supply matching** and identification of bottlenecks or constraints.
+
+By leveraging TigerGraph's native parallel engine and bi-directional traversals,
+analysts, planners, sourcing managers, and risk teams can get instant answers to
+complex multi-hop questions, improving agility, resilience, and decision quality
+across the supply chain.
+
+---
+
+## Contents
+
+- [Overview](#overview)
+- [Components](#components)
+- [Prerequisites](#prerequisites)
+- [Setup Instructions](#setup-instructions)
+- [Query Execution Order and Explanations](#query-execution-order-and-explanations)
+- [Scalability](#scalability)
+- [Mock Data](#mock-data)
+
+---
+
+## Overview
+
+This solution kit models the **end-to-end flow** of materials and orders across
+a manufacturing supply chain:
+
+- **Master data**
+  - `Material` and `BOM` for finished goods, subassemblies, and components.
+  - `Supplier`, `Customer`, `Plant`, `Nation`, `Region`, `Feature`.
+- **Production & genealogy**
+  - `SFC_Material` and `SFC_Assembly` represent actual production batches and
+    assembly operations.
+  - Graph edges capture "produced at", "used for", and BOM relationships.
+- **Orders & pricing**
+  - `Purchase_Order`, `Line_Number` for inbound procurement.
+  - `Sales_Order`, `Sales_Order_Item` for outbound customer demand.
+- **Inventory & movements**
+  - `Inventory_Held` for stock levels by location.
+  - `Material_Document_Item` for stock movements and their impact.
+- **Shipments & logistics**
+  - `Shipment_Doc`, `Shipment_Item`, `Shipping_Doc`, `Delivery_Doc` for
+    transportation and fulfillment.
+- **Geography & ownership**
+  - `Belongs_To`, `is_located_in` edges connect plants, suppliers, and
+    customers to `Nation` and `Region`, enabling regional risk and origin
+    analysis.
+
+> **Graph name:** This kit creates and uses a graph named `Supply_Chain_Management`.
+
+You can use the included queries for operations, planning, risk, and analytics - or extend the graph with your own entities, KPIs, and business rules.
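+
+As a sketch of such an extension, a custom KPI can be added as a plain GSQL query on top of the kit's schema. The edge name `Placed_By` below is hypothetical - substitute the actual edge connecting `Customer` and `Sales_Order` from `schema/`:
+
+```gsql
+// Count sales orders per customer (illustrative; the Placed_By edge name is hypothetical).
+CREATE QUERY sales_orders_per_customer () FOR GRAPH Supply_Chain_Management {
+  SumAccum<INT> @order_count;
+  custs = SELECT c
+          FROM Sales_Order:o -(Placed_By)- Customer:c
+          ACCUM c.@order_count += 1;
+  // Print each customer together with its accumulated order count.
+  PRINT custs[custs.@order_count];
+}
+```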
+
+---
+
+## Components
+
+This repository includes multiple components:
+
+* `data` - Realistic sample dataset (CSV files) with master and transactional data.
+* `load_jobs` - Data loading jobs and scripts (supports offline batch and optional AWS S3 loading).
+* `meta` - Solution Kit metadata - includes the Insights application.
+* `queries` - Complete set of 13 production-ready GSQL analytical queries.
+* `schema` - Full graph schema definition (vertices, edges, indexes).
+* `README.md` - This usage guide.
+* `setup.sh` - One-click automated installation script.
+
+## Prerequisites
+
+Before you run this solution kit, make sure you have:
+
+- **A running TigerGraph instance**
+  - TigerGraph installed and running, or use the prebuilt kit on TigerGraph Cloud.
+  - You must have permission to create graphs and run GSQL commands.
+
+- **GSQL client access**
+  - The `gsql` command-line tool available on the same machine/container where you cloned this repo.
+  - Ability to connect to your TigerGraph service.
+
+- **Network access for sample data**
+  - Outbound internet access from the TigerGraph machine to read the sample
+    CSV files from the public S3 bucket used in `loading_job/load_data.gsql`.
+
+- **Shell environment**
+  - A Unix-like shell (Linux, macOS, or WSL) to run `setup.sh` and `queries/install_queries.sh`.
+  - Executable permissions for the scripts:
+    ```bash
+    chmod +x setup.sh
+    chmod +x queries/install_queries.sh
+    ```
+
+## Setup Instructions
+
+The following instructions assume that the `gsql` command is installed and
+available in the shell where you run these scripts.
+
+If you don't yet have the `gsql` command available, see the TigerGraph documentation:
+
+- **Local GSQL shell on the server**
+  [The GSQL Shell](https://docs.tigergraph.com/tigergraph-server/current/gsql-shell/)
+  (explains how to run `gsql` directly on a TigerGraph server)
+
+- **Remote GSQL client (from your laptop or another machine)**
+  [Using a Remote GSQL Client](https://docs.tigergraph.com/tigergraph-server/current/gsql-shell/using-a-remote-gsql-client)
+  (explains how to download the GSQL client JAR, configure SSL, and create a `gsql` alias)
+
+Ensure that the script is executable:
+```bash
+chmod +x setup.sh
+```
+Then run the automated script:
+```bash
+./setup.sh
+```
+
+The `setup.sh` script streamlines the initial setup by executing the following steps in order:
+
+1. **Schema Creation**: Creates the global graph `Supply_Chain_Management` and runs all schema definition files located in the `schema/` folder.
+
+2. **Data Loading**: Executes all loading jobs defined in `load_jobs/` to populate master data (Materials, Plants, BOMs) and transactional data (Purchase Orders, Sales Orders, Inventory, Shipments, SFC batches, etc.).
+
+3. **Query Installation**: Automatically installs the full library of 13 analytical queries from the `queries/` folder.
+
+After the script finishes (typically within minutes), the graph is fully loaded and ready for immediate use in GraphStudio or via the REST API.
+
+## Query Execution Order and Explanations
+
+The queries are independent and can be executed in any order for day-to-day analysis. There is no mandatory sequential workflow except when performing inventory simulation or batch allocation.
+
+### Core Operational Queries (run on-demand)
+- `product_quantity_sales_order` - Standard Available-to-Promise (ATP): checks whether sufficient quantity exists for a Sales Order.
+- `datewise_product_availability` - Smart ATP with freshness: only considers inventory updated within the last X months.
+- `unfulfilled_orders` - Batch allocation engine: processes a set of Sales Orders and virtually reserves inventory.
+- `check_shipment_capacity` - Compares ordered vs. shipped quantities to find partially fulfilled items.
+- `add_purchase_to_inventory` - Simulates the effect of receiving a Purchase Order on current stock levels.
+
+### Traceability & Risk Queries (recursive - run anytime)
+- `explore_BOM` - Explodes a Bill of Materials downstream or traces where-used upstream to any depth.
+- `explore_BOM_line` - Traces the actual physical genealogy of a specific batch (SFC) across production and consumption.
+- `plant_failure_impact_nodes` - BFS impact analysis: returns all customers, orders, and finished goods affected if a Plant fails.
+- `trace_BOM_line_nations` - Calculates the exact percentage of domestic vs. international raw materials in a finished good.
+
+### Analytics & Reporting Queries (aggregation - run on-demand)
+- `get_biggest_customers` - Top-K customers by ordered quantity (optional date range and nation filter).
+- `get_biggest_suppliers` - Top-K suppliers by supplied quantity (optional date range and nation filter).
+- `top_k_products_sales_order` - Most demanded finished goods across all Sales Orders.
+- `top_k_products_purchase_order` - Most purchased raw/semi-finished materials.
+
+All queries are distributed, highly parameterized (depth, date ranges, top-K, nation filters, etc.), and complete in seconds even on datasets with millions of orders and multi-level BOMs.
+
+## Scalability
+
+The heaviest queries (`explore_BOM`, `explore_BOM_line`, `plant_failure_impact_nodes`, `unfulfilled_orders`) use TigerGraph's native parallel traversal and recursion engines and scale linearly across the cluster.
+
+For extremely large supply chains (hundreds of millions of SFC instances or deep BOMs):
+- Depth/hop limits are built into the queries to guarantee predictable runtime.
+- When an enormous number of temporary edges is expected (e.g., massive impact analysis), configure the engine to spill edges to disk instead of RAM:
+
+```bash
+gadmin config set GPE.EdgeDataMemoryLimit 0
+gadmin config apply -y
+gadmin restart -y
+```
+
+## Mock Data
+
+The `data` folder is populated with sample data files. These files are crafted for testing and demonstration purposes.
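+
+As a quick smoke test after setup, you can invoke one of the installed queries from the GSQL shell. The parameter values below are illustrative, and the empty set skips the nation filter:
+
+```bash
+# Top 10 customers by ordered quantity over 2024, no nation filter
+gsql -g Supply_Chain_Management 'RUN QUERY get_biggest_customers(10, "2024-01-01 00:00:00", "2024-12-31 23:59:59", [])'
+```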
diff --git a/agile_operations/supply_chain_management/meta/Insights_Supply_Chain.json b/agile_operations/supply_chain_management/meta/Insights_Supply_Chain.json new file mode 100644 index 00000000..73a13827 --- /dev/null +++ b/agile_operations/supply_chain_management/meta/Insights_Supply_Chain.json @@ -0,0 +1,1262 @@ +{ + "defaultGraph": "Supply_Chain_Management", + "globalVariables": [], + "iconURL": "/insights/static/media/factory.4354dc741f4585127af6b70c9821a211.svg", + "id": "3fjAZmRYTeABxn4ncYY7M3", + "owner": "tigergraph", + "pageConfigSeparated": true, + "pages": [ + { + "chartMap": { + "6wa4sFXLXHJmWQzGJeDojT": { + "chartSettings": { + "category": [ + { + "id": "v_id", + "type": "string" + } + ], + "showColumns": [ + { + "isChecked": true, + "name": "v_id" + }, + { + "isChecked": false, + "name": "v_type" + }, + { + "isChecked": true, + "name": "customers.@total_quantity" + } + ], + "sortedColumns": [ + "v_id", + "customers.@total_quantity" + ], + "tableColumns": [ + { + "id": "Matched pattern", + "isChecked": "true", + "label": "" + }, + { + "children": [], + "id": "Vertices", + "isChecked": "true", + "isExpanded": true, + "label": "Vertices" + }, + { + "children": [], + "id": "Edges", + "isChecked": "true", + "isExpanded": false, + "label": "Edges" + } + ], + "tableHeaders": [ + "v_id", + "v_type", + "customers.@total_quantity" + ], + "value": [ + { + "id": "customers.@total_quantity", + "type": "number" + } + ] + }, + "graphName": "Supply_Chain_Management", + "hideWidgetName": false, + "id": "6wa4sFXLXHJmWQzGJeDojT", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [ + { + "data": "get_biggest_customers", + "id": "get_biggest_customers", + "type": "QUERY" + }, + { + "data": "Top_K", + "id": "d01266d0-d7b3-451d-bb0b-71969d3dac88", + "paramGlobalInput": "Top_K", + "paramName": "top_k", + "paramType": "INT", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "", + "id": "e644912d-4d39-4544-8b97-afa36fbfcbdc", + "paramGlobalInput": "", + "paramName": "start_date", + "paramType": "DATETIME", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "", + "id": "ad9bd726-e56d-4edf-b21e-7c1ef2bb66aa", + "paramGlobalInput": "", + "paramName": "end_date", + "paramType": "DATETIME", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "", + "elementType": "VERTEX", + "id": "b91d25e5-d27a-43d5-9b2b-0e2ef77bc969", + "paramGlobalInput": "", + "paramName": "input_nations", + "paramType": "SET", + "paramTypeReadonly": true, + "type": "PARAM" + } + ], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Customers", + "type": "bar", + "version": "1763931181376086659" + }, + "aepZYhHtdrKMZscobrGixq": { + "chartSettings": { + "showColumns": [ + { + "isChecked": true, + "name": "p" + }, + { + "isChecked": true, + "name": "quantity" + } + ], + "sortedColumns": [ + "p", + "quantity" + ], + "tableColumns": [ + { + "id": "Matched pattern", + "isChecked": "true", + "label": "" + }, + { + "children": [], + "id": "Vertices", + "isChecked": "true", + "isExpanded": true, + "label": "Vertices" + }, + { + "children": [], + "id": "Edges", + "isChecked": "true", + "isExpanded": false, + "label": "Edges" + } + ], + "tableHeaders": [ + "p", + "quantity" + ] + }, + "graphName": "Supply_Chain_Management", + "hideWidgetName": false, + "id": "aepZYhHtdrKMZscobrGixq", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [ + { + "data": "top_k_products_sales_order", + "id": 
"top_k_products_sales_order", + "type": "QUERY" + }, + { + "data": "Top_K", + "id": "f0c011d4-d5d2-49fa-ba8c-3dd430d405af", + "paramGlobalInput": "Top_K", + "paramName": "k", + "paramType": "INT", + "paramTypeReadonly": true, + "type": "PARAM" + } + ], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Top Products Sales Order", + "type": "table", + "version": "1763931489536489074" + }, + "d1F1DV6GoqU8eBNjYT53b6": { + "chartSettings": { + "borderRadius": 26, + "borderWidth": 16, + "category": [ + { + "id": "v_id", + "type": "string" + } + ], + "radius": [ + 0, + 74 + ], + "roseType": true, + "showColumns": [ + { + "isChecked": true, + "name": "v_id" + }, + { + "isChecked": false, + "name": "v_type" + }, + { + "isChecked": true, + "name": "suppliers.@total_quantity" + } + ], + "showLegend": false, + "sortedColumns": [ + "v_id", + "suppliers.@total_quantity" + ], + "tableColumns": [ + { + "id": "Matched pattern", + "isChecked": "true", + "label": "" + }, + { + "children": [], + "id": "Vertices", + "isChecked": "true", + "isExpanded": true, + "label": "Vertices" + }, + { + "children": [], + "id": "Edges", + "isChecked": "true", + "isExpanded": false, + "label": "Edges" + } + ], + "tableHeaders": [ + "v_id", + "v_type", + "suppliers.@total_quantity" + ], + "value": [ + { + "id": "suppliers.@total_quantity", + "type": "number" + } + ] + }, + "graphName": "Supply_Chain_Management", + "hideWidgetName": false, + "id": "d1F1DV6GoqU8eBNjYT53b6", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [ + { + "data": "get_biggest_suppliers", + "id": "get_biggest_suppliers", + "type": "QUERY" + }, + { + "data": "Top_K", + "id": "c04ee73c-eedf-46a5-bd4a-ff8135c33bde", + "paramGlobalInput": "Top_K", + "paramName": "top_k", + "paramType": "INT", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "", + "id": "b4447a62-707f-4e06-839f-c7f95a624fc9", + "paramGlobalInput": "", + "paramName": "start_date", + "paramType": "DATETIME", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "", + "id": "6646046a-e5fb-4700-9e15-0c04ec289e36", + "paramGlobalInput": "", + "paramName": "end_date", + "paramType": "DATETIME", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "", + "elementType": "VERTEX", + "id": "0ed291e9-16c4-427f-983b-1ab45829fe70", + "paramGlobalInput": "", + "paramName": "input_nations", + "paramType": "SET", + "paramTypeReadonly": true, + "type": "PARAM" + } + ], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Suppliers", + "type": "pie", + "version": "1763931246459106033" + }, + "h1DS1HwMbhESTmLuSNseGF": { + "chartSettings": { + "borderRadius": 32, + "borderWidth": 21, + "category": [ + { + "id": "IsProduct", + "type": "boolean" + } + ], + "radius": [ + 0, + 81 + ], + "roseType": false, + "tableHeaders": [ + "IsProduct", + "cnt" + ], + "tableIndex": 0, + "value": [ + { + "id": "cnt", + "type": "number" + } + ] + }, + "graphName": "Supply_Chain_Management", + "hideWidgetName": false, + "id": "h1DS1HwMbhESTmLuSNseGF", + "patternLimit": 5, + "query": "INTERPRET QUERY() FOR GRAPH Supply_Chain_Management {\n GroupByAccum cnt> @@grp;\n res = \n SELECT s from Material:s \n ACCUM CASE when s.is_product then @@grp += (\"Product\"->1) else @@grp += (\"Material\"->1) end\n ;\n PRINT @@grp;\n}", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Material Product", + "type": "pie", + "version": 
"1764011824285635441" + }, + "iM7aR5tBHUSDNPuLkqCoeK": { + "chartSettings": { + "showColumns": [ + { + "isChecked": true, + "name": "p" + }, + { + "isChecked": true, + "name": "quantity" + } + ], + "sortedColumns": [ + "p", + "quantity" + ], + "tableColumns": [ + { + "id": "Matched pattern", + "isChecked": "true", + "label": "" + }, + { + "children": [], + "id": "Vertices", + "isChecked": "true", + "isExpanded": true, + "label": "Vertices" + }, + { + "children": [], + "id": "Edges", + "isChecked": "true", + "isExpanded": false, + "label": "Edges" + } + ], + "tableHeaders": [ + "p", + "quantity" + ] + }, + "graphName": "Supply_Chain_Management", + "hideWidgetName": false, + "id": "iM7aR5tBHUSDNPuLkqCoeK", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [ + { + "data": "top_k_products_purchase_order", + "id": "top_k_products_purchase_order", + "type": "QUERY" + }, + { + "data": "Top_K", + "id": "27f34beb-d818-44e9-b997-9538f52f37ac", + "paramGlobalInput": "Top_K", + "paramName": "k", + "paramType": "INT", + "paramTypeReadonly": true, + "type": "PARAM" + } + ], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Top Products Purchase Order", + "type": "table", + "version": "1763931519243173565" + }, + "jiFArchKghkbBhGC4d548q": { + "chartSettings": { + "inputStates": [ + { + "dataType": "number", + "id": "input_jVfdmk2x1Ajk4kjWZAzFmK", + "name": "Top_K", + "settings": { + "max": "50", + "min": "1" + }, + "widgetType": "Input" + } + ] + }, + "graphName": "Supply_Chain_Management", + "hideWidgetName": false, + "id": "jiFArchKghkbBhGC4d548q", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "", + "type": "Inputs", + "version": "1763933359536882568" + }, + "vYoJnjMMFWLQWr6r1hThzL": { + "chartSettings": { + "values": [ + { + "fontColor": "#000000", + "fontSize": 36, + "iconColor": "#000000", + "iconPostion": "", + "iconSize": 24, + "iconURL": "/insights/static/media/layout-template.4aadad8f62567d7e8735dd875cf9d307.svg", + "id": "7pTWXkngZm4jQ68oDa8i8d", + "key": "Plants", + "label": "Plant", + "labelColor": "#000000", + "labelPostion": "", + "labelSize": 24, + "styleRule": [] + }, + { + "fontColor": "#000000", + "fontSize": 36, + "iconColor": "#000000", + "iconPostion": "", + "iconSize": 24, + "iconURL": "/insights/static/media/user-round.a4c78e138aaee380f0d4d18c1e5de9a2.svg", + "id": "jNjXkiSwzVtWQGPHSkVYyA", + "key": "Customers", + "label": "Customer", + "labelColor": "#000000", + "labelPostion": "", + "labelSize": 24, + "styleRule": [] + }, + { + "fontColor": "#000000", + "fontSize": 36, + "iconColor": "#000000", + "iconPostion": "", + "iconSize": 24, + "iconURL": "/insights/static/media/user-pen.e57c9feb5a07ce1f9b9c05d8ee2d0786.svg", + "id": "7a15tQkEd4KzMo8kJXWn9e", + "key": "Suppliers", + "label": "Supplier", + "labelColor": "#000000", + "labelPostion": "", + "labelSize": 24, + "styleRule": [] + }, + { + "fontColor": "#000000", + "fontSize": 36, + "iconColor": "#000000", + "iconPostion": "", + "iconSize": 24, + "iconURL": "/insights/static/media/flag.ce538663ace2f1f8f1ef1eb29d46f4bf.svg", + "id": "5Nwa7ruXMyTRNz7J226TDh", + "key": "Nations", + "label": "Nation", + "labelColor": "#000000", + "labelPostion": "", + "labelSize": 24, + "styleRule": [] + } + ] + }, + "graphName": "Supply_Chain_Management", + "hideWidgetName": false, + "id": "vYoJnjMMFWLQWr6r1hThzL", + "patternLimit": 5, + "query": "INTERPRET QUERY() 
FOR GRAPH Supply_Chain_Management {\n res = SELECT s from Plant:s \n ;\n PRINT res.size() as Plants;\n res = SELECT s FROM Customer:s; \n PRINT res.size() as Customers;\n res = SELECT s from Nation:s;\n PRINT res.size() as Nations;\n res = SELECT s FROM Supplier:s;\n PRINT res.size() as Suppliers;\n}", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "", + "type": "value", + "version": "1763933189824585386" + } + }, + "globalParameters": { + "Top_K": { + "id": "input_jVfdmk2x1Ajk4kjWZAzFmK", + "name": "Top_K", + "type": "NUMBER", + "value": 10 + } + }, + "iconURL": "/insights/static/media/chart-scatter.c83829978fdeba96c343f7cc07ee2bf9.svg", + "id": "mfgR3jZc9v78jGjp3Vn8jf", + "isDetail": true, + "isNew": false, + "layouts": { + "md": [ + { + "h": 20, + "i": "6wa4sFXLXHJmWQzGJeDojT", + "moved": false, + "static": false, + "w": 8, + "x": 4, + "y": 32 + }, + { + "h": 23, + "i": "aepZYhHtdrKMZscobrGixq", + "moved": false, + "static": false, + "w": 4, + "x": 4, + "y": 9 + }, + { + "h": 24, + "i": "d1F1DV6GoqU8eBNjYT53b6", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 28 + }, + { + "h": 19, + "i": "h1DS1HwMbhESTmLuSNseGF", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 9 + }, + { + "h": 23, + "i": "iM7aR5tBHUSDNPuLkqCoeK", + "moved": false, + "static": false, + "w": 4, + "x": 8, + "y": 9 + }, + { + "h": 9, + "i": "jiFArchKghkbBhGC4d548q", + "moved": false, + "static": false, + "w": 2, + "x": 0, + "y": 0 + }, + { + "h": 9, + "i": "vYoJnjMMFWLQWr6r1hThzL", + "moved": false, + "static": false, + "w": 10, + "x": 2, + "y": 0 + } + ], + "xs": [ + { + "h": 9, + "i": "jiFArchKghkbBhGC4d548q", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 0 + }, + { + "h": 9, + "i": "vYoJnjMMFWLQWr6r1hThzL", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 9 + }, + { + "h": 19, + "i": "h1DS1HwMbhESTmLuSNseGF", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 18 + }, + { + "h": 23, + "i": "aepZYhHtdrKMZscobrGixq", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 37 + }, + { + "h": 23, + "i": "iM7aR5tBHUSDNPuLkqCoeK", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 60 + }, + { + "h": 24, + "i": "d1F1DV6GoqU8eBNjYT53b6", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 83 + }, + { + "h": 20, + "i": "6wa4sFXLXHJmWQzGJeDojT", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 107 + } + ] + }, + "title": "Analytics", + "version": "1764009629639742855", + "weight": -10 + }, + { + "chartMap": { + "5ZQJVEuiq5cAxVxZWVqoLS": { + "chartSettings": { + "edgeLength": 175, + "rulesByType": { + "Line_Number": [ + { + "condition": "=", + "conditionEndValue": 0, + "conditionStartValue": 0, + "conditionValue": true, + "fieldName": "@isSrc", + "fieldType": "boolean", + "palateName": "", + "styleEndLabel": "", + "styleEndValue": "", + "styleKey": "background-color", + "styleLabel": "Vertex color", + "styleStartLabel": "", + "styleStartValue": "", + "styleType": "color", + "styleValue": "#1a7c0e" + } + ] + } + }, + "graphName": "Supply_Chain_Management", + "hideWidgetName": false, + "id": "5ZQJVEuiq5cAxVxZWVqoLS", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [ + { + "data": "explore_BOM_line_Insights", + "id": "explore_BOM_line_Insights", + "type": "QUERY" + }, + { + "data": "Line_Number", + "id": "9eed304b-a520-48ce-8da6-40203562c238", + "paramGlobalInput": "", 
+ "paramName": "vertType", + "paramType": "STRING", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "LineVertex", + "id": "ec0f667f-e6d9-42e9-9d6b-98ad1be0bb33", + "paramGlobalInput": "LineVertex", + "paramName": "id", + "paramType": "STRING", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "LineDepth", + "id": "cb2c8a11-87d4-410f-980d-bd08fa44aa1e", + "paramGlobalInput": "LineDepth", + "paramName": "depth", + "paramType": "INT", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "LineUpstream", + "id": "02e56b6f-bcf7-40ab-9c0f-2508fc886654", + "paramGlobalInput": "LineUpstream", + "paramName": "upstream", + "paramType": "BOOL", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "", + "id": "2ea2db6f-3568-467b-a6bf-fcb78f19b1a5", + "paramGlobalInput": "", + "paramName": "use_date_range", + "paramType": "BOOL", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "", + "id": "323ec303-484d-4ed0-a21e-74dc8de82cbe", + "paramGlobalInput": "", + "paramName": "start_date", + "paramType": "DATETIME", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "", + "id": "b5fea6eb-6b3c-47df-bddd-e1aadbc6ee9e", + "paramGlobalInput": "", + "paramName": "end_date", + "paramType": "DATETIME", + "paramTypeReadonly": true, + "type": "PARAM" + } + ], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "", + "type": "internal-graph", + "version": "1764021018679482546" + }, + "7GwzFLnQ5dDDVtUtVx7mPv": { + "chartSettings": { + "inputStates": [ + { + "dataType": "string", + "id": "input_rAjS1bcFu9NBPSX8wdUK4U", + "name": "LineVertex", + "settings": { + "graphName": "Supply_Chain_Management", + "labelKey": "Line_Number.id", + "open": false, + "options": [], + "patternLimit": 300, + "query": "", + "searchPattern": [ + { + "alias": "Line_Number", + "data": "Line_Number", + "id": "93d5c9b3-f987-4cfa-9d4d-369211e92af2", + "orderBy": [ + { + "asc": true, + "expression": { + "type": "AttrVariable", + "value": "alias_schema_Line_Number_0.id" + }, + "label": "Line_Number.id" + } + ], + "type": "vertex" + } + ], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "tableHeaders": [ + "Matched pattern", + "Line_Number" + ], + "useQuery": true, + "valueKey": "Line_Number.id" + }, + "widgetType": "Dropdown" + }, + { + "dataType": "number", + "id": "input_1cRTwt6EKyjHGDuhpK7XUT", + "name": "LineDepth", + "settings": { + "max": "20", + "min": "2" + }, + "widgetType": "Input" + }, + { + "dataType": "bool", + "id": "input_p7HJXg2VhZhgeF1pK8hv4S", + "name": "LineUpstream", + "settings": { + "options": [] + }, + "widgetType": "Dropdown" + } + ] + }, + "graphName": "Supply_Chain_Management", + "hideWidgetName": false, + "id": "7GwzFLnQ5dDDVtUtVx7mPv", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "BOM Line", + "type": "Inputs", + "version": "1763928107928372418" + }, + "hx3rVgRRGfLEwkeabZwKKi": { + "chartSettings": { + "edgeLength": 285, + "rulesByType": { + "BOM": [ + { + "condition": "always", + "conditionEndValue": 0, + "conditionStartValue": 0, + "conditionValue": "", + "fieldName": "id", + "fieldType": "string", + "palateName": "", + "styleEndLabel": "", + "styleEndValue": "", + "styleKey": "node-radius", + "styleLabel": "0.25x", + "styleStartLabel": "", + "styleStartValue": "", + "styleType": "numeric", + "styleValue": 0.25 + }, + { + "condition": "=", + "conditionEndValue": 0, + "conditionStartValue": 
0, + "conditionValue": true, + "fieldName": "@isSrc", + "fieldType": "boolean", + "palateName": "", + "styleEndLabel": "", + "styleEndValue": "", + "styleKey": "background-color", + "styleLabel": "Vertex color", + "styleStartLabel": "", + "styleStartValue": "", + "styleType": "color", + "styleValue": "#0571f0" + }, + { + "condition": "=", + "conditionEndValue": 0, + "conditionStartValue": 0, + "conditionValue": true, + "fieldName": "@isSrc", + "fieldType": "boolean", + "palateName": "", + "styleEndLabel": "", + "styleEndValue": "", + "styleKey": "node-radius", + "styleLabel": "1x", + "styleStartLabel": "", + "styleStartValue": "", + "styleType": "numeric", + "styleValue": 1 + }, + { + "condition": "=", + "conditionEndValue": 20, + "conditionStartValue": 1, + "conditionValue": "0", + "fieldName": "base_quantity", + "fieldType": "number", + "palateName": "", + "styleEndLabel": "", + "styleEndValue": "", + "styleKey": "background-color", + "styleLabel": "Vertex color", + "styleStartLabel": "", + "styleStartValue": "", + "styleType": "color", + "styleValue": "#ff0000" + } + ], + "Material": [ + { + "condition": "=", + "conditionEndValue": 0, + "conditionStartValue": 0, + "conditionValue": true, + "fieldName": "@isSrc", + "fieldType": "boolean", + "palateName": "", + "styleEndLabel": "", + "styleEndValue": "", + "styleKey": "background-color", + "styleLabel": "Vertex color", + "styleStartLabel": "", + "styleStartValue": "", + "styleType": "color", + "styleValue": "#4973f3" + }, + { + "condition": "=", + "conditionEndValue": 0, + "conditionStartValue": 0, + "conditionValue": true, + "fieldName": "is_product", + "fieldType": "boolean", + "palateName": "", + "styleEndLabel": "", + "styleEndValue": "", + "styleKey": "node-radius", + "styleLabel": "2x", + "styleStartLabel": "", + "styleStartValue": "", + "styleType": "numeric", + "styleValue": 2 + } + ] + } + }, + "graphName": "Supply_Chain_Management", + "hideWidgetName": false, + "id": "hx3rVgRRGfLEwkeabZwKKi", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [ + { + "data": "explore_BOM_insights", + "id": "explore_BOM_insights", + "type": "QUERY" + }, + { + "data": "vertType", + "id": "2e9b80fd-030e-49fb-9c68-754bf2efca37", + "paramGlobalInput": "vertType", + "paramName": "vertType", + "paramType": "STRING", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "vertexId", + "id": "70e1db0f-1025-4a86-8fc3-03da2d5c8338", + "paramGlobalInput": "vertexId", + "paramName": "id", + "paramType": "STRING", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "depth", + "id": "23681b01-c596-4f2a-8208-212236a966d4", + "paramGlobalInput": "depth", + "paramName": "depth", + "paramType": "INT", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "upstream", + "id": "f622d4ae-0db7-4da1-af9b-54317cb0bbb3", + "paramGlobalInput": "upstream", + "paramName": "upstream", + "paramType": "BOOL", + "paramTypeReadonly": true, + "type": "PARAM" + } + ], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "", + "type": "internal-graph", + "version": "1763928813910151148" + }, + "wGYxHtNz9dmfUBHvh8vi8a": { + "chartSettings": { + "inputStates": [ + { + "dataType": "string", + "id": "input_35dZipma5w14zcPa9srhP7", + "name": "vertType", + "settings": { + "graphName": "Supply_Chain_Management", + "open": false, + "options": [ + { + "isCreatable": true, + "label": "BOM", + "value": "BOM" + }, + { + "isCreatable": true, + "label": "Material", + "value": "Material" + } + ], + 
"patternLimit": 300, + "query": "", + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "useQuery": false + }, + "widgetType": "Dropdown" + }, + { + "dataType": "string", + "id": "input_rGNXpbuD8xm2Jp2jqYA3TR", + "name": "vertexId", + "settings": { + "graphName": "Supply_Chain_Management", + "open": false, + "options": [], + "patternLimit": 300, + "query": "INTERPRET QUERY(STRING vertType) FOR GRAPH Supply_Chain_Management {\n ListAccum @@ids;\n res = SELECT s from (BOM|Material):s \n where s.type == vertType\n ACCUM @@ids += s.id \n ;\n PRINT @@ids;\n}", + "queryType": "interactive", + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "useQuery": true + }, + "widgetType": "Dropdown" + }, + { + "dataType": "number", + "id": "input_rHJAj5bR83viw6oAHPtzh4", + "name": "depth", + "settings": { + "max": "20" + }, + "widgetType": "Input" + }, + { + "dataType": "bool", + "id": "input_gvFDPVUBoEs7WCmUUYvKiA", + "name": "upstream", + "settings": { + "options": [ + { + "isCreatable": true, + "label": "TRUE", + "value": true + }, + { + "isCreatable": true, + "label": "FALSE", + "value": false + } + ] + }, + "widgetType": "Dropdown" + } + ] + }, + "graphName": "Supply_Chain_Management", + "hideWidgetName": false, + "id": "wGYxHtNz9dmfUBHvh8vi8a", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "BOM", + "type": "Inputs", + "version": "1763928099165046718" + } + }, + "globalParameters": { + "LineDepth": { + "id": "input_1cRTwt6EKyjHGDuhpK7XUT", + "name": "LineDepth", + "type": "NUMBER", + "value": 5 + }, + "LineUpstream": { + "id": "input_p7HJXg2VhZhgeF1pK8hv4S", + "name": "LineUpstream", + "type": "BOOL", + "value": "true" + }, + "LineVertex": { + "id": "input_rAjS1bcFu9NBPSX8wdUK4U", + "name": "LineVertex", + "type": "STRING", + "value": "mat_101_China_74_3931_3634" + }, + "depth": { + "id": "input_rHJAj5bR83viw6oAHPtzh4", + "name": "depth", + "type": "NUMBER", + "value": 3 + }, + "upstream": { + "id": "input_gvFDPVUBoEs7WCmUUYvKiA", + "name": "upstream", + "type": "BOOL", + "value": "true" + }, + "vertType": { + "id": "input_35dZipma5w14zcPa9srhP7", + "name": "vertType", + "type": "STRING", + "value": "BOM" + }, + "vertexId": { + "id": "input_rGNXpbuD8xm2Jp2jqYA3TR", + "name": "vertexId", + "type": "STRING", + "value": "mat_1330_China_3665" + } + }, + "iconURL": "/insights/static/media/library.d3f7f207c6bb1d7be8e64045a19991b2.svg", + "id": "xB74jXwxq3KbSxpMEXvnQH", + "isDetail": true, + "isNew": false, + "layouts": { + "md": [ + { + "h": 43, + "i": "5ZQJVEuiq5cAxVxZWVqoLS", + "moved": false, + "static": false, + "w": 6, + "x": 6, + "y": 10 + }, + { + "h": 10, + "i": "7GwzFLnQ5dDDVtUtVx7mPv", + "moved": false, + "static": false, + "w": 6, + "x": 6, + "y": 0 + }, + { + "h": 43, + "i": "hx3rVgRRGfLEwkeabZwKKi", + "moved": false, + "static": false, + "w": 6, + "x": 0, + "y": 10 + }, + { + "h": 10, + "i": "wGYxHtNz9dmfUBHvh8vi8a", + "moved": false, + "static": false, + "w": 6, + "x": 0, + "y": 0 + } + ], + "xs": [ + { + "h": 10, + "i": "wGYxHtNz9dmfUBHvh8vi8a", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 0 + }, + { + "h": 12, + "i": "7GwzFLnQ5dDDVtUtVx7mPv", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 10 + }, + { + "h": 43, + "i": "hx3rVgRRGfLEwkeabZwKKi", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 22 + }, + { + "h": 43, + "i": "5ZQJVEuiq5cAxVxZWVqoLS", + "moved": false, + 
"static": false, + "w": 4, + "x": 0, + "y": 65 + } + ] + }, + "title": "BOM", + "version": "1763928024253252036", + "weight": 10 + } + ], + "title": "Supply Chain Management", + "userRoleForApp": "owner", + "version": "1764009595079839773" +} \ No newline at end of file diff --git a/agile_operations/supply_chain_management/meta/images/Supply_Chain_Management_Schema_Image.png b/agile_operations/supply_chain_management/meta/images/Supply_Chain_Management_Schema_Image.png new file mode 100644 index 00000000..1adba384 Binary files /dev/null and b/agile_operations/supply_chain_management/meta/images/Supply_Chain_Management_Schema_Image.png differ diff --git a/agile_operations/supply_chain_management/queries/add_purchase_to_inventory.gsql b/agile_operations/supply_chain_management/queries/add_purchase_to_inventory.gsql new file mode 100644 index 00000000..b530fc7c --- /dev/null +++ b/agile_operations/supply_chain_management/queries/add_purchase_to_inventory.gsql @@ -0,0 +1,62 @@ +CREATE OR REPLACE QUERY add_purchase_to_inventory(VERTEX purchase_order, SET invs){ + /* + Description: + Simulates the goods-receipt (GR) posting of a Purchase Order and instantly + shows the resulting inventory impact for the affected materials in the + specified Inventory_Held locations (storage locations / bins). + + This is a "what-if" inventory projection query - ideal for MRP simulation, + ATP/CTP validation, and real-time inventory planning without touching + the actual ERP system. + + Parameters: + purchase_order (VERTEX): + The Purchase Order vertex representing the inbound receipt + invs (SET): + Set of Inventory_Held primary IDs (storage locations) to check/update. + Use empty set {} to include all relevant locations. + + Output: + . Previous inventory quantity + last_updated timestamp per material/location + . Projected new inventory quantity after GR posting + . Clear message if no inventory exists in the specified locations + */ + + SumAccum @quantity; + MinAccum @date; + MapAccum>>> @@prev_inv_map, @@curr_inv_map; + STRING err, prev, curr; + + Start = {purchase_order}; + + poi = SELECT p FROM Start:s -(Has_Line_Number:i)- Line_Number:p + ACCUM + p.@date += s.date; + + prod = SELECT pr FROM poi:p -(reverse_Used_For:r)- SFC_Material:pr + ACCUM + pr.@quantity += p.quantity, + pr.@date += p.@date; + + inventories = to_vertex_set(invs,"Inventory_Held"); + + updated_inventory_1 = SELECT i FROM inventories:i -(Inventory_Has_Material:c)- SFC_Material:p + ACCUM + @@prev_inv_map += (i-> (p -> (i.last_updated -> i.quantity))); + + updated_inventory = SELECT i FROM inventories:i -(Inventory_Has_Material:c)- SFC_Material:p + ACCUM + INT quant = i.quantity + p.@quantity, + @@curr_inv_map += (i-> (p -> (p.@date -> quant))); + + + IF @@prev_inv_map.size()== 0 THEN + PRINT("There is no inventory for this product at specified inventory id."); + ELSE + PRINT( "The previous inventory for the product was- "); + PRINT(@@prev_inv_map); + PRINT("The current inventory for the product is- "); + PRINT(@@curr_inv_map); + END; + +} diff --git a/agile_operations/supply_chain_management/queries/check_shipment_capacity.gsql b/agile_operations/supply_chain_management/queries/check_shipment_capacity.gsql new file mode 100644 index 00000000..70b41da2 --- /dev/null +++ b/agile_operations/supply_chain_management/queries/check_shipment_capacity.gsql @@ -0,0 +1,45 @@ +CREATE OR REPLACE QUERY check_shipment_capacity(VERTEX so){ + /* + Description: + Instant Order Fulfillment Status & Shipment Completion Check. 
+ Compares the ordered quantity on a Sales Order against the quantity + actually shipped (via Shipment_Item vertices) and immediately reports: + . Total ordered quantity per material + . Quantity already fulfilled (shipped) + . Remaining unfulfilled quantity + Parameters: + so (VERTEX): + The Sales Order vertex to analyze + + Output: + @@prod_total_quantity -> Total ordered quantity per material + @@prod_fulfillment -> Quantity already shipped/fulfilled + @@prod_unfulfillment -> Remaining open quantity (backorder) + Note: only under-shipped materials appear in these maps. + */ + + + MapAccum<VERTEX, INT> @@prod_fulfillment, @@prod_unfulfillment, @@prod_total_quantity; + MaxAccum<INT> @my_quantity; + + Start = {so}; + + soi = SELECT si FROM Start:s -(Has_Sales_Order_Item:i)- Sales_Order_Item:si; + + prod = SELECT p FROM soi:si -(For_Material:s)- SFC_Material:p + ACCUM + p.@my_quantity += si.quantity; + + ship = SELECT p FROM prod:p -(Carries_Product:c)- Shipment_Item:sh + ACCUM + IF p.@my_quantity > sh.quantity THEN + INT total_unfulfilled = p.@my_quantity - sh.quantity, + @@prod_fulfillment += (p -> sh.quantity), + @@prod_unfulfillment += (p -> total_unfulfilled), + @@prod_total_quantity += (p -> p.@my_quantity) + END; + + PRINT @@prod_total_quantity; + PRINT @@prod_fulfillment; + PRINT @@prod_unfulfillment; + +} diff --git a/agile_operations/supply_chain_management/queries/datewise_product_availability.gsql b/agile_operations/supply_chain_management/queries/datewise_product_availability.gsql new file mode 100644 index 00000000..6221783f --- /dev/null +++ b/agile_operations/supply_chain_management/queries/datewise_product_availability.gsql @@ -0,0 +1,65 @@ +CREATE OR REPLACE QUERY datewise_product_availability(VERTEX sales, INT inventory_interval_threshold){ +/* + Description: + Real-time Available-to-Promise (ATP) with Inventory Freshness Control. + Answers the critical business question: + "Can we fulfill this Sales Order using only inventory that has been + physically counted or received within the last X months?" + + This is a strict, audit-ready ATP check that prevents promising + customers based on stale or "ghost" inventory - a major cause of + delivery failures in traditional ERP systems. + + Parameters: + sales (VERTEX): + The Sales Order to validate + inventory_interval_threshold (INT): + Maximum allowed age of inventory in months. + Example: 3 -> only use inventory updated in the last 3 months + + Output: + . Clear fulfillment decision message + . Detailed map of which materials pass/fail the freshness + quantity check + . 
Ready for integration into order management UIs or ATP engines + */ + + TYPEDEF TUPLE <VERTEX<Inventory_Held> inv, INT quant> myTuple; + + HeapAccum<myTuple>(1, quant ASC, inv ASC) @invHeap; + MapAccum<VERTEX, INT> @product_sales_quantity; + MapAccum<VERTEX, MapAccum<VERTEX, INT>> @@product_inventory; + ListAccum<STRING> @@print_message; + MinAccum<DATETIME> @prod_sales_date; + + Start = {sales}; + + soi = SELECT s FROM Start:so-(Has_Sales_Order_Item:i)-Sales_Order_Item:s + ACCUM + s.@prod_sales_date += so.date; + + prod = SELECT p FROM soi:s-(For_Material:r)-SFC_Material:p + ACCUM + p.@product_sales_quantity += (p->s.quantity), + p.@prod_sales_date += s.@prod_sales_date; + + product_result = SELECT p FROM prod:p-(Inventory_Has_Material:c)- Inventory_Held:i + ACCUM + IF i.quantity >= p.@product_sales_quantity.get(p) AND i.last_updated >= datetime_sub(p.@prod_sales_date, INTERVAL inventory_interval_threshold MONTH) THEN + p.@invHeap += myTuple(i,i.quantity) + END + POST-ACCUM + IF p.@invHeap.size() > 0 THEN + @@product_inventory += (p->(p.@invHeap.top().inv->p.@invHeap.top().quant)), + p.@invHeap.pop() + END; + + IF @@product_inventory.size() == prod.size() THEN + @@print_message += "Sales Order can be satisfied for all products since they are available in inventory"; + ELSE + @@print_message += "Sales Order can only be satisfied for " + to_string(@@product_inventory.size()) + " of the products."; + END; + + PRINT @@print_message; + PRINT @@product_inventory; + } + diff --git a/agile_operations/supply_chain_management/queries/explore_BOM_insights.gsql b/agile_operations/supply_chain_management/queries/explore_BOM_insights.gsql new file mode 100644 index 00000000..bc2d874a --- /dev/null +++ b/agile_operations/supply_chain_management/queries/explore_BOM_insights.gsql @@ -0,0 +1,60 @@ +CREATE OR REPLACE DISTRIBUTED QUERY explore_BOM_insights(STRING vertType, STRING id, INT depth, BOOL upstream) FOR GRAPH Supply_Chain_Management { + /* + Query Name: explore_BOM_insights + Multi-level BOM exploration (downstream) or where-used analysis (upstream) + starting from a vertex identified by (vertType, id). + + Key Use Cases: + . Explode a material/BOM into its components and related suppliers (downstream) + . Reverse trace: find BOMs/material relationships and supply paths (upstream) + . Useful for impact analysis, sourcing visibility, and BOM troubleshooting. + + Parameters: + vertType (STRING): + Vertex type name of the starting vertex (e.g., Material, BOM, Supplier) + id (STRING): + Primary/external ID of the starting vertex (converted via to_vertex(id, vertType)) + depth (INT): + Maximum traversal depth + upstream (BOOL): + TRUE -> Where-used / reverse trace + FALSE -> BOM explosion / forward trace + + Output: + Prints the traversed edge set (@@edges) and vertex set (@@nodes). 
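+ + Example (a minimal sketch: the argument values below are simply the default + global parameters bundled with the Insights application, passed here as + positional arguments in the GSQL shell): + RUN QUERY explore_BOM_insights("BOM", "mat_1330_China_3665", 3, TRUE) + traverses up to three levels in the where-used direction from the given BOM + and prints the visited edges and vertices. 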
+ */ + SetAccum<EDGE> @@edges; + SetAccum<VERTEX> @@nodes; + OrAccum @visited; + OrAccum @isSrc; + VERTEX vertss; + vertss = to_vertex(id, vertType); + // get src + verts = {vertss}; + verts = SELECT s FROM verts:s ACCUM s.@isSrc += TRUE, @@nodes += s; + + // while loop + WHILE verts.size() > 0 LIMIT depth DO + // traverse up or down + IF upstream == TRUE THEN + verts = + SELECT t FROM verts:s -((Supplies|reverse_Has_Component_Material|reverse_Produced_By):e)- (Material|BOM):t + WHERE t.@visited == FALSE + ACCUM @@edges += e, @@nodes += t + POST-ACCUM + s.@visited = TRUE; + ELSE + verts = + SELECT t FROM verts:s -((reverse_Supplies|Has_Component_Material|Produced_By):e)- (Material|BOM|Supplier):t + WHERE t.@visited == FALSE + ACCUM @@edges += e, @@nodes += t + POST-ACCUM + s.@visited = TRUE; + END; + END; + + // print edges + PRINT @@edges; + nodes = {@@nodes}; + PRINT nodes; +} diff --git a/agile_operations/supply_chain_management/queries/explore_BOM_line_Insights.gsql b/agile_operations/supply_chain_management/queries/explore_BOM_line_Insights.gsql new file mode 100644 index 00000000..5e3b98b8 --- /dev/null +++ b/agile_operations/supply_chain_management/queries/explore_BOM_line_Insights.gsql @@ -0,0 +1,76 @@ +CREATE OR REPLACE DISTRIBUTED QUERY explore_BOM_line_Insights(STRING vertType, STRING id, INT depth, BOOL upstream, BOOL use_date_range, DATETIME start_date, DATETIME end_date) FOR GRAPH Supply_Chain_Management { + /* + Description: + End-to-end transactional lineage traversal (order-to-cash & procure-to-pay) + across actual production instances (SFC), sales orders, purchase orders, + shipments, and customers/suppliers. + + This is the "digital thread" query - it connects the physical flow + (what was actually built and shipped) with the commercial flow + (who ordered it and who supplied the components). 
+ + Parameters: + vertType (STRING): + Vertex type name of the starting vertex - typically SFC_Material (actual batch), + Sales_Order, Purchase_Order, Customer, or Supplier + id (STRING): + Primary/external ID of the starting vertex (paired with vertType and converted + to a vertex via to_vertex(id, vertType)) + depth (INT): + Maximum traversal depth + upstream (BOOL): + FALSE -> Follow the flow downstream + TRUE -> Follow the flow upstream + use_date_range (BOOL): + If TRUE, only traverse through dated vertices (Sales_Order, Purchase_Order, + SFC_Assembly) that fall within the specified window + start_date / end_date (DATETIME): + Optional time fence for filtering transactional vertices + + */ + SetAccum<VERTEX> @@nodes; + SetAccum<EDGE> @@edges; + OrAccum @isSrc; + OrAccum @visited; + VERTEX vertss; + vertss = to_vertex(id, vertType); + // get src + verts = {vertss}; + verts = SELECT s FROM verts:s ACCUM s.@isSrc += TRUE, @@nodes += s; + + // while loop + WHILE verts.size() > 0 LIMIT depth DO + // traverse up or down + IF upstream == TRUE THEN + verts = + SELECT t FROM verts:s -((Has_Purchase_Order|Has_Line_Number|reverse_Used_For| + reverse_Has_Component_SFC|reverse_To_Be_Produced_By|reverse_For_Material| + reverse_Has_Sales_Order_Item|reverse_Has_Sales_Order):e)- + (Purchase_Order|Line_Number|SFC_Material|SFC_Assembly|Sales_Order_Item|Sales_Order|Customer):t + WHERE t.@visited == FALSE + // relevant vertices in date range (SFC_Assembly,Sales_Order,Purchase_Order) + ACCUM @@edges += e, @@nodes += t + POST-ACCUM + s.@visited = TRUE; + ELSE + verts = + SELECT t FROM verts:s -((reverse_Has_Purchase_Order|reverse_Has_Line_Number|Used_For| + Has_Component_SFC|To_Be_Produced_By|For_Material| + Has_Sales_Order_Item|Has_Sales_Order):e)- + (Supplier|Purchase_Order|Line_Number|SFC_Material|SFC_Assembly|Sales_Order_Item|Sales_Order):t + WHERE t.@visited == FALSE + // relevant vertices in date range (SFC_Assembly,Sales_Order,Purchase_Order) + ACCUM @@edges += e, @@nodes += t + POST-ACCUM + s.@visited = TRUE; + END; // other available edge types: Based_On, reverse_Based_On, Happens_At, reverse_Happens_At, Produced_By, reverse_Produced_By, Has_Component_Material, reverse_Has_Component_Material, SFC_To_Material, reverse_SFC_To_Material, Supplies, reverse_Supplies, Has, reverse_Has + END; + + // print edges + PRINT @@edges; + nodes = {@@nodes}; + PRINT nodes; + PRINT "explore_BOMLine works!"; +} diff --git a/agile_operations/supply_chain_management/queries/explore_bom.gsql b/agile_operations/supply_chain_management/queries/explore_bom.gsql new file mode 100644 index 00000000..1f124978 --- /dev/null +++ b/agile_operations/supply_chain_management/queries/explore_bom.gsql @@ -0,0 +1,48 @@ +CREATE OR REPLACE DISTRIBUTED QUERY explore_BOM(VERTEX input_vert, INT depth, BOOL upstream) FOR GRAPH Supply_Chain_Management { + /* + Query Name: explore_BOM + Full multi-level BOM explosion (downstream) or where-used analysis (upstream) + from any Material, BOM header, or Supplier. + + Key Use Cases: + . Explode a finished good into all components (any depth, even 20-30 levels) + . Reverse trace: "Which finished goods or BOMs use this raw material?" + . Critical for demand planning, costing, engineering change impact, + single-source risk analysis, and compliance reporting. + + Parameters: + input_vert : Starting vertex (Material or BOM) + depth : Maximum traversal depth (set to 30+ for full explosion) + upstream : TRUE -> Where-Used (who consumes this material?) + FALSE -> BOM Explosion (what does this material consist of?) 
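+ + Example (a sketch; the (id, type) pair passes an untyped VERTEX parameter from + the GSQL shell, and the ID reuses the Insights default - substitute any + Material or BOM vertex from your data): + RUN QUERY explore_BOM(("mat_1330_China_3665", "BOM"), 10, FALSE) + runs a full downstream explosion to ten levels. 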
+ */ + + SetAccum<EDGE> @@edges; + OrAccum @visited; + + // get src + verts = {input_vert}; + + // while loop + WHILE verts.size() > 0 LIMIT depth DO + // traverse up or down + IF upstream == TRUE THEN + verts = + SELECT t FROM verts:s -((Supplies|reverse_Has_Component_Material|reverse_Produced_By):e)- (Material|BOM):t + WHERE t.@visited == FALSE + ACCUM @@edges += e + POST-ACCUM + s.@visited = TRUE; + ELSE + verts = + SELECT t FROM verts:s -((reverse_Supplies|Has_Component_Material|Produced_By):e)- (Material|BOM|Supplier):t + WHERE t.@visited == FALSE + ACCUM @@edges += e + POST-ACCUM + s.@visited = TRUE; + END; + END; + + // print edges + PRINT @@edges; +} diff --git a/agile_operations/supply_chain_management/queries/explore_bom_line.gsql b/agile_operations/supply_chain_management/queries/explore_bom_line.gsql new file mode 100644 index 00000000..9c46ff12 --- /dev/null +++ b/agile_operations/supply_chain_management/queries/explore_bom_line.gsql @@ -0,0 +1,67 @@ +CREATE OR REPLACE DISTRIBUTED QUERY explore_BOM_line(VERTEX input_vert, INT depth, BOOL upstream, BOOL use_date_range, DATETIME start_date, DATETIME end_date) FOR GRAPH Supply_Chain_Management { + /* + Description: + End-to-end transactional lineage traversal (order-to-cash & procure-to-pay) + across actual production instances (SFC), sales orders, purchase orders, + shipments, and customers/suppliers. + + This is the "digital thread" query - it connects the physical flow + (what was actually built and shipped) with the commercial flow + (who ordered it and who supplied the components). + + Parameters: + input_vert (VERTEX): + Starting vertex - typically SFC_Material (actual batch), Sales_Order, + Purchase_Order, Customer, or Supplier + depth (INT): + Maximum traversal depth (usually 10-20 is sufficient for full lineage) + upstream (BOOL): + FALSE -> Follow the flow downstream (e.g. raw material -> finished good -> customer) + TRUE -> Follow the flow upstream (e.g. 
finished good -> raw materials -> supplier) + use_date_range (BOOL): + If TRUE, only traverse through dated vertices (Sales_Order, Purchase_Order, + SFC_Assembly) that fall within the specified window + start_date / end_date (DATETIME): + Optional time fence for filtering transactional vertices + + */ + + SetAccum<EDGE> @@edges; + OrAccum @visited; + + // get src + verts = {input_vert}; + + // while loop + WHILE verts.size() > 0 LIMIT depth DO + // traverse up or down + IF upstream == TRUE THEN + verts = + SELECT t FROM verts:s -((Has_Purchase_Order|Has_Line_Number|reverse_Used_For| + reverse_Has_Component_SFC|reverse_To_Be_Produced_By|reverse_For_Material| + reverse_Has_Sales_Order_Item|reverse_Has_Sales_Order):e)- + (Purchase_Order|Line_Number|SFC_Material|SFC_Assembly|Sales_Order_Item|Sales_Order|Customer):t + WHERE t.@visited == FALSE + // relevant vertices in date range (SFC_Assembly,Sales_Order,Purchase_Order) + ACCUM @@edges += e + POST-ACCUM + s.@visited = TRUE; + ELSE + verts = + SELECT t FROM verts:s -((reverse_Has_Purchase_Order|reverse_Has_Line_Number|Used_For| + Has_Component_SFC|To_Be_Produced_By|For_Material| + Has_Sales_Order_Item|Has_Sales_Order):e)- + (Supplier|Purchase_Order|Line_Number|SFC_Material|SFC_Assembly|Sales_Order_Item|Sales_Order):t + WHERE t.@visited == FALSE + // relevant vertices in date range (SFC_Assembly,Sales_Order,Purchase_Order) + ACCUM @@edges += e + POST-ACCUM + s.@visited = TRUE; + END; + END; + + // print edges + PRINT @@edges; + PRINT "explore_BOMLine works!"; +} + diff --git a/agile_operations/supply_chain_management/queries/get_biggest_customers.gsql b/agile_operations/supply_chain_management/queries/get_biggest_customers.gsql new file mode 100644 index 00000000..a5b342d9 --- /dev/null +++ b/agile_operations/supply_chain_management/queries/get_biggest_customers.gsql @@ -0,0 +1,52 @@ +CREATE OR REPLACE DISTRIBUTED QUERY get_biggest_customers(INT top_k, DATETIME start_date, DATETIME end_date, SET<VERTEX<Nation>> input_nations) FOR GRAPH Supply_Chain_Management { + /* + Description: + Top-K Customers by Delivered Quantity (or Revenue Proxy) in a selected period + with optional geographic filtering (by Nation / Region). + + Instantly ranks customers based on actual shipped/delivered volume from + Sales Orders - far more accurate than booked revenue or invoice data, + because it reflects real physical fulfillment. + + Parameters: + top_k (INT): + Number of top customers to return (e.g., 10, 20, 50) + start_date / end_date (DATETIME): + Time window for analysis (e.g., last 12 months, current FY) + input_nations (SET<VERTEX<Nation>>): + Optional filter - only include customers from these countries. + Pass empty set {} to include all nations globally. + + Output: + Ranked list of Customer vertices with accumulated delivered quantity + Ready for visualization in GraphStudio, BI tools, or REST API exposure. 
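+ + Example (a minimal sketch; the dates are placeholders and the empty set + disables the nation filter): + RUN QUERY get_biggest_customers(10, "2023-01-01 00:00:00", "2023-12-31 23:59:59", []) 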
+ */ + + SumAccum<INT> @total_quantity; + + // if no nations, take all nations + nations = {input_nations}; + IF nations.size() == 0 THEN + nations = {Nation.*}; + END; + // get all customers in nations + customers = + SELECT t FROM nations:s -(reverse_Belongs_To)- Customer:t; + // collect their sales orders (the date-range filter is currently disabled) + sales_orders = + SELECT t FROM customers:s -(Has_Sales_Order)- Sales_Order:t + //WHERE t.date BETWEEN start_date AND end_date + ; + // sum up the quantities of sales order items + sales_orders = + SELECT s FROM sales_orders:s -(Has_Sales_Order_Item)- Sales_Order_Item:t + ACCUM s.@total_quantity += t.quantity; + // sum up sums on customers and sort customers + customers = + SELECT t FROM sales_orders:s -(reverse_Has_Sales_Order)- Customer:t + ACCUM t.@total_quantity += s.@total_quantity + ORDER BY t.@total_quantity DESC + LIMIT top_k; + PRINT customers[customers.@total_quantity]; +} + diff --git a/agile_operations/supply_chain_management/queries/get_biggest_suppliers.gsql b/agile_operations/supply_chain_management/queries/get_biggest_suppliers.gsql new file mode 100644 index 00000000..3eeb843d --- /dev/null +++ b/agile_operations/supply_chain_management/queries/get_biggest_suppliers.gsql @@ -0,0 +1,51 @@ +CREATE OR REPLACE DISTRIBUTED QUERY get_biggest_suppliers(INT top_k, DATETIME start_date, DATETIME end_date, SET<VERTEX<Nation>> input_nations) FOR GRAPH Supply_Chain_Management { + /* + Description: + Top-K Suppliers by Purchased/Inbound Volume in a selected time period + with optional geographic filtering (by Nation / Region). + + Ranks suppliers based on actual received or ordered quantity from + Purchase Orders - the most accurate measure of supplier spend and + strategic importance in manufacturing and distribution environments. + Parameters: + top_k (INT): + Number of top suppliers to return (e.g., 10, 25, 50) + start_date / end_date (DATETIME): + Analysis period (e.g., trailing 12 months, current fiscal year) + input_nations (SET<VERTEX<Nation>>): + Optional country/region filter. + Pass empty set {} to include all suppliers globally. + + Output: + Ranked list of Supplier vertices with total purchased quantity + Ready for GraphStudio visualization, Power BI, Tableau, or REST API. 
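+ + Example (a sketch mirroring get_biggest_customers; the nation ID "China" is an + assumption about the sample data - pass an empty set [] to rank globally): + RUN QUERY get_biggest_suppliers(25, "2023-01-01 00:00:00", "2023-12-31 23:59:59", ["China"]) 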
+ */ + + SumAccum<INT> @total_quantity; + + // if no nations, take all nations + nations = {input_nations}; + IF nations.size() == 0 THEN + nations = {Nation.*}; + END; + // get all suppliers in nations + suppliers = + SELECT t FROM nations:s -(reverse_Belongs_To)- Supplier:t; + // collect their purchase orders (the date-range filter is currently disabled) + purchase_orders = + SELECT t FROM suppliers:s -(Has_Purchase_Order)- Purchase_Order:t + //WHERE t.date BETWEEN start_date AND end_date + ; + // sum up the quantities of line numbers + purchase_orders = + SELECT s FROM purchase_orders:s -(Has_Line_Number)- Line_Number:t + ACCUM s.@total_quantity += t.quantity; + // sum up sums on suppliers and sort suppliers + suppliers = + SELECT t FROM purchase_orders:s -(reverse_Has_Purchase_Order)- Supplier:t + ACCUM t.@total_quantity += s.@total_quantity + ORDER BY t.@total_quantity DESC + LIMIT top_k; + PRINT suppliers[suppliers.@total_quantity]; +} + diff --git a/agile_operations/supply_chain_management/queries/plant_failure_impact_nodes.gsql b/agile_operations/supply_chain_management/queries/plant_failure_impact_nodes.gsql new file mode 100644 index 00000000..64adb82f --- /dev/null +++ b/agile_operations/supply_chain_management/queries/plant_failure_impact_nodes.gsql @@ -0,0 +1,132 @@ + CREATE OR REPLACE QUERY plant_failure_impact_nodes (VERTEX start_inv, INT max_hops = 8) { + /* + Description: + Critical Business Continuity & Risk Impact Analysis Query. + Answers the mission-critical question: + "If this manufacturing plant goes down tomorrow (fire, strike, natural disaster), + which finished goods, customer orders, and end customers will be directly impacted?" + + Performs a constrained BFS from a Plant -> through current inventory -> required materials + -> open sales order items -> customers. Returns the full downstream impact path + within seconds - impossible in traditional ERP or BI systems. + Parameters: + start_inv (VERTEX): + The manufacturing or distribution Plant vertex whose failure is simulated + max_hops (INT, default 8): + Maximum graph distance to traverse. 8 is sufficient for full global impact. + + Output: + . All reachable Customer vertices (affected end customers) + . Full path of edges showing exactly how disruption propagates + . Visualizable in GraphStudio as impact network + */ + + TYPEDEF TUPLE <VERTEX from_vertex, VERTEX to_vertex, EDGE curr_edge> Edge_Info; + SumAccum<INT> @@num_hops_traversed; // current step + ListAccum<STRING> @@error_message_list; + ListAccum<Edge_Info> @local_from_edges; + OrAccum @or_visited; + SetAccum<EDGE> @@edges_to_display; + + SetAccum<VERTEX> @@end_set; + SetAccum<VERTEX> @@vertices_to_display_set; + SetAccum<VERTEX> @@next_traverse_set; + + SetAccum<STRING> @@e_type_set; + + @@e_type_set += ( + "Plant_Has_Inventory", + "Inventory_Has_Material", + "reverse_For_Material", + "reverse_Has_Sales_Order_Item", + "Has_Sales_Order" + ); + + + start_vertices(ANY) = {start_inv}; + end_vertices(ANY) = {Customer.*}; + + // initialize the step + @@num_hops_traversed = 0; + + // start from the source node + start_vertices = SELECT s + FROM start_vertices:s + POST-ACCUM + s.@or_visited += TRUE, + @@vertices_to_display_set += s + ; + + // Select end vertex for visualization + end_vertices = SELECT s + FROM end_vertices:s + POST-ACCUM + @@vertices_to_display_set += s + ; + + // Check if end is visited. 
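+ // (How the impact path is recovered: each BFS expansion below stores, on every + // newly visited vertex, the edge it was reached through (@local_from_edges); + // once a Customer is reached, the second loop walks those stored edges back + // toward the source to reconstruct the exact propagation paths for display.) 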
+ visited_end = SELECT s + FROM end_vertices:s + WHERE s.@or_visited + ; + + // breadth-first search from source + WHILE (start_vertices.size() > 0 AND visited_end.size() == 0) LIMIT max_hops DO + @@num_hops_traversed += 1; + + start_vertices = SELECT t + FROM start_vertices:s -(@@e_type_set>:e)- :t + WHERE t.@or_visited == FALSE + ACCUM + t.@local_from_edges += Edge_Info(s, t, e) + POST-ACCUM + t.@or_visited += TRUE + ; + + visited_end = SELECT s + FROM end_vertices:s + WHERE s.@or_visited + ; + END; + + end_traverse_back = visited_end; + + // go from target to source to collect the edges + vertices visited + print end_traverse_back; + WHILE (end_traverse_back.size() > 0) DO + end_traverse_back = SELECT s + FROM end_traverse_back:s + POST-ACCUM + FOREACH e_info IN s.@local_from_edges DO + @@next_traverse_set += e_info.from_vertex, + @@edges_to_display += e_info.curr_edge + END + ; + + + end_traverse_back = {@@next_traverse_set}; + @@vertices_to_display_set += @@next_traverse_set; + @@next_traverse_set.clear(); + END; + + vertices_to_display = {@@vertices_to_display_set}; + + IF ( + @@end_set.size() == 1 AND + visited_end.size() == 0 + ) THEN + STRING not_found_error_str = ( + "Path "+ "not found within " + to_string(max_hops) + " hops. " + + to_string(@@num_hops_traversed) + " hops were traversed in total." + ); + + @@error_message_list += not_found_error_str; + END; + + PRINT + end_traverse_back, + @@error_message_list AS error_message_list + ; + PRINT vertices_to_display, @@edges_to_display AS edges_to_display; +} + diff --git a/agile_operations/supply_chain_management/queries/product_quantity_sales_order.gsql b/agile_operations/supply_chain_management/queries/product_quantity_sales_order.gsql new file mode 100644 index 00000000..0aa17672 --- /dev/null +++ b/agile_operations/supply_chain_management/queries/product_quantity_sales_order.gsql @@ -0,0 +1,57 @@ +CREATE OR REPLACE QUERY product_quantity_sales_order(VERTEX sales){ + /* + Description: + Real-time Available-to-Promise (ATP) Check - Basic Edition. + Answers the fundamental order fulfillment question: + "Do we have enough on-hand inventory (any location, any batch) to fulfill this sales order today?" + + Compares required quantity per material on a Sales Order against current + global inventory (Inventory_Held) and instantly reports whether the order + can be fully satisfied. + + Parameters: + sales (VERTEX): + The Sales Order vertex to validate + + Output: + . Clear fulfillment decision message + . Detailed map of materials with sufficient inventory (location + quantity) + . 
Immediate visibility into which items block full fulfillment + + */ + + TYPEDEF TUPLE <VERTEX<Inventory_Held> inv, INT quant> myTuple; + + HeapAccum<myTuple>(1, quant ASC, inv ASC) @invHeap; + MapAccum<VERTEX, INT> @product_sales_quantity; + MapAccum<VERTEX, MapAccum<VERTEX, INT>> @@product_inventory; + ListAccum<STRING> @@print_message; + + Start = {sales}; + + soi = SELECT s FROM Start:so-(Has_Sales_Order_Item:i)-Sales_Order_Item:s; + + prod = SELECT p FROM soi:s-(For_Material:r)-SFC_Material:p + ACCUM + p.@product_sales_quantity += (p->s.quantity); + + product_result = SELECT p FROM prod:p-(Inventory_Has_Material:c)- Inventory_Held:i + ACCUM + IF i.quantity >= p.@product_sales_quantity.get(p) THEN + p.@invHeap += myTuple(i,i.quantity) + END + POST-ACCUM + IF p.@invHeap.size() > 0 THEN + @@product_inventory += (p->(p.@invHeap.top().inv->p.@invHeap.top().quant)), + p.@invHeap.pop() + END; + + IF @@product_inventory.size() == prod.size() THEN + @@print_message += "Sales Order can be satisfied for all products since they are available in inventory"; + ELSE + INT current = prod.size() - @@product_inventory.size(); + @@print_message += "Sales Order cannot be satisfied for " + to_string(current) + " of the products."; + END; + + PRINT @@print_message; + PRINT @@product_inventory; + } + diff --git a/agile_operations/supply_chain_management/queries/queries.gsql b/agile_operations/supply_chain_management/queries/queries.gsql deleted file mode 100644 index dccf3ad1..00000000 --- a/agile_operations/supply_chain_management/queries/queries.gsql +++ /dev/null @@ -1,568 +0,0 @@ -CREATE DISTRIBUTED QUERY explore_BOM(VERTEX input_vert, INT depth, BOOL upstream) FOR GRAPH Supply_Chain_Management { - - SetAccum @@edges; - OrAccum @visited; - - // get src - verts = {input_vert}; - - // while loop - WHILE verts.size() > 0 LIMIT depth DO - // traverse up or down - IF upstream == TRUE THEN - verts = - SELECT t FROM verts:s -((Supplies|reverse_Has_Component_Material|reverse_Produced_By):e)- (Material|BOM):t - WHERE t.@visited == FALSE - ACCUM @@edges += e - POST-ACCUM - s.@visited = TRUE; - ELSE - verts = - SELECT t FROM verts:s -((reverse_Supplies|Has_Component_Material|Produced_By):e)- (Material|BOM|Supplier):t - WHERE t.@visited == FALSE - ACCUM @@edges += e - POST-ACCUM - s.@visited = TRUE; - END; - END; - - // print edges - PRINT @@edges; -} - -CREATE DISTRIBUTED QUERY explore_BOM_line( - VERTEX input_vert, - INT depth, - BOOL upstream, - BOOL use_date_range, - DATETIME start_date, - DATETIME end_date) FOR GRAPH Supply_Chain_Management { - - SetAccum @@edges; - OrAccum @visited; - - // get src - verts = {input_vert}; - - // while loop - WHILE verts.size() > 0 LIMIT depth DO - // traverse up or down - IF upstream == TRUE THEN - verts = - SELECT t FROM verts:s -((Has_Purchase_Order|Has_Line_Number|reverse_Used_For| - reverse_Has_Component_SFC|reverse_To_Be_Produced_By|reverse_For_Material| - reverse_Has_Sales_Order_Item|reverse_Has_Sales_Order):e)- - (Purchase_Order|Line_Number|SFC_Material|SFC_Assembly|Sales_Order_Item|Sales_Order|Customer):t - WHERE t.@visited == FALSE - // relevant vertices in date range (SFC_Assembly,Sales_Order,Purchase_Order) - ACCUM @@edges += e - POST-ACCUM - s.@visited = TRUE; - ELSE - verts = - SELECT t FROM verts:s -((reverse_Has_Purchase_Order|reverse_Has_Line_Number|Used_For| - Has_Component_SFC|To_Be_Produced_By|For_Material| - Has_Sales_Order_Item|Has_Sales_Order):e)- - (Supplier|Purchase_Order|Line_Number|SFC_Material|SFC_Assembly|Sales_Order_Item|Sales_Order):t - WHERE t.@visited == FALSE - // relevant vertices in date range (SFC_Assembly,Sales_Order,Purchase_Order) - ACCUM @@edges += e - POST-ACCUM - 
s.@visited = TRUE; - END; - END; - - // print edges - PRINT @@edges; - PRINT "explore_BOMLine works!"; -} - -CREATE DISTRIBUTED QUERY get_biggest_customers(INT top_k, DATETIME start_date, DATETIME end_date, SET input_nations) FOR GRAPH Supply_Chain_Management { - - SumAccum @total_quantity; - - // if no nations, take all nations - nations = {input_nations}; - IF nations.size() == 0 THEN - nations = {Nation.*}; - END; - // get all customers in nations - customers = - SELECT t FROM nations:s -(reverse_Belongs_To)- Customer:t; - // filter their sales orders by date range - sales_orders = - SELECT t FROM customers:s -(Has_Sales_Order)- Sales_Order:t - WHERE t.date BETWEEN start_date AND end_date; - // sum up the quantities of sales order items - sales_orders = - SELECT s FROM sales_orders:s -(Has_Sales_Order_Item)- Sales_Order_Item:t - ACCUM s.@total_quantity += t.quantity; - // sum up sums on customers and sort customers - customers = - SELECT t FROM sales_orders:s -(reverse_Has_Sales_Order)- Customer:t - ACCUM t.@total_quantity += s.@total_quantity - ORDER BY t.@total_quantity DESC - LIMIT top_k; - PRINT customers[customers.@total_quantity]; -} - -CREATE DISTRIBUTED QUERY get_biggest_suppliers(INT top_k, DATETIME start_date, DATETIME end_date, SET input_nations) FOR GRAPH Supply_Chain_Management { - - SumAccum @total_quantity; - - // if no nations, take all nations - nations = {input_nations}; - IF nations.size() == 0 THEN - nations = {Nation.*}; - END; - // get all suppliers in nations - suppliers = - SELECT t FROM nations:s -(reverse_Belongs_To)- Supplier:t; - // filter their purchase orders by date range - purchase_orders = - SELECT t FROM suppliers:s -(Has_Purchase_Order)- Purchase_Order:t - WHERE t.date BETWEEN start_date AND end_date; - // sum up the quantities of line numbers - purchase_orders = - SELECT s FROM purchase_orders:s -(Has_Line_Number)- Line_Number:t - ACCUM s.@total_quantity += t.quantity; - // sum up sums on suppliers and sort suppliers - suppliers = - SELECT t FROM purchase_orders:s -(reverse_Has_Purchase_Order)- Supplier:t - ACCUM t.@total_quantity += s.@total_quantity - ORDER BY t.@total_quantity DESC - LIMIT top_k; - PRINT suppliers[suppliers.@total_quantity]; -} - -CREATE DISTRIBUTED QUERY trace_BOM_line_nations(VERTEX input_fg) FOR GRAPH Supply_Chain_Management { - OrAccum @visited; - MaxAccum @@src_nation, @rm_nation; - SumAccum @@domestic_quantity, @@international_quantity; - - verts (ANY) = {input_fg}; - - sfc_assembly = - SELECT t FROM verts:s -(To_Be_Produced_By)- SFC_Assembly:t; - bom = - SELECT t FROM sfc_assembly:s -(Based_On)- BOM:t; - plant = - SELECT t FROM bom:s -(Happens_At)- Plant:t; - src_nation = - SELECT t FROM plant:s -(Belongs_To)- Nation:t - POST-ACCUM @@src_nation += t; - - raw_materials (ANY) = {}; - WHILE verts.size() > 0 DO - // traverse until suppliers - // categorize raw materials as domestic or international - verts = - SELECT t FROM verts:s -((Has_Component_SFC|To_Be_Produced_By|For_Material):e)- - (SFC_Material|SFC_Assembly):t - WHERE t.@visited == FALSE - POST-ACCUM - s.@visited = TRUE; - - // collect raw materials (sfc_materials with edge to line number) - temp_raw_materials = - SELECT s FROM verts:s WHERE s.outdegree("Used_For") > 0; - raw_materials = raw_materials UNION temp_raw_materials; - END; - - // traverse from raw materials -> line number -> purchase order -> supplier (pass in MaxAccum) - line_numbers = - SELECT t FROM raw_materials:s -(Used_For)- Line_Number:t - POST-ACCUM t.@visited += TRUE; - purchase_orders = - SELECT t 
FROM line_numbers:s -(reverse_Has_Line_Number)- Purchase_Order:t - ACCUM t.@visited += TRUE; - suppliers = - SELECT t FROM purchase_orders:s -(reverse_Has_Purchase_Order)- Supplier:t - POST-ACCUM t.@visited += TRUE; - suppliers = - SELECT s FROM suppliers:s -(Belongs_To)- Nation:t - POST-ACCUM t.@rm_nation += t; - purchase_orders = - SELECT s FROM purchase_orders:s -(reverse_Has_Purchase_Order)- Supplier:t - ACCUM s.@rm_nation += t.@rm_nation; - line_numbers = - SELECT s FROM line_numbers:s -(reverse_Has_Line_Number)- Purchase_Order:t - ACCUM s.@rm_nation += t.@rm_nation; - raw_materials = - SELECT s FROM raw_materials:s -(Used_For)- Line_Number:t - ACCUM - IF t.@rm_nation == @@src_nation THEN - @@domestic_quantity += s.quantity - ELSE - @@international_quantity += s.quantity - END; - - PRINT @@domestic_quantity, @@international_quantity; -} - -CREATE OR REPLACE QUERY add_purchase_to_inventory(VERTEX purchase_order, SET invs){ - /* This query updates the inventory for the product supplied by the supplier */ - - SumAccum @quantity; - MinAccum @date; - MapAccum>>> @@prev_inv_map, @@curr_inv_map; - STRING err, prev, curr; - - Start = {purchase_order}; - - poi = SELECT p FROM Start:s -(Has_Line_Number:i)- Line_Number:p - ACCUM - p.@date += s.date; - - prod = SELECT pr FROM poi:p -(reverse_Used_For:r)- SFC_Material:pr - ACCUM - pr.@quantity += p.quantity, - pr.@date += p.@date; - - inventories = to_vertex_set(invs,"Inventory_Held"); - - updated_inventory_1 = SELECT i FROM inventories:i -(Inventory_Has_Material:c)- SFC_Material:p - ACCUM - @@prev_inv_map += (i-> (p -> (i.last_updated -> i.quantity))); - - updated_inventory = SELECT i FROM inventories:i -(Inventory_Has_Material:c)- SFC_Material:p - ACCUM - INT quant = i.quantity + p.@quantity, - @@curr_inv_map += (i-> (p -> (p.@date -> quant))); - - - IF @@prev_inv_map.size()== 0 THEN - PRINT("There is no inventory for this product at specified inventory id."); - ELSE - PRINT( "The previous inventory for the product was- "); - PRINT(@@prev_inv_map); - PRINT("The current inventory for the product is- "); - PRINT(@@curr_inv_map); - END; - -} - -CREATE OR REPLACE QUERY check_shipment_capacity(VERTEX so){ - /* This query checks shipment capacity and sales order quantity and to check if the order has been fulfilled */ - - - MapAccum @@prod_fulfillment, @@prod_unfulfillment, @@prod_total_quantity; - MaxAccum @my_quantity; - - Start = {so}; - - soi = SELECT si FROM Start:s -(Has_Sales_Order_Item:i)- Sales_Order_Item:si; - - prod = SELECT p FROM soi:si -(For_Material:s)- SFC_Material:p - ACCUM - p.@my_quantity += si.quantity; - - ship = SELECT p FROM prod:p -(Carries_Product:c)- Shipment_Item:sh - ACCUM - IF p.@my_quantity > sh.quantity THEN - INT total_unfulfilled = p.@my_quantity - sh.quantity, - @@prod_fulfillment += (p -> sh.quantity), - @@prod_unfulfillment += (p -> total_unfulfilled), - @@prod_total_quantity += (p-> p.@my_quantity) - END; - - PRINT @@prod_total_quantity; - PRINT @@prod_fulfillment; - PRINT @@prod_unfulfillment; - -} - -CREATE OR REPLACE QUERY datewise_product_availability(VERTEX sales, INT inventory_interval_threshold){ - /* This query gets the product quantity required in a sales order and checks inventory for the product and checks the date when the - inventory was last updated. 
If inventory was updated more than inventory_interval_threshold ago, order will not be fulfilled */ - - TYPEDEF TUPLE myTuple; - - HeapAccum(1,quant ASC, inv ASC) @invHeap; - MapAccum> @product_sales_quantity; - MapAccum>> @@product_inventory; - ListAccum @@print_message; - MinAccum @prod_sales_date; - - Start = {sales}; - - soi = SELECT s FROM Start:so-(Has_Sales_Order_Item:i)-Sales_Order_Item:s - ACCUM - s.@prod_sales_date += so.date; - - prod = SELECT p FROM soi:s-(For_Material:r)-SFC_Material:p - ACCUM - p.@product_sales_quantity += (p->s.quantity), - p.@prod_sales_date += soi.@prod_sales_date; - - product_result = SELECT p FROM prod:p-(Inventory_Has_Material:c)- Inventory_Held:i - ACCUM - IF i.quantity >= p.@product_sales_quantity.get(p) AND i.last_updated >= datetime_sub(p.@prod_sales_date, INTERVAL inventory_interval_threshold MONTH) THEN - p.@invHeap += myTuple(i,i.quantity) - END - POST-ACCUM - IF p.@invHeap.size() > 0 THEN - @@product_inventory += (p->(p.@invHeap.top().inv->p.@invHeap.top().quant)), - p.@invHeap.pop() - END; - - IF @@product_inventory.size() == prod.size() THEN - @@print_message = "Sales Order can be satisfied for all products since they are available in inventory"; - ELSE - @@print_message = "Sales Order can only be satisfied for " + to_string(@@product_inventory.size()) + " count of products."; - END; - - PRINT @@print_message; - PRINT @@product_inventory; - } - - CREATE OR REPLACE QUERY plant_failure_impact_nodes (VERTEX start_inv,INT max_hops = 8) { - // BFS Shortest Path - base version that can traverse through all these edges: - // Stores, Contains, SO_Requests, Includes_SO, Places - // Adapted from the following BFS algorithm: - // https://github.com/tigergraph/gsql-graph-algorithms/blob/master/algorithms/Path/bfs/tg_bfs.gsql - - TYPEDEF TUPLE Edge_Info; - SumAccum @@num_hops_traversed; // current step - ListAccum @@error_message_list; - ListAccum @local_from_edges; - OrAccum @or_visited; - SetAccum @@edges_to_display; - - SetAccum @@end_set; - SetAccum @@vertices_to_display_set; - SetAccum @@next_traverse_set; - - SetAccum @@e_type_set; - - @@e_type_set += ( - "Plant_Has_Inventory", - "Inventory_Has_Material", - "reverse_For_Material", - "reverse_Has_Sales_Order_Item", - "Has_Sales_Order" - ); - - - start_vertices(ANY) = {start_inv}; - end_vertices(ANY) = {Customer.*}; - - // initialize the step - @@num_hops_traversed = 0; - - // start from the source node - start_vertices = SELECT s - FROM start_vertices:s - POST-ACCUM - s.@or_visited += TRUE, - @@vertices_to_display_set += s - ; - - // Select end vertex for visualization - end_vertices = SELECT s - FROM end_vertices:s - POST-ACCUM - @@vertices_to_display_set += s - ; - - // Check if end is visited. 
- visited_end = SELECT s - FROM end_vertices:s - WHERE s.@or_visited - ; - - // breadth-first search from source - WHILE (start_vertices.size() > 0 AND visited_end.size() == 0) LIMIT max_hops DO - @@num_hops_traversed += 1; - - start_vertices = SELECT t - FROM start_vertices:s -(@@e_type_set>:e)- :t - WHERE t.@or_visited == FALSE - ACCUM - t.@local_from_edges += Edge_Info(s, t, e) - POST-ACCUM - t.@or_visited += TRUE - ; - - visited_end = SELECT s - FROM end_vertices:s - WHERE s.@or_visited - ; - END; - - end_traverse_back = visited_end; - - // go from target to source to collect the edges + vertices visited - WHILE (end_traverse_back.size() > 0) DO - end_traverse_back = SELECT s - FROM end_traverse_back:s - POST-ACCUM - FOREACH e_info IN s.@local_from_edges DO - @@next_traverse_set += e_info.from_vertex, - @@edges_to_display += e_info.curr_edge - END - ; - - - end_traverse_back = {@@next_traverse_set}; - @@vertices_to_display_set += @@next_traverse_set; - @@next_traverse_set.clear(); - END; - - vertices_to_display = {@@vertices_to_display_set}; - - IF ( - @@end_set.size() == 1 AND - visited_end.size() == 0 - ) THEN - STRING not_found_error_str = ( - "Path "+ "not found within " + to_string(max_hops) + " hops. " + - to_string(@@num_hops_traversed) + " hops were traversed in total." - ); - - @@error_message_list += not_found_error_str; - END; - - PRINT - end_traverse_back, - @@error_message_list AS error_message_list - ; - PRINT vertices_to_display, @@edges_to_display AS edges_to_display; -} - -CREATE OR REPLACE QUERY product_quantity_sales_order(VERTEX sales){ - /* This query gets the product quantity required in a sales order and checks inventory for the product */ - - TYPEDEF TUPLE myTuple; - - HeapAccum(1,quant ASC, inv ASC) @invHeap; - MapAccum> @product_sales_quantity; - MapAccum>> @@product_inventory; - ListAccum @@print_message; - - Start = {sales}; - - soi = SELECT s FROM Start:so-(Has_Sales_Order_Item:i)-Sales_Order_Item:s; - - prod = SELECT p FROM soi:s-(For_Material:r)-SFC_Material:p - ACCUM - p.@product_sales_quantity += (p->s.quantity); - - product_result = SELECT p FROM prod:p-(Inventory_Has_Material:c)- Inventory_Held:i - ACCUM - IF i.quantity >= p.@product_sales_quantity.get(p) THEN - p.@invHeap += myTuple(i,i.quantity) - END - POST-ACCUM - @@product_inventory += (p->(p.@invHeap.top().inv->p.@invHeap.top().quant)), - p.@invHeap.pop(); - - IF @@product_inventory.size() == prod.size() THEN - @@print_message = "Sales Order can be satisfied for all products since they are available in inventory"; - ELSE - INT current = prod.size() -@@product_inventory.size(); - @@print_message = "Sales Order can only be satisfied for " + to_string(current) + "count of products."; - END; - - PRINT @@print_message; - PRINT @@product_inventory; - } - - CREATE OR REPLACE QUERY top_k_products_purchase_order(INT k = 10) { - /* Get the top purchased products from purchase orders */ - - TYPEDEF TUPLE prod_tuple; - - HeapAccum(k,quantity DESC, p ASC) @@prod_heap; - - Start = {Purchase_Order.*}; - - res = SELECT p FROM Start:s -(Has_Line_Number:i)- Line_Number:p; - - res = SELECT p FROM res:r -(reverse_Used_For:s)- SFC_Material:p - ACCUM - @@prod_heap += prod_tuple(p,r.quantity); - - PRINT @@prod_heap; -} - -CREATE OR REPLACE QUERY top_k_products_sales_order(INT k = 10) { - /* Get the top purchased products from sales orders */ - - TYPEDEF TUPLE prod_tuple; - - HeapAccum(k,quantity DESC, p ASC) @@prod_heap; - - Start = {Sales_Order.*}; - - res = SELECT p FROM Start:s -(Has_Sales_Order_Item:i)- 
Sales_Order_Item:p; - - res = SELECT p FROM res:r -(For_Material:s)- SFC_Material:p - ACCUM - @@prod_heap += prod_tuple(p,r.quantity); - - PRINT @@prod_heap; -} - -CREATE OR REPLACE QUERY unfulfilled_orders(Set> sales_set) { - /* This query gets multiple sales orders as parameters and checks inventory for - products in the sales order. It fulfills order and keeps track of inventory level. */ - - MapAccum> @prod_per_sales; - MapAccum>> @@prod_per_inv; - MapAccum> @@total_prod_inv; - ListAccum @@unfulfilled_orders; - - OrAccum @order_fulfilled = TRUE; - - Start = {sales_set}; - - res = SELECT si FROM Start:s-(Has_Sales_Order_Item:i)-Sales_Order_Item:si; - - res = SELECT si FROM res:si-(For_Material:r)-SFC_Material:p - ACCUM - si.@prod_per_sales += (p-> si.quantity); - - inv = SELECT i FROM Inventory_Held:i -(Inventory_Has_Material:c)- SFC_Material:p - ACCUM - @@prod_per_inv += (p->(i->i.quantity)), - @@total_prod_inv += (p->i.quantity); - - - res_1 = SELECT si FROM res:si-(For_Material:r)-SFC_Material:p - ACCUM - FOREACH (key,value) IN @@prod_per_inv.get(p) DO - IF @@total_prod_inv.get(p) >= si.@prod_per_sales.get(p) THEN - BREAK - END - END, - IF @@total_prod_inv.get(p) < si.@prod_per_sales.get(p) THEN - si.@order_fulfilled += FALSE - END; - - PRINT @@prod_per_inv AS Previous_Product_Inventory; - - res_2 = SELECT si FROM res:si-(For_Material:r)-SFC_Material:p - ACCUM - IF NOT si.@order_fulfilled THEN - @@unfulfilled_orders += si - ELSE - FOREACH (key,value) IN @@prod_per_inv.get(p) DO - IF value >= si.@prod_per_sales.get(p) THEN - INT val = value - si.@prod_per_sales.get(p), - @@prod_per_inv += (p->(key->val)), - BREAK - ELSE - si.@prod_per_sales += (p->si.@prod_per_sales.get(p)-value), - @@prod_per_inv += (p->(key->0)) - END - END - END; - - PRINT @@prod_per_inv AS Current_Product_Inventory; - - PRINT @@unfulfilled_orders AS Orders_that_cannot_be_fulfilled; - PRINT res_2; - - -} \ No newline at end of file diff --git a/agile_operations/supply_chain_management/queries/top_k_product_sales_order.gsql b/agile_operations/supply_chain_management/queries/top_k_product_sales_order.gsql new file mode 100644 index 00000000..67bdff09 --- /dev/null +++ b/agile_operations/supply_chain_management/queries/top_k_product_sales_order.gsql @@ -0,0 +1,32 @@ +CREATE OR REPLACE QUERY top_k_products_sales_order(INT k = 10) { + /* + Description: + Global Top-K Best-Selling / Most Demanded Materials (by total ordered quantity) + across all Sales Orders in the entire graph. + + This is the true "voice of the customer" - reveals which finished goods, + semi-finished items, or configurable materials drive real revenue and + demand volume. Essential for S&OP, demand planning, and product strategy. 
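+ + Example invocation (a sketch, not part of the original kit; assumes the kit's Supply_Chain_Management graph and that the query is installed): + gsql -g Supply_Chain_Management "RUN QUERY top_k_products_sales_order(20)"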
+ Parameters: + k (INT, default 10): + Number of top-selling materials to return (e.g., 10, 20, 50, 100) + Output: + Ranked heap of SFC_Material vertices with total sold/ordered quantity + + */ + + TYPEDEF TUPLE<VERTEX p, INT quantity> prod_tuple; + + HeapAccum<prod_tuple>(k, quantity DESC, p ASC) @@prod_heap; + + Start = {Sales_Order.*}; + + res = SELECT p FROM Start:s -(Has_Sales_Order_Item:i)- Sales_Order_Item:p; + + res = SELECT p FROM res:r -(For_Material:s)- SFC_Material:p + ACCUM + @@prod_heap += prod_tuple(p,r.quantity); + + PRINT @@prod_heap; +} + diff --git a/agile_operations/supply_chain_management/queries/top_k_products_purchase_order.gsql b/agile_operations/supply_chain_management/queries/top_k_products_purchase_order.gsql new file mode 100644 index 00000000..4d5d6ee8 --- /dev/null +++ b/agile_operations/supply_chain_management/queries/top_k_products_purchase_order.gsql @@ -0,0 +1,32 @@ +CREATE OR REPLACE QUERY top_k_products_purchase_order(INT k = 10) { + /* + Description: + Global Top-K Most Purchased Materials (by total ordered quantity) + across all Purchase Orders in the entire graph. + + Reveals true consumption drivers - the raw, semi-finished, or finished + materials that dominate procurement volume. This is the real "voice of + the supply chain" for demand and spend patterns. + + Parameters: + k (INT, default 10): + Number of top materials to return (e.g., 10, 25, 50, 100) + + Output: + Ranked heap of SFC_Material vertices with total purchased quantity + */ + + TYPEDEF TUPLE<VERTEX p, INT quantity> prod_tuple; + + HeapAccum<prod_tuple>(k, quantity DESC, p ASC) @@prod_heap; + + Start = {Purchase_Order.*}; + + res = SELECT p FROM Start:s -(Has_Line_Number:i)- Line_Number:p; + + res = SELECT p FROM res:r -(reverse_Used_For:s)- SFC_Material:p + ACCUM + @@prod_heap += prod_tuple(p,r.quantity); + + PRINT @@prod_heap; +} diff --git a/agile_operations/supply_chain_management/queries/trace_bom_line_nations.gsql b/agile_operations/supply_chain_management/queries/trace_bom_line_nations.gsql new file mode 100644 index 00000000..97380664 --- /dev/null +++ b/agile_operations/supply_chain_management/queries/trace_bom_line_nations.gsql @@ -0,0 +1,86 @@ +CREATE OR REPLACE DISTRIBUTED QUERY trace_BOM_line_nations(VERTEX input_fg) FOR GRAPH Supply_Chain_Management { + /* + Description: + Domestic vs. International Sourcing Analysis for a Specific Finished Good Batch. + + Starting from a real produced instance (SFC_Material) of a finished good, + this query traces the full transactional BOM lineage upstream through + actual production and procurement events to determine: + . What quantity of input materials was sourced domestically + . What quantity was sourced internationally + + This is a true "Country of Origin" and supply chain sovereignty query - + critical for compliance, risk, ESG, trade policy, and cost analysis.
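+ + Example invocation (a sketch; "FG-BATCH-001" is a hypothetical SFC_Material vertex ID - substitute one from your data): + gsql -g Supply_Chain_Management 'RUN QUERY trace_BOM_line_nations(("FG-BATCH-001", "SFC_Material"))'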
+ + Parameters: + input_fg (VERTEX): + A specific produced batch/instance of a finished good (actual SFC_Material vertex) + + Output: + @@domestic_quantity -> Total quantity sourced from the same nation as the final assembly plant + @@international_quantity -> Total quantity sourced from all other nations + */ + OrAccum @visited; + MaxAccum<VERTEX> @@src_nation, @rm_nation; + SumAccum<INT> @@domestic_quantity, @@international_quantity; + + verts (ANY) = {input_fg}; + + sfc_assembly = + SELECT t FROM verts:s -(To_Be_Produced_By)- SFC_Assembly:t; + bom = + SELECT t FROM sfc_assembly:s -(Based_On)- BOM:t; + plant = + SELECT t FROM bom:s -(Happens_At)- Plant:t; + src_nation = + SELECT t FROM plant:s -(Belongs_To)- Nation:t + POST-ACCUM @@src_nation += t; + + raw_materials (ANY) = {}; + WHILE verts.size() > 0 DO + // traverse until suppliers + // categorize raw materials as domestic or international + verts = + SELECT t FROM verts:s -((Has_Component_SFC|To_Be_Produced_By|For_Material):e)- + (SFC_Material|SFC_Assembly):t + WHERE t.@visited == FALSE + POST-ACCUM + s.@visited = TRUE; + + // collect raw materials (sfc_materials with edge to line number) + temp_raw_materials = + SELECT s FROM verts:s WHERE s.outdegree("Used_For") > 0; + raw_materials = raw_materials UNION temp_raw_materials; + END; + + // traverse from raw materials -> line number -> purchase order -> supplier (pass in MaxAccum) + line_numbers = + SELECT t FROM raw_materials:s -(Used_For)- Line_Number:t + POST-ACCUM t.@visited += TRUE; + purchase_orders = + SELECT t FROM line_numbers:s -(reverse_Has_Line_Number)- Purchase_Order:t + ACCUM t.@visited += TRUE; + suppliers = + SELECT t FROM purchase_orders:s -(reverse_Has_Purchase_Order)- Supplier:t + POST-ACCUM t.@visited += TRUE; + suppliers = + SELECT s FROM suppliers:s -(Belongs_To)- Nation:t + // record each supplier's nation on the supplier itself so it can be + // passed back down the chain (the original accumulated into t, which + // left the supplier's @rm_nation empty) + POST-ACCUM s.@rm_nation += t; + purchase_orders = + SELECT s FROM purchase_orders:s -(reverse_Has_Purchase_Order)- Supplier:t + ACCUM s.@rm_nation += t.@rm_nation; + line_numbers = + SELECT s FROM line_numbers:s -(reverse_Has_Line_Number)- Purchase_Order:t + ACCUM s.@rm_nation += t.@rm_nation; + raw_materials = + SELECT s FROM raw_materials:s -(Used_For)- Line_Number:t + ACCUM + IF t.@rm_nation == @@src_nation THEN + @@domestic_quantity += s.quantity + ELSE + @@international_quantity += s.quantity + END; + + PRINT @@domestic_quantity, @@international_quantity; +} + diff --git a/agile_operations/supply_chain_management/queries/unfulfilled_orders.gsql b/agile_operations/supply_chain_management/queries/unfulfilled_orders.gsql new file mode 100644 index 00000000..15ecf274 --- /dev/null +++ b/agile_operations/supply_chain_management/queries/unfulfilled_orders.gsql @@ -0,0 +1,84 @@ +CREATE OR REPLACE QUERY unfulfilled_orders(Set<VERTEX<Sales_Order>> sales_set) { + /* + Description: + Batch Available-to-Promise (ATP) with Real-Time Inventory Allocation. + + Takes a set of Sales Orders (any size) and performs a true multi-order, + multi-material ATP check using current global inventory. + Automatically allocates inventory to fulfill orders in the order they + appear in the input set and reports: + . Which orders can be fully fulfilled + . Which orders remain unfulfilled (or partially fulfilled) + . Updated inventory levels after allocation + + This is the production-grade "order promising engine" used by advanced + order management systems - but running natively on the graph in real time. + + Parameters: + sales_set (SET<VERTEX<Sales_Order>>): + Set of Sales Order vertices to evaluate and allocate (FIFO order) + + Output: + .
Previous_Product_Inventory -> inventory before allocation + . Current_Product_Inventory -> inventory after allocation + . Orders_that_cannot_be_fulfilled -> list of fully/partially unfulfilled orders + */ + + MapAccum<VERTEX, INT> @prod_per_sales; + MapAccum<VERTEX, MapAccum<VERTEX, INT>> @@prod_per_inv; + MapAccum<VERTEX, INT> @@total_prod_inv; + ListAccum<VERTEX> @@unfulfilled_orders; + + OrAccum @order_fulfilled = TRUE; + + Start = {sales_set}; + + res = SELECT si FROM Start:s-(Has_Sales_Order_Item:i)-Sales_Order_Item:si; + + res = SELECT si FROM res:si-(For_Material:r)-SFC_Material:p + ACCUM + si.@prod_per_sales += (p-> si.quantity); + + inv = SELECT i FROM Inventory_Held:i -(Inventory_Has_Material:c)- SFC_Material:p + ACCUM + @@prod_per_inv += (p->(i->i.quantity)), + @@total_prod_inv += (p->i.quantity); + + + res_1 = SELECT si FROM res:si-(For_Material:r)-SFC_Material:p + ACCUM + FOREACH (key,value) IN @@prod_per_inv.get(p) DO + IF @@total_prod_inv.get(p) >= si.@prod_per_sales.get(p) THEN + BREAK + END + END, + IF @@total_prod_inv.get(p) < si.@prod_per_sales.get(p) THEN + si.@order_fulfilled += FALSE + END; + + PRINT @@prod_per_inv AS Previous_Product_Inventory; + + res_2 = SELECT si FROM res:si-(For_Material:r)-SFC_Material:p + ACCUM + IF NOT si.@order_fulfilled THEN + @@unfulfilled_orders += si + ELSE + FOREACH (key,value) IN @@prod_per_inv.get(p) DO + IF value >= si.@prod_per_sales.get(p) THEN + INT val = value - si.@prod_per_sales.get(p), + @@prod_per_inv += (p->(key->val)), + BREAK + ELSE + si.@prod_per_sales += (p->si.@prod_per_sales.get(p)-value), + @@prod_per_inv += (p->(key->0)) + END + END + END; + + PRINT @@prod_per_inv AS Current_Product_Inventory; + + PRINT @@unfulfilled_orders AS Orders_that_cannot_be_fulfilled; + PRINT res_2; + + +} diff --git a/agile_operations/supply_chain_management/readme.md b/agile_operations/supply_chain_management/readme.md deleted file mode 100644 index c97ea0ef..00000000 --- a/agile_operations/supply_chain_management/readme.md +++ /dev/null @@ -1,39 +0,0 @@ -# Narratives - -Inventory Management is the process of ordering, storing, and using a company's inventory. In retail, manufacturing, and other inventory-intensive sectors, a company's raw materials and finished goods are the core of its business. A shortage of inventory when and where needed can be detrimental and at the same time a large inventory is at risk of spoilage, theft, damage, or shifts in demand. To manage inventory and transport logistics, it is important to understand the quantity of materials or products available in inventory. - -Products in inventory management have intricate relationships with various components of the supply chain also having a network of dependencies for each calculation. TigerGraph's ability to model and navigate complex relationships makes it the ideal choice. Querying is also more intuitive and faster with TigerGraph leading to more efficient querying. The large volumes of products, their inventories, along with the other components in a supply chain require a scalable database platform which has the ability to maintain the large data like TigerGraph. - -# Components - -This repository includes multiple components: - -- `data` - Sample data. -- `load_jobs` - Scripts for data loading tasks. -- `meta` - Solution Kit metadata. -- `queries` - Collection of GSQL queries. -- `schema` - Definition of database schema. -- `readme.md` - This usage guide. -- `setup.sh` - Automated setup script.
- -# Instructions - -The `setup.sh` script is designed to streamline the initial setup process by sequentially executing the following steps: - -1. **Schema Creation**: Initiates the schema creation process with the `schema/create_inventory_management_graph.gsql` script. -2. **Data Loading**: Load data into the schema by running the data loading jobs with the scripts in the `loading_job` folder. -3. **Query Installation**: Completes the setup by installing necessary queries through the `queries/install_queries.sh` script. - -## Query Explanations - -We have different queries to perform the following tasks: - -1. **Available Inventory Measure:** This query shows the available inventory for every product. It also tells us how many inventories a particular product is stored at and the time at which the inventory stock value was last updated. With this we can get a general idea of inventory and make decisions regarding re-stocking the inventory or putting a halt on the manufacturing of products. -2. **Determine Fulfillment of Sales Orders :** This query determines the stock of product required in each sales order placed by a customer. It then determines the quantity of available stock of the product in the inventory which is used to realise fulfillment of sales orders. If a sales order can be fulfilled, it tells us the result stock in the inventory after fulfillment of sales order. -3. **Impact of Plant Failure on the Components :** This query gives us the impact that the failure of a plant has. With this query we can keep a track of affected products and inventories and make analysis based on this information. -4. **Top Products in a Sales Order/Purchase Order :** To maintain inventory levels, it is necessary to know the product that is most required and its required quantity. Among all the sales orders, certain products can most popular and important which should also require maintainance of a certain quantity of these products. -5. --- - -## Mock Data - -The `data` folder is populated with sample data files. These files are crafted for testing and demonstration purposes. diff --git a/connected_customer/customer_360/README.md b/connected_customer/customer_360/README.md index 77a98e3b..36e3c97d 100644 --- a/connected_customer/customer_360/README.md +++ b/connected_customer/customer_360/README.md @@ -1,37 +1,384 @@ -# Narratives +# Customer 360 - Financial (TigerGraph Solution Kit) -Customer 360 allows financial institutions to gain a comprehensive view of each customer's interactions, preferences, and needs across all touchpoints. Tigergraph can connect data silos across an institution such as banking transactions, online interactions, customer service inquiries, and social media engagements. The Customer 360 platform provides valuable insights for targeted product offerings. Additionally, it fosters stronger customer relationships, improves retention rates, and ultimately drives profitability through increased customer satisfaction and loyalty. +A TigerGraph solution kit that builds a Customer 360 degree view for financial institutions. -# Components +This project provisions a complete graph environment - schema, sample data, +and GSQL queries - to analyze customer behavior and engagement across accounts, +products, and digital touchpoints. -This repository includes multiple components: +With this kit you can: -- `data` - Sample data. -- `load_jobs` - Scripts for data loading tasks. -- `meta` - Solution Kit metadata. -- `queries` - Collection of GSQL queries. 
-- `schema` - Definition of database schema. -- `readme.md` - This usage guide. -- `setup.sh` - Automated setup script. +- Identify your most engaged customers. +- Find customers who browse products but never apply. +- Spot abandoned applications and the individuals behind them. +- Analyze engagement patterns across loans, credit cards, email, web, and apps. -# Instructions +--- -The `setup.sh` script is designed to streamline the initial setup process by sequentially executing the following steps: +## Contents + +- [Overview](#overview) +- [Features](#features) +- [Prerequisites](#prerequisites) +- [Setup Instructions](#setup-instructions) +- [Query Explanations](#query-explanations) +- [Run Example Queries](#run-example-queries) +- [Using Your Own Data](#using-your-own-data) +- [Resetting the Environment](#resetting-the-environment) + +--- + +## Overview + +Customer_360 is about consolidating all customer interactions - accounts, +transactions, digital engagement, and product holdings - into a single, +connected view. Graph databases like TigerGraph are well-suited for this +because they can traverse deeply connected data (e.g., customers -> sessions -> +actions -> products) in real time. + +This solution kit models a retail banking environment with: + +- **Individuals** and their **Accounts** +- **Products** (Loans and Credit Cards) held by those accounts +- **Sessions** and **Engagement events** (like web search, product browsing, email, + device, and application activity) +- **Contact information** (phone, email, address, and app contact points) + +You can use the included queries as building blocks for marketing, risk, +and analytics use cases, or extend the graph with your own entities and interactions. + +--- + +## Features + +- **Pre-built TigerGraph schema** + Vertices and edges for Individuals, Accounts, Contact Points, Sessions, + and Engagement events, defined in `schema/1_create_schema.gsql`. + +- **Sample Customer 360 dataset** + Mock CSV data for individuals, accounts, and sessions is stored in the data/ + directory for reference and visualization. At runtime, the application + actually loads this data from a public S3 bucket. + +- **End-to-end loading job** + A single loading job in `loading_job/load_data.gsql` that: + - Defines a data source pointing at S3. + - Loads vertices and edges from three CSV files: + - `Session.csv` + - `Account_Info.csv` + - `Individual_Info.csv` + +- **Curated query library** + Example GSQL queries in `queries/` that cover: + - Engagement analysis (top-k engaged individuals, never-engaged customers). + - Application funnel (started vs submitted vs never applied). + - Product interest and email engagement by product. + - Product holdings by account. + +- **Automated setup script** + `setup.sh` creates the graph, loads the data, and installs all queries in one go. + +--- + +## Prerequisites + +Before you run this solution kit, make sure you have: + +- **A running TigerGraph instance** + - TigerGraph installed and running, or use the prebuilt kit on TG cloud. + - You must have permission to create graphs and run GSQL commands. + +- **GSQL client access** + - The `gsql` command-line tool available on the same machine/container where you cloned this repo. + - Ability to connect to your TigerGraph service + +- **Network access for sample data** + - Outbound internet access from the TigerGraph machine to read the sample + CSV files from the public S3 bucket used in `loading_job/load_data.gsql`. 
+ + **Shell environment** + - A Unix-like shell (Linux, macOS, or WSL) to run `setup.sh` and `queries/install_queries.sh`. + - Executable permissions for the scripts: + ```bash + chmod +x setup.sh + chmod +x queries/install_queries.sh + ``` + +> **Graph name:** This kit creates and uses a graph named `Customer_360_Financial`. + +# Setup Instructions +The following instructions assume that you run these scripts in an environment +where the `gsql` command is installed. + +If you don't yet have the `gsql` command available, see the TigerGraph documentation: + +- **Local GSQL shell on the server** + [The GSQL Shell](https://docs.tigergraph.com/tigergraph-server/current/gsql-shell/) + (explains how to run `gsql` directly on a TigerGraph server) + +- **Remote GSQL client (from your laptop or another machine)** + [Using a Remote GSQL Client](https://docs.tigergraph.com/tigergraph-server/current/gsql-shell/using-a-remote-gsql-client) + (explains how to download the GSQL client JAR, configure SSL, and create a `gsql` alias) + +Ensure that the script is executable with: +```bash +chmod +x setup.sh +``` +Then, run the automated script using: +```bash +./setup.sh +``` + +This script will: +* Create the Customer_360_Financial graph and schema by defining all the + vertices, edges, and their attributes with the command: + ```bash + gsql schema/1_create_schema.gsql + ``` + +* Load all the data from TigerGraph's public S3 bucket using the command: + ```bash + gsql loading_job/load_data.gsql + ``` + +* Install all the queries using the shell script at: + ```bash + queries/install_queries.sh + ``` + This script runs through all the queries in the repository and installs them + one by one. + +To check that all the queries were installed successfully, run the following +command: +```bash +gsql -g Customer_360_Financial "SHOW QUERY *" +``` + +You should see entries such as Individuals_K_Most_Engagement, Application_Starts, +Customers_No_Engagement, Email_Engagement_By_Product, and others. -1. **Schema Creation**: Initiates the schema creation process with the `schema/1_create_schema.gsql` script. -2. **Data Loading**: Load data into the schema by running the data loading jobs with the `loading_job/load_data.gsql` scripts. -3. **Query Installation**: Completes the setup by installing necessary queries through the `queries/install_queries.sh` script. ## Query Explanations -We have different queries to perform the following tasks: +This solution kit includes several pre-built queries. Below are the most useful +queries and what you can get from each of them, grouped by business +purpose. +### 1. Engagement Information + + +**Individuals_K_Most_Engagement(k)** +Returns the top *k* most engaged individuals based on all of their session +activities. Use this to quickly identify your most active customers for VIP +programs, targeted campaigns, or deeper segmentation, across every digital +touchpoint captured in the graph. + +--- + +**Customers_With_Sessions** +Returns all individuals who have created at least one session. This effectively +gives you your "digitally active" customer population and can be used to +segment customers into active versus inactive, or to measure adoption of your +digital channels over time. + +--- + +**Customers_No_Engagement** +Returns individuals who have never created a session and therefore have no +recorded digital engagement.
This is a useful starting point for activation +campaigns, outreach, or branch-based follow-ups to encourage adoption of +online or mobile banking services. + +--- + +### 2. Potential Sale Opportunities + +**Individuals_No_Application** +Returns individuals who have shown interest (for example, browsing products or +searching) but have never started an application. This is ideal for identifying +warm leads who may need a nudge, clearer information, or a simplified process +to move from research to application. + +--- + +**Application_Starts** +Returns individuals who have started but not submitted an application. This +represents abandoned applications and is useful for recovery campaigns, UX +diagnostics, and understanding friction points in your onboarding funnel across +products such as loans or credit cards. + +--- + +**Application_Submissions** +Returns individuals who have started and successfully submitted at least one +application. You can use this as a "converted" cohort, analyze their profiles +and behaviors, or compare them against abandoned-application segments to +understand which attributes or journeys correlate with successful completion. + +--- + +### 3. Channel & Product Engagement + +**Individuals_Product_Browse** +Returns individuals who have browsed one or more products during their +sessions. This gives you a population of customers actively looking at your +offerings, which can be sliced further by product type or combined with +holding data to spot cross-sell and up-sell opportunities. + +--- + +**Individual_WebSearch** +Returns individuals who have engaged via web search around your products or +services. This can help you understand which customers are researching more +deeply, evaluate search-driven campaigns, or combine with application data to +see how search behavior correlates with eventual conversion. + +--- + +**Email_Engagement_Accounts** +Returns accounts associated with individuals who have engaged via email. This +lets you move beyond open/click metrics at the individual level and see which +accounts (often households or relationships) are actually interacting with your +email campaigns, supporting more accurate relationship-level marketing and +measurement. + +--- + +**Email_Engagement_By_Product** +Returns email engagement rates broken down by product type, such as loan types +or card categories. You can use this to compare how well email performs across +different products, identify under-engaged product lines, and prioritize where +to adjust messaging, targeting, or channels for better response. This is a good +starting point for product-level channel performance analysis. + +--- + +## Run Example Queries + +After the setup is complete and the data is loaded, you can run example +queries to validate that everything is working correctly. + +These examples assume that the graph name is `Customer_360_Financial` and that +you are running them from a shell where the `gsql` command is available. + +### Example: Top-K Most Engaged Individuals + +Find the top 10 individuals by total engagement (based on their session +actions): + +```bash +gsql -g Customer_360_Financial "RUN QUERY Individuals_K_Most_Engagement(10)" +``` + +## Using Your Own Data + +By default, this solution kit loads **mock data** from a TigerGraph-hosted +public S3 bucket, using the data source and loading job defined in +`loading_job/load_data.gsql`.
The three CSV files it loads are: + +- `Session.csv` +- `Account_Info.csv` +- `Individual_Info.csv` + +If your data lives in cloud storage or another external system +(for example, Amazon S3, Google Cloud Storage, Azure Blob Storage, or a data warehouse), +you can keep the `CREATE DATA_SOURCE` pattern from the sample and adapt it to your environment. + +If your data is stored on the TigerGraph server's local filesystem instead, you can +skip `CREATE DATA_SOURCE` entirely and point `DEFINE FILENAME` directly to your local paths. + +TigerGraph supports multiple loading options. For detailed, up-to-date examples, +please refer to the official documentation: + +- **Data loading overview** - supported sources and general workflow + https://docs.tigergraph.com/tigergraph-server/current/data-loading/data-loading-overview + +- **Data loading entry page** - index of all loading methods (local files, cloud storage, warehouses, Spark, etc.) + https://docs.tigergraph.com/tigergraph-server/current/data-loading/ + +- **Load data from cloud storage** - step-by-step guide for Amazon S3, Google Cloud Storage, and Azure Blob Storage + https://docs.tigergraph.com/tigergraph-server/current/data-loading/load-from-cloud + +At a high level, the steps are: + +1. For cloud or external systems, define a `DATA_SOURCE` object that matches your storage system and credentials. + For local files on the TigerGraph server, you can omit `DATA_SOURCE`. +2. Update the `DEFINE FILENAME` lines in `load_all` so they point to your cloud URIs +3. Keep the `LOAD` blocks the same unless your column structure changes and if it does, +adjust the `VALUES(...)` mappings so each vertex/edge attribute receives the correct column. + +Once those changes are in place, you can reload your data with: + +```bash +gsql loading_job/load_data.gsql +``` + + +## Resetting the Environment + +Sometimes you'll want to wipe the graph data and start fresh - for example, when +switching to a new dataset or after updating the schema and loading logic. + +This repository provides a utility query to delete all data, and a separate +reset script for more advanced workflows. + +> **Warning** +> All of the commands in this section permanently delete data. +> Use them only in development or when you are sure you want to clear the graph. + +### 1. Clear all data using `delete_all.gsql` + +The file `queries/delete_all.gsql` defines a GSQL query named `delete_all()` that +removes all vertices and edges from the `Customer_360_Financial` graph. + +Under the hood it uses the GSQL `DELETE` statement, which cascades deletions: +when a vertex is deleted, all incident edges are removed automatically. + +To use it: + +1. **Ensure the query is installed** + + Normally `queries/install_queries.sh` installs all queries, including + `delete_all`. If you need to (re)install just this one: + + ```bash + gsql -g Customer_360_Financial queries/delete_all.gsql + ``` + +2. **Run the delete query** + +Once `delete_all()` is installed, run it to clear all vertices and edges: + +```bash +gsql -g Customer_360_Financial "RUN QUERY delete_all()" +``` + + +After this completes, the **schema and queries remain**, but **all data** has been +removed. + +--- + +### 2. Full reset using the `reset` script + +If you need a complete reset of the solution kit - not just data, but also the +loading job, data source, queries, and the graph definition - you can use the +`reset/reset.gsql` script. + +This script performs the following operations on `Customer_360_Financial`: -1. 
**Engagement Information**: Find accounts and customers that have the most engagement. There are multiple queries that allow the user to find accounts and individuals with certain engagement patterns. There are also queries to identify accounts with the most engagement and a breakdown of engagement based on the holding of loans and credit cards. +```gsql +USE GRAPH Customer_360_Financial -2. **Find Potential Sale Opportunities**: Find accounts and individuals who have been searching for products or browsing your product offering. Find individuals who have started an application but never finished it and what their account profile looks like. +-- data loading job & data source +DROP JOB load_all +DROP DATA_SOURCE s1 -2. **Find Customer Engagement Patterns**: Find accounts and individuals and identify their engagement patterns and customer's that have never engaged before. +-- clear all installed queries +DROP QUERY ALL -## Mock Data +-- clear schema and data +DROP GRAPH Customer_360_Financial +``` -The `data` folder is populated with sample data files. These files are crafted for testing and demonstration purposes. diff --git a/connected_customer/customer_360/meta/Insights_Customer360.json b/connected_customer/customer_360/meta/Insights_Customer360.json new file mode 100644 index 00000000..3a049b4f --- /dev/null +++ b/connected_customer/customer_360/meta/Insights_Customer360.json @@ -0,0 +1,406 @@ +{ + "defaultGraph": "Customer_360_Financial", + "iconURL": "/insights/static/media/circle-dollar-sign.8d7e49ebacc3cd7e7d503996e0fa61ed.svg", + "id": "3XoL8Yj78BwTk7C1KUSXKs", + "pageConfigSeparated": true, + "pages": [ + { + "chartMap": { + "4hyu4V6Vf7jMG9S7ANWLH3": { + "chartSettings": { + "markdown": "# Customer 360 – Financial Overview\n\nThis dashboard is built on the **Customer_360_Financial** graph from the TigerGraph solution kit. \nIt combines **Individuals, Accounts, Products, Sessions, and Engagement Events** into a single, connected view so you can analyze customer behaviour across digital touchpoints.\n\nUse this page as your **high-level summary** of customer activity and engagement. \nOther pages (Engagement, Funnel & Applications, Segments, Customer Detail) can be added later to drill into specific areas.\n\n---\n\n## What data is this using?\n\nThis application uses the schema and sample data provisioned by the solution kit:\n\n- **Individuals** and their **Accounts**\n- **Products** held by those accounts (Loans, Credit Cards)\n- **Sessions** and **Engagement events**:\n - Web search, product browsing, application-related events\n - Email interactions\n - App / device activity\n- **Contact information**, tied back to individuals and accounts\n\nAll widgets on this page read from the `Customer_360_Financial` graph. \nMost logic is implemented as **interpreted GSQL queries** in the dashboard (e.g. “engaged customers in date range”, “customers with no engagement”, “engaged vs not engaged split”).\n\n---\n\n## How to read this page\n\n### 1. Global Filters (top)\n\nUse the controls at the top of the page to change the scope of most metrics:\n\n- **Date Range** – `start_date` / `end_date` \n Used by the engagement KPIs, the engagement split chart, the trend widget, and the graph.\n\n- **Top-K Customers for Graph** – `top_k_customers` \n Controls how many high-engagement customers are included in the network graph.\n\n> Note: This kit ships with **sample data**, so trends and counts may look sparse. 
\n> In a real environment, the same queries will surface much richer patterns.\n\n---\n\n### 2. KPI Strip – “Who do we have, and who is active?”\n\nThe top row of **KPI tiles** answers the basic health questions:\n\n- **Total Customers** \n Total number of `Individual` vertices in the graph. \n This gives you the size of the customer base represented in the dataset.\n\n- **Engaged Customers (in range)** \n Customers who have at least one **Session** with at least one **action** between `start_date` and `end_date`. \n This is calculated from sessions whose `Created_On` falls within the selected window and have outgoing `\"action\"` edges.\n\n- **Customers with No Engagement** \n Individuals who exist in the graph but **have never had any session with actions**. \n This highlights dormant or unreachable customers who might need re-activation or data-quality review.\n\nUse these numbers together to understand:\n\n- How large the addressable base is.\n- What share of that base is actually active in the selected time window.\n- How much “hidden potential” sits in never-engaged customers.\n\n---\n\n### 3. Engagement Snapshot – “Who is active vs not?”\n\nThis section gives a quick **segmentation of the base**:\n\n- **Engaged vs Not Engaged (Pie Chart)** \n Splits customers into two segments for the selected date range:\n - **Engaged** – at least one session with actions in the window \n - **Not Engaged** – no engaged sessions in the window \n\nThis uses the same definition as the Engaged Customers KPI and is driven by an interpreted query that counts how many Individuals fall into each bucket.\n\nUse this to answer:\n\n- “What fraction of my base was active in this period?”\n- “If I ran a campaign, did the engaged slice grow?”\n\n---\n\n### 4. Activity Trend (Sample Data)\n\n> *Optional / demo:* Because the solution kit uses a small synthetic dataset, \n> activity over time can look sparse or spiky. In a real deployment, this chart becomes much more informative.\n\n- **Engaged Sessions Over Time (Line Chart)** \n Plots engaged sessions over time (e.g. per month) within the selected date range. \n Each point represents the number of sessions that:\n - Occurred between `start_date` and `end_date`, and \n - Have at least one `\"action\"` edge.\n\nUse this as a **sanity check** that queries and filters are wired correctly. \nIn production data, this widget will highlight peaks after campaigns, seasonal patterns, or drops in engagement.\n\n---\n\n### 5. Network Snapshot – “What does engagement look like as a graph?”\n\nThe bottom section shows a small **360° network slice** instead of just numbers:\n\n- **Top Engaged Customers Network (Graph Widget)** \n Shows the ego-network for the most engaged customers in the selected date range:\n - **Individuals**: the top-K customers ranked by engaged session count \n - **Sessions**: sessions (with actions) connected to those customers in the same window\n\nThe widget uses an interpreted query to:\n\n1. Find all engaged sessions within `start_date` / `end_date`. \n2. Aggregate session counts per `Individual`. \n3. Select the top-K customers (`top_k_customers`). \n4. Bring in their related sessions so the graph widget can visualize the connections.\n\nUse this graph to:\n\n- Visually inspect which customers are driving the most digital activity.\n- Explain the value of a connected Customer 360° view in demos.\n- Validate that the schema and engagement edges (`created_session`, `\"action\"`, etc.) 
are wired as expected.\n\n---\n\n## Where to go next\n\nThis Overview page is designed as a **starting point** for analysts:\n\n- Start here to understand **how many customers you have**, **how many are engaged**, and **how that engagement is distributed**.\n- If a number or slice looks unusual (e.g. very high never-engaged count, sudden drop in engaged customers), \n use dedicated pages (to be added) such as:\n - **Engagement** – channel usage, engagement by segment, session-level patterns \n - **Funnel & Applications** – application starts/submissions, drop-off by stage \n - **Segments** – product holdings, engagement tiers, region, or other attributes \n - **Customer Detail** – a focused 360° view for a single Individual\n\nAs you extend the app (e.g. add application KPIs or product splits), you can come back and expand this markdown to describe the new widgets. For now, it reflects the **current depth** of the dashboard you’ve actually built.\n" + }, + "graphName": "Customer_360_Financial", + "hideWidgetName": false, + "id": "4hyu4V6Vf7jMG9S7ANWLH3", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Overview Header & Help", + "type": "markdown", + "version": "1763474010331720264" + }, + "89P8HpShYntxmhLaJgi8sF": { + "chartSettings": {}, + "graphName": "Customer_360_Financial", + "hideWidgetName": false, + "id": "89P8HpShYntxmhLaJgi8sF", + "patternLimit": 5, + "query": "INTERPRET QUERY () FOR GRAPH Customer_360_Financial {\n\n // All individuals in the graph\n all_people = {Individual.*};\n\n // All sessions that have at least one action\n all_sessions = {SessionID.*};\n\n engaged_sessions = SELECT s\n FROM all_sessions:s\n WHERE s.outdegree(\"action\") > 0;\n\n // All individuals who *do* have at least one engaged session\n engaged_customers = SELECT i\n FROM all_people:i -(created_session>:cs)- engaged_sessions:s;\n\n // Now compute the set difference: all_people - engaged_customers\n never_engaged = all_people MINUS engaged_customers;\n\n // KPI: number of never-engaged customers\n PRINT never_engaged.size() AS customers_with_no_engagement;\n}\n", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Customers With No Engagement", + "type": "value", + "version": "1763462170883246644" + }, + "8KvjFYQviXDrcQhcr7Y1kZ": { + "chartSettings": {}, + "graphName": "Customer_360_Financial", + "hideWidgetName": false, + "id": "8KvjFYQviXDrcQhcr7Y1kZ", + "patternLimit": 5, + "query": "INTERPRET QUERY (\n DATETIME start_date,\n DATETIME end_date,\n INT top_k\n)\nFOR GRAPH Customer_360_Financial SYNTAX V2 {\n\n // Count sessions per individual\n SumAccum @sess_ct;\n\n // Base sets\n all_people = {Individual.*};\n all_sessions = {SessionID.*};\n\n // 1) Sessions in the window with at least one action\n engaged_sessions = SELECT s\n FROM all_sessions:s\n WHERE s.Created_On >= start_date\n AND s.Created_On < end_date\n AND s.outdegree(\"action\") > 0;\n\n // 2) Individuals who own those engaged sessions, with session counts\n engaged_customers = SELECT i\n FROM all_people:i -(created_session>:cs)- engaged_sessions:s\n ACCUM i.@sess_ct += 1;\n\n // 3) Top-K engaged customers by session count\n top_customers = SELECT i\n FROM engaged_customers:i\n ORDER BY i.@sess_ct DESC\n LIMIT top_k;\n\n // 4) Sessions for those top customers in the same window\n top_sessions = SELECT s\n FROM top_customers:i 
-(created_session>:cs)- engaged_sessions:s;\n\n // 5) Output vertex sets for the graph widget\n PRINT top_customers;\n PRINT top_sessions;\n}\n", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Top K Engaged Customers", + "type": "internal-graph", + "version": "1763473737976082290" + }, + "inoZB26qaA23sNaQdAHUHT": { + "chartSettings": { + "category": [ + { + "id": "segment", + "type": "string" + } + ], + "tableHeaders": [ + "segment", + "value" + ], + "tableIndex": 0, + "value": [ + { + "id": "value", + "type": "number" + } + ] + }, + "graphName": "Customer_360_Financial", + "hideWidgetName": false, + "id": "inoZB26qaA23sNaQdAHUHT", + "patternLimit": 5, + "query": "INTERPRET QUERY (\n DATETIME start_date,\n DATETIME end_date\n)\nFOR GRAPH Customer_360_Financial SYNTAX V2 {\n\n // Each record is: { segment: \"Engaged\"/\"Not Engaged\", value: count }\n TYPEDEF TUPLE SegmentRec;\n BagAccum @@segments;\n\n // Global counters\n SumAccum @@engaged_count;\n SumAccum @@not_engaged_count;\n\n // Base sets\n all_people = {Individual.*};\n all_sessions = {SessionID.*};\n\n // 1) Sessions in the window with at least one action\n engaged_sessions = SELECT s\n FROM all_sessions:s\n WHERE s.Created_On >= start_date\n AND s.Created_On < end_date\n AND s.outdegree(\"action\") > 0;\n\n // 2) Individuals who have at least one such session\n engaged_customers = SELECT i\n FROM all_people:i -(created_session>:cs)- engaged_sessions:s;\n\n // 3) Count engaged customers\n count_engaged = SELECT i\n FROM engaged_customers:i\n ACCUM @@engaged_count += 1;\n\n // 4) Individuals who are NOT engaged in this window\n not_engaged_customers = all_people MINUS engaged_customers;\n\n // 5) Count not-engaged customers\n count_not = SELECT i\n FROM not_engaged_customers:i\n ACCUM @@not_engaged_count += 1;\n\n // 6) Build the two segment records (no ACCUM, just body-level assignments)\n @@segments += SegmentRec(\"Engaged\", @@engaged_count);\n @@segments += SegmentRec(\"Not Engaged\", @@not_engaged_count);\n\n // 7) Output for the pie chart\n PRINT @@segments;\n}\n", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "New Widget", + "type": "pie", + "version": "1763462945105139904" + }, + "rnTgCXrPVVi7XiTUoWMtgu": { + "chartSettings": { + "category": [ + { + "id": "day", + "type": "string" + } + ], + "tableHeaders": [ + "day", + "sessions" + ], + "tableIndex": 0, + "value": [ + { + "id": "sessions", + "type": "number" + } + ], + "valueAggregation": [ + { + "id": "SUM" + } + ] + }, + "graphName": "Customer_360_Financial", + "hideWidgetName": false, + "id": "rnTgCXrPVVi7XiTUoWMtgu", + "patternLimit": 5, + "query": "INTERPRET QUERY (\n DATETIME start_date,\n DATETIME end_date\n)\nFOR GRAPH Customer_360_Financial SYNTAX V2 {\n\n // Each record: { day: \"YYYY-MM-DD\", sessions: 1 }\n TYPEDEF TUPLE DayRec;\n BagAccum @@rows;\n\n all_sessions = {SessionID.*};\n\n // Sessions in the window with at least one action\n engaged_sessions = SELECT s\n FROM all_sessions:s\n WHERE s.Created_On >= start_date\n AND s.Created_On < end_date\n AND s.outdegree(\"action\") > 0\n ACCUM\n // One row per engaged session, bucketed by day string\n @@rows += DayRec(SUBSTR(to_string(s.Created_On), 0, 10), 1);\n\n PRINT @@rows;\n}\n", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Engaged 
Trend", + "type": "line", + "version": "1763472119725255424" + }, + "uwSbY4oNEapyvoWWxeRhPq": { + "chartSettings": {}, + "graphName": "Customer_360_Financial", + "hideWidgetName": false, + "id": "uwSbY4oNEapyvoWWxeRhPq", + "memoryLimit": 0, + "patternLimit": 5, + "query": "INTERPRET QUERY() FOR GRAPH Customer_360_Financial {\n \n // Get all customer vertices\n all_customers = {Individual.*};\n\n // Print the total number of customers\n PRINT all_customers.size();\n}", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Total Customers", + "type": "value", + "version": "1763456053754105891" + }, + "v5PQtWshtbjLcMV6KkKsHP": { + "chartSettings": { + "inputStates": [ + { + "dataType": "datetime", + "id": "input_1pVKxX1oUujfsHEQgTF8vD", + "label": "Start Date", + "name": "start_date", + "placeholder": "YYYY-MM-DD", + "settings": {}, + "widgetType": "Input" + }, + { + "dataType": "datetime", + "id": "input_hVV5pmWdGHS6j8yxWDU9eB", + "label": "End Date", + "name": "end_date", + "placeholder": "YYYY-MM-DD", + "settings": {}, + "widgetType": "Input" + }, + { + "dataType": "string", + "id": "input_hoh3tu6PvjkG5oiBY84xyN", + "label": "Product", + "name": "product_filter", + "placeholder": "ALL / LOAN / CARD", + "settings": {}, + "widgetType": "Input" + }, + { + "dataType": "number", + "id": "input_rtqpRDDoTENRA4c45MzfEN", + "label": "Minimum Engagement", + "name": "min_engagement", + "placeholder": "e.g. 1", + "settings": {}, + "widgetType": "Input" + }, + { + "dataType": "number", + "id": "input_sqzUrnDcy26dhpx3CyijfS", + "label": "Top K Customers", + "name": "top_k", + "settings": {}, + "widgetType": "Input" + } + ] + }, + "graphName": "Customer_360_Financial", + "hideWidgetName": false, + "id": "v5PQtWshtbjLcMV6KkKsHP", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Filter", + "type": "Inputs", + "version": "1763473500047112960" + }, + "xv7HDXoJDMtoCc2s3g5B7S": { + "chartSettings": {}, + "graphName": "Customer_360_Financial", + "hideWidgetName": false, + "id": "xv7HDXoJDMtoCc2s3g5B7S", + "patternLimit": 5, + "query": "INTERPRET QUERY (\n DATETIME start_date,\n DATETIME end_date\n)\nFOR GRAPH Customer_360_Financial {\n\n SumAccum @@total_sessions;\n SumAccum @@total_customers;\n\n SumAccum @sess_ct;\n MaxAccum @last_seen;\n\n all_people = {Individual.*};\n all_sessions = {SessionID.*};\n\n engaged_sessions = SELECT s\n FROM all_sessions:s\n WHERE s.Created_On >= start_date\n AND s.Created_On < end_date\n AND s.outdegree(\"action\") > 0\n ACCUM @@total_sessions += 1;\n\n engaged_customers = SELECT i\n FROM all_people:i -(created_session>:cs)- engaged_sessions:s\n ACCUM i.@sess_ct += 1,\n i.@last_seen += s.Created_On;\n\n counted_customers = SELECT i\n FROM engaged_customers:i\n ACCUM @@total_customers += 1;\n\n FLOAT avg_sessions_per_customer = 0.0;\n IF @@total_customers != 0 THEN\n avg_sessions_per_customer =\n toFloat(@@total_sessions) / toFloat(@@total_customers);\n END;\n\n // Only print the KPI row for this widget\n PRINT @@total_customers AS total_engaged_customers;\n}\n", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Engaged Customers", + "type": "value", + "version": "1763461647398691173" + } + }, + "globalParameters": { + "end_date": { + "id": "input_hVV5pmWdGHS6j8yxWDU9eB", + "name": 
"end_date", + "type": "DATETIME", + "value": "2025-11-01 00:00:00" + }, + "min_engagement": { + "id": "input_rtqpRDDoTENRA4c45MzfEN", + "name": "min_engagement", + "type": "NUMBER", + "value": 1 + }, + "product_filter": { + "id": "input_hoh3tu6PvjkG5oiBY84xyN", + "name": "product_filter", + "type": "STRING", + "value": "ALL" + }, + "start_date": { + "id": "input_1pVKxX1oUujfsHEQgTF8vD", + "name": "start_date", + "type": "DATETIME", + "value": "2020-08-01 00:00:00" + }, + "top_k": { + "id": "input_sqzUrnDcy26dhpx3CyijfS", + "name": "top_k", + "type": "NUMBER", + "value": 20 + } + }, + "iconURL": "/insights/static/media/library.d3f7f207c6bb1d7be8e64045a19991b2.svg", + "id": "ukVwHvhc7JkdnKrxEUuViw", + "isDetail": true, + "isNew": false, + "layouts": { + "md": [ + { + "h": 15, + "i": "4hyu4V6Vf7jMG9S7ANWLH3", + "moved": false, + "static": false, + "w": 2, + "x": 10, + "y": 0 + }, + { + "h": 5, + "i": "89P8HpShYntxmhLaJgi8sF", + "moved": false, + "static": false, + "w": 2, + "x": 4, + "y": 0 + }, + { + "h": 12, + "i": "8KvjFYQviXDrcQhcr7Y1kZ", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 15 + }, + { + "h": 10, + "i": "inoZB26qaA23sNaQdAHUHT", + "moved": false, + "static": false, + "w": 2, + "x": 8, + "y": 5 + }, + { + "h": 12, + "i": "rnTgCXrPVVi7XiTUoWMtgu", + "moved": false, + "static": false, + "w": 4, + "x": 4, + "y": 5 + }, + { + "h": 5, + "i": "uwSbY4oNEapyvoWWxeRhPq", + "moved": false, + "static": false, + "w": 2, + "x": 8, + "y": 0 + }, + { + "h": 15, + "i": "v5PQtWshtbjLcMV6KkKsHP", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 0 + }, + { + "h": 5, + "i": "xv7HDXoJDMtoCc2s3g5B7S", + "moved": false, + "static": false, + "w": 2, + "x": 6, + "y": 0 + } + ], + "xs": [ + { + "h": 15, + "i": "v5PQtWshtbjLcMV6KkKsHP", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 0 + }, + { + "h": 5, + "i": "89P8HpShYntxmhLaJgi8sF", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 15 + }, + { + "h": 5, + "i": "xv7HDXoJDMtoCc2s3g5B7S", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 20 + }, + { + "h": 5, + "i": "uwSbY4oNEapyvoWWxeRhPq", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 25 + }, + { + "h": 15, + "i": "4hyu4V6Vf7jMG9S7ANWLH3", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 30 + }, + { + "h": 12, + "i": "rnTgCXrPVVi7XiTUoWMtgu", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 45 + }, + { + "h": 10, + "i": "inoZB26qaA23sNaQdAHUHT", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 57 + } + ] + }, + "title": "Overview", + "version": "1763473738844877827", + "weight": 10 + } + ], + "title": "Customer_360_Financial", + "userRoleForApp": "owner", + "version": "1763451431356995546" +} \ No newline at end of file diff --git a/connected_customer/customer_360/queries/Customers_With_Product.gsql b/connected_customer/customer_360/queries/Accounts_With_Product.gsql similarity index 57% rename from connected_customer/customer_360/queries/Customers_With_Product.gsql rename to connected_customer/customer_360/queries/Accounts_With_Product.gsql index 8714ea4b..7201baff 100644 --- a/connected_customer/customer_360/queries/Customers_With_Product.gsql +++ b/connected_customer/customer_360/queries/Accounts_With_Product.gsql @@ -1,4 +1,18 @@ CREATE DISTRIBUTED QUERY Accounts_With_Product(STRING product_type) FOR GRAPH Customer_360_Financial { + /* + Description: + Returns Accounts that are associated with either a + credit card or loan products linked via has_product edges. 
+ + Parameters: + product_type (STRING): + The product category/type to match, e.g. a CardCategory for + CreditCard or a LoanType for Loan. + + Output: + @@accounts: + Set of Account vertices that hold the specified product_type. + */ SetAccum @@accounts; start = {Account.*}; @@ -17,4 +31,4 @@ CREATE DISTRIBUTED QUERY Accounts_With_Product(STRING product_type) FOR GRAPH Cu } UPDATE DESCRIPTION OF QUERY Accounts_With_Product "Returns Accounts that hold the specified product type" -UPDATE DESCRIPTION OF QUERY_PARAM Accounts_With_Product.product_type "The type of product held by returned accounts" \ No newline at end of file +UPDATE DESCRIPTION OF QUERY_PARAM Accounts_With_Product.product_type "The type of product held by returned accounts" diff --git a/connected_customer/customer_360/queries/Application_Engagement_Individuals.gsql b/connected_customer/customer_360/queries/Application_Engagement_Individuals.gsql index 69cc8338..b9620071 100644 --- a/connected_customer/customer_360/queries/Application_Engagement_Individuals.gsql +++ b/connected_customer/customer_360/queries/Application_Engagement_Individuals.gsql @@ -1,5 +1,17 @@ CREATE DISTRIBUTED QUERY Application_Engagement_Individuals(/* Parameters here */) FOR GRAPH Customer_360_Financial { - start = {SessionID.*}; +/* + Description: + Returns individuals who have engaged with a product application via + at least one session + + Parameters: + (None) + Uses all SessionID vertices; + + Output: + Set of Individual vertices with at least one application engagement. + */ +start = {SessionID.*}; app_sessions = SELECT s FROM start:s - (action:e) - ApplicationEngagement:t; individuals = SELECT t FROM app_sessions:s - (created_by:e) - Individual:t; diff --git a/connected_customer/customer_360/queries/Application_Starts.gsql b/connected_customer/customer_360/queries/Application_Starts.gsql index 9485e119..22c20076 100644 --- a/connected_customer/customer_360/queries/Application_Starts.gsql +++ b/connected_customer/customer_360/queries/Application_Starts.gsql @@ -1,5 +1,19 @@ CREATE DISTRIBUTED QUERY Application_Starts(/* Parameters here */) FOR GRAPH Customer_360_Financial { - start = {SessionID.*}; + /* + Description: + Returns Individuals who started an application but did not submit it, + by comparing sessions with "start_application" vs "submit_application" + actions. + + Parameters: + (None) + Uses all SessionID vertices; + + Output: + unfinished_individuals: Individuals with started but unfinished + application sessions. 
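+ + Example invocation (a sketch, assuming the query has been installed): + gsql -g Customer_360_Financial 'RUN QUERY Application_Starts()'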
+ */ + start = {SessionID.*}; SetAccum @@app_starts, @@app_submits, @@unfinished; app_starts = SELECT s FROM start:s - (action:e) - ApplicationEngagement:t diff --git a/connected_customer/customer_360/queries/Application_Submissions.gsql b/connected_customer/customer_360/queries/Application_Submissions.gsql index fc727764..d2fce709 100644 --- a/connected_customer/customer_360/queries/Application_Submissions.gsql +++ b/connected_customer/customer_360/queries/Application_Submissions.gsql @@ -1,4 +1,14 @@ CREATE DISTRIBUTED QUERY Application_Submissions(/* Parameters here */) FOR GRAPH Customer_360_Financial { +/* + Description: + Returns Individuals who started and submitted an application, + + Parameters: + (None) + Uses all SessionID vertices + Output: + The individuals who submitted applications, and those session ids + */ start = {SessionID.*}; SetAccum @@app_starts, @@app_submits, @@finished; @@ -7,7 +17,7 @@ CREATE DISTRIBUTED QUERY Application_Submissions(/* Parameters here */) FOR GRAP ACCUM @@app_starts += s; app_submits = SELECT s FROM start:s - (action:e) - ApplicationEngagement:t - WHERE e.action_detail == "submit_application" + WHERE e.action_detail == "submitted_application" ACCUM @@app_submits += s; @@finished = @@app_starts INTERSECT @@app_submits; @@ -21,4 +31,4 @@ CREATE DISTRIBUTED QUERY Application_Submissions(/* Parameters here */) FOR GRAP PRINT finished_individuals; } -UPDATE DESCRIPTION OF QUERY Application_Submissions "Returns all individuals who finished an application and submitted one" \ No newline at end of file +UPDATE DESCRIPTION OF QUERY Application_Submissions "Returns all individuals who finished an application and submitted one" diff --git a/connected_customer/customer_360/queries/Customers_With_Sessions.gsql b/connected_customer/customer_360/queries/Customers_With_Sessions.gsql index 99d71bfa..465f413d 100644 --- a/connected_customer/customer_360/queries/Customers_With_Sessions.gsql +++ b/connected_customer/customer_360/queries/Customers_With_Sessions.gsql @@ -1,8 +1,18 @@ CREATE DISTRIBUTED QUERY Customers_With_Sessions(/* Parameters here */) FOR GRAPH Customer_360_Financial { - + /* + Description: + Returns Individuals who have created at least one session. + + Parameters: + (None) + Uses all Individual vertices in the graph. + + Output: + Individual vertices that are connected to at least one SessionID + */ start = {Individual.*}; engaged = SELECT s FROM start:s - (created_session:e) - SessionID:t; PRINT engaged; } -UPDATE DESCRIPTION OF QUERY Customers_With_Sessions "Returns all individuals who have created a session where they have engagement" \ No newline at end of file +UPDATE DESCRIPTION OF QUERY Customers_With_Sessions "Returns all individuals who have created a session where they have engagement" diff --git a/connected_customer/customer_360/queries/Email_Engagement_Accounts.gsql b/connected_customer/customer_360/queries/Email_Engagement_Accounts.gsql index ef675107..1958ef38 100644 --- a/connected_customer/customer_360/queries/Email_Engagement_Accounts.gsql +++ b/connected_customer/customer_360/queries/Email_Engagement_Accounts.gsql @@ -1,5 +1,15 @@ CREATE DISTRIBUTED QUERY Email_Engagement_Accounts(/* Parameters here */) FOR GRAPH Customer_360_Financial { - + /* + Description: + Returns accounts whose representing individuals have an EmailEngagement. + + Parameters: + (None) + Uses all SessionID vertices in the graph. 
+
+  Output:
+    Account vertices linked to an EmailEngagement through a session.
+  */
   start = {SessionID.*};
   email_sessions = SELECT s FROM start:s - (action:e) - EmailEngagement:t;
@@ -10,4 +20,4 @@ CREATE DISTRIBUTED QUERY Email_Engagement_Accounts(/* Parameters here */) FOR GR
   PRINT accounts;
 }
-UPDATE DESCRIPTION OF QUERY Email_Engagement_Accounts "Returns accounts that have had email engagement"
\ No newline at end of file
+UPDATE DESCRIPTION OF QUERY Email_Engagement_Accounts "Returns accounts that have had email engagement"
diff --git a/connected_customer/customer_360/queries/Email_Engagement_By_Product.gsql b/connected_customer/customer_360/queries/Email_Engagement_By_Product.gsql
index 1fbcccf4..f130ff8a 100644
--- a/connected_customer/customer_360/queries/Email_Engagement_By_Product.gsql
+++ b/connected_customer/customer_360/queries/Email_Engagement_By_Product.gsql
@@ -1,4 +1,24 @@
 CREATE DISTRIBUTED QUERY Email_Engagement_By_Product(/* Parameters here */) FOR GRAPH Customer_360_Financial {
+  /*
+  Description:
+    Calculates email engagement rates by product type, based on the share
+    of products held by accounts that have email engagement.
+
+  Parameters:
+    (None)
+    Uses all SessionID vertices, EmailEngagement events, and products
+    linked to engaged accounts (CreditCard / Loan).
+
+  Output:
+    @@account_prod_total:
+      Map of product_type -> number of such products held by
+      email-engaged accounts.
+    @@product_total:
+      Map of product_type -> total number of such products.
+    @@engagement_rate:
+      Map of product_type -> engagement rate
+      (email-engaged products / total products), rounded to 3 decimals.
+  */
   # Get All Email Engagement Accounts
   start = {SessionID.*};
@@ -40,4 +60,4 @@ CREATE DISTRIBUTED QUERY Email_Engagement_By_Product(/* Parameters here */) FOR
   PRINT @@engagement_rate;
 }
-UPDATE DESCRIPTION OF QUERY Email_Engagement_By_Product "Returns email engagement percentage broken down by product type"
\ No newline at end of file
+UPDATE DESCRIPTION OF QUERY Email_Engagement_By_Product "Returns email engagement percentage broken down by product type"
diff --git a/connected_customer/customer_360/queries/Individual_WebSearch.gsql b/connected_customer/customer_360/queries/Individual_WebSearch.gsql
index 6c57c713..90fefccd 100644
--- a/connected_customer/customer_360/queries/Individual_WebSearch.gsql
+++ b/connected_customer/customer_360/queries/Individual_WebSearch.gsql
@@ -1,4 +1,16 @@
 CREATE DISTRIBUTED QUERY Individual_WebSearch(/* Parameters here */) FOR GRAPH Customer_360_Financial {
+  /*
+  Description:
+    Returns Individuals who have performed a web search activity.
+
+  Parameters:
+    (None)
+    Uses all SessionID vertices in the graph.
+
+  Output:
+    individuals:
+      Individual vertices that have at least one WebSearch session.
+  */
   start = {SessionID.*};
   web_sessions = SELECT s FROM start:s - (action:e) - WebSearch:t;
diff --git a/connected_customer/customer_360/queries/Individuals_K_Most_Engagement.gsql b/connected_customer/customer_360/queries/Individuals_K_Most_Engagement.gsql
index 94ccb4ec..98426718 100644
--- a/connected_customer/customer_360/queries/Individuals_K_Most_Engagement.gsql
+++ b/connected_customer/customer_360/queries/Individuals_K_Most_Engagement.gsql
@@ -1,4 +1,17 @@
 CREATE DISTRIBUTED QUERY Individuals_K_Most_Engagement(INT k = 10) FOR GRAPH Customer_360_Financial {
+  /*
+  Description:
+    Returns the top-k Individuals with the highest total engagement across
+    events.
+
+  Parameters:
+    k (INT, default = 10):
+      Number of Individuals to return, ranked by total engagements.
+
+  Output:
+    individuals:
+      Top-k Individual vertices with the highest total engagement.
+*/
   MaxAccum<INT> @engagements;
   sessions = {SessionID.*};
diff --git a/connected_customer/customer_360/queries/Individuals_No_Application.gsql b/connected_customer/customer_360/queries/Individuals_No_Application.gsql
index a2a302d7..ff49fa8a 100644
--- a/connected_customer/customer_360/queries/Individuals_No_Application.gsql
+++ b/connected_customer/customer_360/queries/Individuals_No_Application.gsql
@@ -1,15 +1,22 @@
-CREATE DISTRIBUTED QUERY Individuals_No_Application(/* Parameters here */) FOR GRAPH Customer_360_Financial {
-  /* Find individuals who looked at a product but never started an application */
-  sessions = {SessionID.*};
-
-  # Sessions with ProductBrowse or Websearch
-
-  interest = SELECT s FROM sessions:s - (action:e) - :t
-    WHERE t.type != "ApplicationEngagement";
-
-  individuals = SELECT t FROM interest:s - (created_by:e) - Individual:t;
-
-  PRINT individuals;
+CREATE OR REPLACE DISTRIBUTED QUERY Individuals_No_Application()
+FOR GRAPH Customer_360_Financial SYNTAX V1 {
+  // Find individuals who never started an application
+
+  all_people = {Individual.*};
+
+  // Any session that has ApplicationEngagement
+  app_sessions = SELECT s
+                 FROM SessionID:s -(action:e1)- ApplicationEngagement:a;
+
+  // Individuals who have at least one such session
+  applied_people = SELECT i
+                   FROM app_sessions:s -(created_by:e2)-> Individual:i;
+
+  // People with no application sessions
+  result = all_people MINUS applied_people;
+
+  PRINT result;
 }
-UPDATE DESCRIPTION OF QUERY Individuals_No_Application "Return individuals who have never started an application but have looked at products"
+
+UPDATE DESCRIPTION OF QUERY Individuals_No_Application "Return individuals who have not made an application"
diff --git a/connected_customer/customer_360/queries/Individuals_Product_Browse.gsql b/connected_customer/customer_360/queries/Individuals_Product_Browse.gsql
index 75a6380e..886b2a81 100644
--- a/connected_customer/customer_360/queries/Individuals_Product_Browse.gsql
+++ b/connected_customer/customer_360/queries/Individuals_Product_Browse.gsql
@@ -1,4 +1,13 @@
 CREATE DISTRIBUTED QUERY Individuals_Product_Browse(/* Parameters here */) FOR GRAPH Customer_360_Financial {
+  /*
+  Description:
+    Returns Individuals who have browsed products via ProductBrowse.
+
+  Parameters:
+    (None)
+    Uses all SessionID vertices in the graph.
+ + */ start = {SessionID.*}; prod_sessions = SELECT s FROM start:s - (action:e) - ProductBrowse:t; @@ -7,4 +16,4 @@ CREATE DISTRIBUTED QUERY Individuals_Product_Browse(/* Parameters here */) FOR G PRINT individuals; } -UPDATE DESCRIPTION OF QUERY Individuals_Product_Browse "Return individuals who have looked at products" \ No newline at end of file +UPDATE DESCRIPTION OF QUERY Individuals_Product_Browse "Return individuals who have looked at products" diff --git a/connected_customer/customer_360/queries/delete_all.gsql b/connected_customer/customer_360/queries/delete_all.gsql index 9a390510..ad2158d1 100644 --- a/connected_customer/customer_360/queries/delete_all.gsql +++ b/connected_customer/customer_360/queries/delete_all.gsql @@ -1,4 +1,9 @@ CREATE DISTRIBUTED QUERY delete_all(/* Parameters here */) FOR GRAPH Customer_360_Financial{ + /* + Description: + Deletes all vertices and edges from the Customer_360_Financial graph, + effectively clearing the graph. + */ all = {ANY}; results = SELECT a FROM all:a - (:e) - ANY diff --git a/connected_customer/entity_resolution/README.md b/connected_customer/entity_resolution/README.md new file mode 100644 index 00000000..ce22a5f6 --- /dev/null +++ b/connected_customer/entity_resolution/README.md @@ -0,0 +1,341 @@ +# Entity Resolution - Connected Customer (TigerGraph Solution Kit) + +# Narratives + +Entity Resolution is the cornerstone of our approach to handling the vast array +of customer data in today's interconnected digital landscape. Specifically +tailored to the **Connected Customer** use case, this TigerGraph solution kit +excels at accurately identifying and linking entities across disparate +datasets, focusing on **Personally Identifiable Information (PII)** such as +email addresses, phone numbers, and physical addresses. By seamlessly +integrating PII from multiple sources, businesses can create a **unified view +of each customer**. This comprehensive understanding enables personalized +marketing campaigns, tailored product recommendations, and optimized customer +service interactions, fostering stronger customer relationships and driving +business success. + +Under the hood, the kit provisions a complete graph environment to **resolve, connect, and unify customer +records** at scale. Entities are grouped using a combination of deterministic +matching (shared PII) and fuzzy matching (MinHash + similarity scoring), then +organized into connected components that represent real-world customers. + +With this kit you can: + +- Build unified, cross-system customer profiles using shared and fuzzy-matched PII. +- Detect and link duplicate or overlapping customer records across data sources. +- Form connected components of matching entities for downstream analytics and activation. +- Export resolved entity clusters for use in MDM, CDP, or downstream data pipelines. + +## Contents + +- [Overview](#overview) +- [Components](#components) +- [Prerequisites](#prerequisites) +- [Setup Instructions](#setup-instructions) +- [Query Execution Order and Explanations](#query-execution-order-and-explanations) +- [Scalability](#scalability) +- [Mock Data](#mock-data) + +--- + +## Overview + +Modern customer data is fragmented across CRM systems, product platforms, +billing systems, marketing tools, and more. The same person may appear under +slightly different names, email addresses, phone numbers, or addresses. 
+
+This solution kit demonstrates how to use **TigerGraph** to resolve those
+fragments into unified entities:
+
+- **Entity** represents a person or customer instance as it appears in source data.
+- **PII vertices** (Email, Phone, Name, Address components, Tax ID, etc.) represent
+  individual attributes used for matching.
+- **Same_As** edges connect Entities that appear to represent the _same real-world individual_.
+- **Connected_Component** vertices represent final unified clusters of Entities,
+  each cluster being a resolved "real-world entity".
+
+The kit includes both:
+
+- A **deterministic** matching layer using shared PII.
+- A **fuzzy** matching layer using **MinHash** and **Jaro-Winkler** similarity on
+  name, email, and phone.
+
+You can:
+
+- Run the full entity resolution workflow on sample data.
+- Tune matching thresholds and weights.
+- Re-run incremental matching as new Entities arrive.
+- Export results to CSV for external consumption.
+
+---
+# Components
+
+This repository includes multiple components:
+
+- `data` - Sample data.
+- `load_jobs` - Scripts for data loading tasks.
+- `meta` - Solution Kit metadata.
+- `queries` - Collection of GSQL queries.
+- `schema` - Definition of database schema.
+- `readme.md` - This usage guide.
+- `setup.sh` - Automated setup script.
+
+---
+
+## Prerequisites
+
+Before you run this solution kit, make sure you have:
+
+- **A running TigerGraph instance**
+  - TigerGraph installed and running, or use the prebuilt kit on TG cloud.
+  - You must have permission to create graphs and run GSQL commands.
+
+- **GSQL client access**
+  - The `gsql` command-line tool available on the same machine/container where you cloned this repo.
+  - Ability to connect to your TigerGraph service.
+
+- **Network access for sample data**
+  - Outbound internet access from the TigerGraph machine to read the sample
+    CSV files from the public S3 bucket used in `loading_job/load_data.gsql`.
+
+- **Shell environment**
+  - A Unix-like shell (Linux, macOS, or WSL) to run `setup.sh` and `queries/install_queries.sh`.
+  - Executable permissions for the scripts:
+    ```bash
+    chmod +x setup.sh
+    chmod +x queries/install_queries.sh
+    ```
+
+> **Graph name:** This kit creates and uses a graph named `Entity_Resolution`.
+
+# Setup Instructions
+The following instructions assume that you run these scripts on a machine
+where the `gsql` command is installed.
+
+If you don't yet have the `gsql` command available, see the TigerGraph documentation:
+
+- **Local GSQL shell on the server**
+  [The GSQL Shell](https://docs.tigergraph.com/tigergraph-server/current/gsql-shell/)
+  (explains how to run `gsql` directly on a TigerGraph server)
+
+- **Remote GSQL client (from your laptop or another machine)**
+  [Using a Remote GSQL Client](https://docs.tigergraph.com/tigergraph-server/current/gsql-shell/using-a-remote-gsql-client)
+  (explains how to download the GSQL client JAR, configure SSL, and create a `gsql` alias)
+
+Ensure that the script is executable with:
+```bash
+chmod +x setup.sh
+```
+Then, run the automated script using:
+```bash
+./setup.sh
+```
+
+The `setup.sh` script is designed to streamline the initial setup process by sequentially executing the following steps:
+
+1. **Schema Creation**: Initiates the schema creation process with the `schema/add_to_er_graph.gsql` script.
+2. **Data Loading**: Loads data into the schema by running the data loading jobs with the `loading_job/load_data_to_er_graph.gsql` and `loading_job/load_hash_to_er_graph.gsql` scripts.
+3. **Query Installation**: Completes the setup by installing necessary queries through the `queries/install_queries.sh` script.
+
+## Query Execution Order and Explanations
+
+### Step 1: Delete Edges of Connected Components
+
+**Note**: This step **can be skipped** if running the Entity Resolution process for **the first time** on a graph.
+
+Run the `delete_all_cc_connections` query to delete the `Entity_In_Ring` edges on the graph.
+
+The default batching parameters allow the user to execute this query in one batch:
+
+- INT num_of_batches = 1
+- INT batch_id = 0
+
+By setting these parameters differently, the user can execute this query in multiple batches.
+
+To execute this process in 50 hard batches, we need to run this query 50 times
+while incrementing the `batch_id` value each time.
+In that case, we'll set the value of `num_of_batches` to `50` on each call, but
+the value of `batch_id` will initially be `0` and we'll increment it each run
+until `49`.
+Instead of running this query 50 times manually from the UI, we can run this
+query 50 times as a REST endpoint from a Linux shell:
+
+```bash
+for i in {0..49};do
+  curl -X GET "http://:9000/query/ER_2023/delete_all_cc_connections?num_of_batches=50&batch_id=${i}";
+done;
+```
+
+**Note**: Calling this query multiple times and deleting data in hard
+batches lets us delete data in parallel, which makes the deletion process
+run more smoothly and substantially faster at high scale.
+
+### Step 2: Insert Edges between Matching Entities
+
+After loading historical Entity and PII data and configuring WCC weights for
+matching, run `match_entities` to perform matching on the entire graph.
+
+The `match_entities` query creates `Same_As` edges (similarity edges) between
+Entity vertices that match.
+
+This query has weights for each PII attribute and threshold parameters used by
+the weighted WCC matching process:
+
+- FLOAT customer_has_birthdate_weight = 0.5
+- FLOAT customer_has_email_address_weight = 0.5
+- FLOAT customer_has_name_weight = 0.2
+- FLOAT customer_has_phone_weight = 0.5
+- FLOAT customer_has_std_city_weight = 0.2
+- FLOAT customer_has_std_postcode_weight = 0.2
+- FLOAT customer_has_std_state_weight = 0.2
+- FLOAT customer_has_std_street_address_weight = 0.2
+- FLOAT customer_has_tax_id_number_weight = 0.5
+- FLOAT customer_has_source_customer_id_weight = 0.5
+- FLOAT threshold = 1.0
+
+When an Entity matches another Entity through shared PII attributes, the
+respective weights accumulate, and any accumulated weight equal to or over the
+threshold is considered a match. For example, if a phone number has a weight of
+1.0 and the threshold is 1.0, then any single matching phone between any Entities
+will cause those respective Entities to be put into the same Connected_Component
+community. If the phone number weight is 0.5 and the threshold is 1.0, then at
+least two shared phones between any Entities are required for a match, and so on.
+Default weights and thresholds are assigned to these queries but should be
+adapted to whatever business rules for matching you need to satisfy.
+
+This query also has batching parameters:
+
+- INT num_of_source_batches = 10
+- INT num_of_target_batches = 1
+
+Setting `num_of_source_batches` to a **higher number** can potentially help
+**reduce memory consumption** at runtime, but the query will take **longer to execute**.
+Setting `num_of_target_batches` higher has the same pros and cons as
+`num_of_source_batches`, but both pros and cons are to a **greater extent**.
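+
+As a hedged illustration, `match_entities` can be launched through the same
+REST pattern shown for `delete_all_cc_connections` above (the parameter names
+are the documented ones; the host is omitted exactly as in the earlier
+examples and must be filled in for your instance):
+
+```bash
+# Sketch: 20 source batches to trade a longer runtime for lower memory use
+curl -X GET "http://:9000/query/ER_2023/match_entities?num_of_source_batches=20&num_of_target_batches=1"
+```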
+
+This query also has degree limitation parameters:
+
+- INT pii_low_connections_limit = 100
+- INT pii_high_connections_limit = 25000
+
+Setting `pii_low_connections_limit` to a **higher number** will allow us to
+**detect more matches**, but **the risk is an exponential increase in memory
+consumption and computation time**.
+Setting `pii_high_connections_limit` to a **higher number** will allow us to
+**detect more matches** with **no memory consumption penalty**, but
+the query will take **longer to execute**.
+
+The logic behind these parameters determines whether a pair of entities will be
+evaluated for a potential match, depending on the connectivity of their shared
+PIIs. If the query finds at least one low-connectivity PII connecting a pair of
+entities (where the connectivity limit is determined by `pii_low_connections_limit`),
+the query will then also include all other PIIs shared between the two entities
+in the decision-making process (as long as these other PIIs' degree does not
+exceed the value set by `pii_high_connections_limit`).
+
+**Note**: Fine-tuning the parameters in this query allows the process to
+identify matches more accurately while also taking computation time and memory
+efficiency into account.
+
+This query also has a timestamp parameter:
+
+- DATETIME compute_entities_after_date = to_datetime("1970-01-01 00:00:00")
+
+The `compute_entities_after_date` parameter defaults to the minimum DATETIME value,
+which makes this query execute on all the entities in the graph.
+If this parameter is set to another timestamp, then this query will calculate
+similarities only for the newer entities inserted into the system after this time
+(it will compare these entities to each other and also compare them to the older
+entities, but it won't compare older entities to one another).
+This parameter's value can be set to the timestamp returned by the
+`unify_entities` query the previous time we executed Entity Resolution.
+
+### Step 3: Form Communities
+
+Run the `unify_entities` query to associate all matched Entities from the
+previous step into a Connected_Component community.
+
+Subsequent queries can be run following the completion of the aforementioned two.
+
+This query's output includes a timestamp we can pass into the `match_entities`
+query as the `compute_entities_after_date` parameter the next time we execute
+the Entity Resolution process.
+
+### Step 4 (Optional): Clear Out Remaining Disconnected Nodes of Connected Components
+
+**Note**: This step **can be skipped** if running the Entity Resolution
+process for **the first time** on a graph.
+
+Run the `delete_unused_cc_nodes` query to delete the unused `Connected_Component`
+vertices that are not connected to anything as a result of re-running the Entity
+Resolution process.
+
+The default batching parameters allow the user to execute this query in one batch:
+
+- INT num_of_batches = 1
+- INT batch_id = 0
+
+By setting these parameters differently, the user can execute this query in
+multiple batches.
+
+To execute this process in 50 hard batches, we need to run this query 50 times
+while incrementing the `batch_id` value each time.
+In that case, we'll set the value of `num_of_batches` to `50` on each call,
+but the value of `batch_id` will initially be `0` and we'll increment it each run
+until `49`.
+Instead of running this query 50 times manually from the UI, we can run this
+query 50 times as a REST endpoint from a Linux shell:
+
+```bash
+for i in {0..49};
+  do curl -X GET "http://:9000/query/ER_2023/delete_unused_cc_nodes?num_of_batches=50&batch_id=${i}";
+done;
+```
+
+**Note**: Calling this query multiple times and deleting data in hard batches
+lets us delete data in parallel, which makes the deletion process run
+more smoothly and substantially faster at high scale.
+
+### Additional Queries
+
+ - `results_file`: Takes a file path as a parameter. This query prints out
+   the Entity Resolution results to a CSV file.
+ - `find_shared_piis_of_two_entities`: Takes two Entity IDs as parameters.
+   This query returns the attribute type, value, and degree for each PII that
+   is shared between the provided entities.
+
+## Scalability
+
+The main heavy query in the Entity Resolution process is `match_entities`, as
+it performs the entire matching computation.
+The `unify_entities` query is light, and the other deletion queries can be run
+in hard batches.
+
+If the user wishes to execute the Entity Resolution process at high scale and
+an enormous number of pairs of matching entities is expected, it might be wise
+to consider saving the edges on disk instead of in RAM.
+
+The steps to configure the system to save all edges on disk instead of RAM are as follows:
+
+- Run the command: `gadmin config entry GPE.EdgeDataMemoryLimit`
+- Set the value to `0`
+- Run the command: `gadmin config apply -y`
+- Restart the cluster: `gadmin restart -y`
+
+Or, non-interactively:
+
+```bash
+gadmin config set GPE.EdgeDataMemoryLimit 0
+gadmin config apply -y
+gadmin restart -y
+```
+
+**Note**: Additional scalability-related information is noted in the query details above.
+
+Please reference the following link for more information on MinHash:
+
+[MinHash Based Fuzzy Match on Graph](https://www.tigergraph.com/blog/minhash-based-fuzzy-match-on-graph/)
+
+## Mock Data
+
+The `data` folder is populated with sample data files. These files are crafted for testing and demonstration purposes.
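+
+Putting the steps above together, a minimal first run of the pipeline on this
+sample data (a sketch: Steps 1 and 4 are skipped on a first run, and all
+parameters are left at their documented defaults) looks like:
+
+```gsql
+USE GRAPH Entity_Resolution
+RUN QUERY match_entities()
+RUN QUERY unify_entities()
+```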
diff --git a/connected_customer/entity_resolution/meta/Insights_Entity_Resolution.json b/connected_customer/entity_resolution/meta/Insights_Entity_Resolution.json new file mode 100644 index 00000000..4ecaec57 --- /dev/null +++ b/connected_customer/entity_resolution/meta/Insights_Entity_Resolution.json @@ -0,0 +1,332 @@ +{ + "defaultGraph": "Entity_Resolution", + "iconURL": "/insights/static/media/airplay.245a1409e0f415cae4a5c62546967039.svg", + "id": "tPswFHGrTWpL5A64mjdhtV", + "pageConfigSeparated": true, + "pages": [ + { + "chartMap": { + "1sDFWDdKeFuPcv1HJSyFLt": { + "chartSettings": { + "inputStates": [ + { + "dataType": "number", + "id": "input_rW8fa6DcKRSe18pcNWvenV", + "label": "Top N Clusters", + "name": "topN", + "placeholder": "Top N", + "settings": { + "max": "10", + "min": "1", + "step": "1" + }, + "widgetType": "Input" + } + ] + }, + "graphName": "Entity_Resolution", + "hideWidgetName": false, + "id": "1sDFWDdKeFuPcv1HJSyFLt", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "TopN", + "type": "Inputs", + "version": "1764560443931821966" + }, + "8L4MKHtNiBav9ufE6V7U7Z": { + "chartSettings": {}, + "graphName": "Entity_Resolution", + "hideWidgetName": false, + "id": "8L4MKHtNiBav9ufE6V7U7Z", + "patternLimit": 5, + "query": "INTERPRET QUERY() FOR GRAPH Entity_Resolution {\n\n SumAccum @@entity_cnt;\n SumAccum @@cc_cnt;\n\n // Count all Entity vertices\n _entities =\n SELECT s\n FROM Entity:s\n ACCUM @@entity_cnt += 1;\n\n // Count all Connected_Component vertices\n _components =\n SELECT c\n FROM Connected_Component:c\n ACCUM @@cc_cnt += 1;\n\n // Compute duplicates resolved = entities - components\n PRINT (@@entity_cnt - @@cc_cnt) AS duplicates_resolved;\n}\n", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Entities Resolved", + "type": "value", + "version": "1764562711877287648" + }, + "9PZnMP9LiZrpDkRR3P7Gh4": { + "chartSettings": { + "category": [ + { + "id": "bucket", + "type": "string" + } + ], + "showColumns": [ + { + "isChecked": true, + "name": "bucket" + }, + { + "isChecked": true, + "name": "freq" + } + ], + "tableColumns": [ + { + "id": "Matched pattern", + "isChecked": "true", + "label": "" + }, + { + "children": [], + "id": "Vertices", + "isChecked": "true", + "isExpanded": true, + "label": "Vertices" + }, + { + "children": [], + "id": "Edges", + "isChecked": "true", + "isExpanded": false, + "label": "Edges" + } + ], + "tableHeaders": [ + "bucket", + "freq" + ], + "tableIndex": 0, + "value": [ + { + "id": "freq", + "type": "number" + } + ] + }, + "graphName": "Entity_Resolution", + "hideWidgetName": false, + "id": "9PZnMP9LiZrpDkRR3P7Gh4", + "patternLimit": 5, + "query": "INTERPRET QUERY() FOR GRAPH Entity_Resolution {\n\n // Per-component size\n SumAccum @size;\n\n // Global bucket counters\n SumAccum @@size1;\n SumAccum @@size2;\n SumAccum @@size3_5;\n SumAccum @@size6_10;\n SumAccum @@size11_20;\n SumAccum @@size21p;\n\n // Compute size for each Connected_Component\n Seed = { Connected_Component.* };\n\n comps =\n SELECT c\n FROM Seed:c -(Entity_In_Ring:e)- Entity:ent\n ACCUM c.@size += 1;\n\n // Bucket by size\n agg =\n SELECT c\n FROM comps:c\n POST-ACCUM\n CASE\n WHEN c.@size == 1 THEN\n @@size1 += 1\n WHEN c.@size == 2 THEN\n @@size2 += 1\n WHEN c.@size >= 3 AND c.@size <= 5 THEN\n @@size3_5 += 1\n WHEN c.@size >= 6 AND c.@size <= 10 THEN\n @@size6_10 
+= 1\n WHEN c.@size >= 11 AND c.@size <= 20 THEN\n @@size11_20 += 1\n ELSE\n @@size21p += 1\n END;\n\n // Instead of one row with 6 columns,\n // print 6 rows with 2 columns: (bucket, freq)\n PRINT \"Size = 1\" AS bucket, @@size1 AS freq;\n PRINT \"Size = 2\" AS bucket, @@size2 AS freq;\n PRINT \"Size 3–5\" AS bucket, @@size3_5 AS freq;\n PRINT \"Size 6–10\" AS bucket, @@size6_10 AS freq;\n PRINT \"Size 11–20\" AS bucket, @@size11_20 AS freq;\n PRINT \"Size 21+\" AS bucket, @@size21p AS freq;\n}\n", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Cluster Size Distribution", + "type": "table", + "version": "1764566105487612322" + }, + "rFHwVwsv15H4BKUi13uU5F": { + "chartSettings": { + "markdown": "## Entity Resolution Run Guide\n\nIf the charts and tables on this page look **empty** or show **0 Connected Components**, it means the Entity Resolution process has not been run yet on this graph.\n\nFollow this order when running ER on the **Entity_Resolution** graph:\n\n---\n\n### Load / refresh the base data (one-time or as needed)\n\nThese steps are usually done once when the solution kit is installed, or whenever you refresh the dataset:\n\n1. **Load entities and PII**\n\n - `RUN LOADING JOB load_data_to_er_graph`\n\n2. **Load MinHash hashes for fuzzy matching (email, name, phone)**\n\n - `RUN LOADING JOB load_hash_to_er_graph`\n\nIf the **Entity** count is greater than 0, your base data is loaded.\n\n---\n\n### Run the Entity Resolution Pipeline\n\nFor a fresh run (no Connected_Component data yet), you typically do:\n\n**Match entities (create Same_As edges)** \n Uses MinHash + Jaro–Winkler + PII weights to score similarity. \n\n ```gsql\n RUN QUERY match_entities();\n```\n\n**Unify entities into Connected Components**\n\nAssigns every `Entity` to a `Connected_Component` community based on `Same_As` edges.\n\n```gsql\nRUN QUERY unify_entities();\n```\nThis should fill up the Connected_Component vertices.\n" + }, + "graphName": "Entity_Resolution", + "hideWidgetName": false, + "id": "rFHwVwsv15H4BKUi13uU5F", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "StartUp Notes", + "type": "markdown", + "version": "1764561962840319249" + }, + "sXc7DZ3wwqkqei21z36X2z": { + "chartSettings": {}, + "graphName": "Entity_Resolution", + "hideWidgetName": false, + "id": "sXc7DZ3wwqkqei21z36X2z", + "patternLimit": 5, + "query": "INTERPRET QUERY(INT topN) FOR GRAPH Entity_Resolution {\n\n // Count how many Entity vertices belong to each Connected_Component\n SumAccum @size;\n\n Seed = { Connected_Component.* };\n\n Result =\n SELECT c\n FROM Seed:c -(Entity_In_Ring:e)- Entity:ent\n ACCUM c.@size += 1\n ORDER BY c.@size DESC\n LIMIT topN;\n\n PRINT Result;\n}\n", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Top N Clusters", + "type": "table", + "version": "1764323817053623261" + }, + "whCPA74bT5KKghF4aUYAxX": { + "chartSettings": { + "showColumns": [ + { + "isChecked": true, + "name": "avg_score" + } + ], + "tableColumns": [ + { + "id": "Matched pattern", + "isChecked": "true", + "label": "" + }, + { + "children": [], + "id": "Vertices", + "isChecked": "true", + "isExpanded": true, + "label": "Vertices" + }, + { + "children": [], + "id": "Edges", + "isChecked": "true", + "isExpanded": false, + "label": "Edges" + 
} + ], + "tableHeaders": [ + "avg_score" + ] + }, + "graphName": "Entity_Resolution", + "hideWidgetName": false, + "id": "whCPA74bT5KKghF4aUYAxX", + "patternLimit": 5, + "query": "INTERPRET QUERY() FOR GRAPH Entity_Resolution {\n\n // 1) Compute size (number of Entities) for each Connected_Component\n SumAccum @size;\n SetAccum @@same_as_edges; // collect Same_As edges explicitly\n\n comps =\n SELECT c\n FROM Connected_Component:c -(Entity_In_Ring:er)- Entity:ent\n ACCUM c.@size += 1;\n\n // 2) Choose one non-singleton component (size > 1), e.g. the largest\n cc_to_show =\n SELECT c\n FROM comps:c\n WHERE c.@size > 1\n ORDER BY c.@size DESC\n LIMIT 1;\n\n // 3) Get all Entities in that component\n ents =\n SELECT ent\n FROM cc_to_show:c -(Entity_In_Ring:er)- Entity:ent;\n\n // 4) Collect Same_As edges between those Entities\n result_ents =\n SELECT s\n FROM ents:s -(Same_As:e)- ents:t\n ACCUM\n @@same_as_edges += e;\n\n // 5) Print both vertices and edges so the Graph widget can render them\n PRINT result_ents;\n PRINT @@same_as_edges;\n}\n", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Resolved Entity Graph", + "type": "internal-graph", + "version": "1764567808410839821" + } + }, + "globalParameters": { + "topN": { + "id": "input_rW8fa6DcKRSe18pcNWvenV", + "name": "topN", + "type": "NUMBER", + "value": 4 + } + }, + "iconURL": "/insights/static/media/brain-circuit.29a9b2394d2ecb53df1b9e1c861720be.svg", + "id": "6xa7y24BQaKkAS2ddKMEH6", + "isDetail": true, + "isNew": false, + "layouts": { + "md": [ + { + "h": 12, + "i": "1sDFWDdKeFuPcv1HJSyFLt", + "moved": false, + "static": false, + "w": 2, + "x": 4, + "y": 0 + }, + { + "h": 12, + "i": "8L4MKHtNiBav9ufE6V7U7Z", + "moved": false, + "static": false, + "w": 2, + "x": 10, + "y": 0 + }, + { + "h": 15, + "i": "9PZnMP9LiZrpDkRR3P7Gh4", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 12 + }, + { + "h": 12, + "i": "rFHwVwsv15H4BKUi13uU5F", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 0 + }, + { + "h": 12, + "i": "sXc7DZ3wwqkqei21z36X2z", + "moved": false, + "static": false, + "w": 4, + "x": 6, + "y": 0 + }, + { + "h": 15, + "i": "whCPA74bT5KKghF4aUYAxX", + "moved": false, + "static": false, + "w": 5, + "x": 4, + "y": 12 + } + ], + "xs": [ + { + "h": 12, + "i": "rFHwVwsv15H4BKUi13uU5F", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 0 + }, + { + "h": 12, + "i": "1sDFWDdKeFuPcv1HJSyFLt", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 12 + }, + { + "h": 12, + "i": "sXc7DZ3wwqkqei21z36X2z", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 24 + }, + { + "h": 12, + "i": "8L4MKHtNiBav9ufE6V7U7Z", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 36 + }, + { + "h": 15, + "i": "9PZnMP9LiZrpDkRR3P7Gh4", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 48 + }, + { + "h": 12, + "i": "whCPA74bT5KKghF4aUYAxX", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 63 + } + ] + }, + "title": "Overview", + "version": "1764567463509216731", + "weight": 10 + } + ], + "title": "New Application", + "userRoleForApp": "owner", + "version": "1764322614974303069" +} \ No newline at end of file diff --git a/connected_customer/entity_resolution/queries/delete_all_cc_connections.gsql b/connected_customer/entity_resolution/queries/delete_all_cc_connections.gsql index 948f7954..6969d7cc 100644 --- 
a/connected_customer/entity_resolution/queries/delete_all_cc_connections.gsql
+++ b/connected_customer/entity_resolution/queries/delete_all_cc_connections.gsql
@@ -1,7 +1,20 @@
 USE GRAPH Entity_Resolution
 CREATE OR REPLACE DISTRIBUTED QUERY delete_all_cc_connections(INT num_of_batches = 1, INT batch_id = 0) FOR GRAPH Entity_Resolution{
+  /*
+  Query Name: delete_all_cc_connections
+  Graph Maintenance: Clear Connected Component (CC) Assignments for Entities
+  Purpose:
+    Remove all `Entity_In_Ring` edges linking `Entity` vertices to
+    `Connected_Component` vertices. Do this for a fresh start.
+
+  Key Concept:
+    - Batch Deletion:
+      Uses `num_of_batches` and `batch_id` to partition the `Entity` vertex
+      space so that very large graphs can be processed across multiple jobs
+      without overloading resources.
+  */
   SumAccum<INT> @@count;
   temp = SELECT t
@@ -20,4 +33,4 @@ UPDATE DESCRIPTION OF QUERY delete_all_cc_connections "This query deletes all En
 UPDATE DESCRIPTION OF QUERY_PARAM delete_all_cc_connections.num_of_batches "Number of batches to partition the deletions for all Entity vertices in the graph. This query must be called the same number of times as batch_num with incrementing batch_id 0 through batch_num - 1 to process all Entity vertices in the graph. Defaults to 1."
-UPDATE DESCRIPTION OF QUERY_PARAM delete_all_cc_connections.batch_id "Current batch partition of Entity vertices to process and must be called with batch_id 0 through batch_num - 1. Defaults to 0."
\ No newline at end of file
+UPDATE DESCRIPTION OF QUERY_PARAM delete_all_cc_connections.batch_id "Current batch partition of Entity vertices to process and must be called with batch_id 0 through batch_num - 1. Defaults to 0."
diff --git a/connected_customer/entity_resolution/queries/delete_unused_cc_nodes.gsql b/connected_customer/entity_resolution/queries/delete_unused_cc_nodes.gsql
index cc78f589..979172c9 100644
--- a/connected_customer/entity_resolution/queries/delete_unused_cc_nodes.gsql
+++ b/connected_customer/entity_resolution/queries/delete_unused_cc_nodes.gsql
@@ -1,7 +1,20 @@
 USE GRAPH Entity_Resolution
 CREATE OR REPLACE DISTRIBUTED QUERY delete_unused_cc_nodes(INT num_of_batches = 1, INT batch_id = 0) FOR GRAPH Entity_Resolution {
-
+  /*
+  Query Name: delete_unused_cc_nodes
+  Graph Maintenance: Prune Unused Connected Component (CC) Vertices
+
+  Purpose:
+    Clean up the graph by deleting `Connected_Component` vertices that are no
+    longer referenced by any entities. Typically run after clearing CC
+    assignment edges so that orphaned CC nodes are removed.
+
+  Key Concept:
+    - Orphaned CC Nodes:
+      A `Connected_Component` vertex with `outdegree() == 0` is considered
+      unused and can be safely deleted.
+  */
   SumAccum<INT> @@count;
   start = {Connected_Component.*};
@@ -22,4 +35,4 @@ UPDATE DESCRIPTION OF QUERY delete_unused_cc_nodes "This query deletes all unuse
 UPDATE DESCRIPTION OF QUERY_PARAM delete_unused_cc_nodes.num_of_batches "Number of batches to partition the deletions for all Connected_Component vertices in the graph. This query must be called the same number of times as batch_num with incrementing batch_id 0 through batch_num - 1 to process all Connected_Component vertices in the graph. Defaults to 1."
-UPDATE DESCRIPTION OF QUERY_PARAM delete_unused_cc_nodes.batch_id "Current batch partition of Connected_Component vertices to process and must be called with batch_id 0 through batch_num - 1. Defaults to 0."
\ No newline at end of file
+UPDATE DESCRIPTION OF QUERY_PARAM delete_unused_cc_nodes.batch_id "Current batch partition of Connected_Component vertices to process and must be called with batch_id 0 through batch_num - 1. Defaults to 0."
diff --git a/connected_customer/entity_resolution/queries/find_shared_piis_of_two_entities.gsql b/connected_customer/entity_resolution/queries/find_shared_piis_of_two_entities.gsql
index cc2eec3f..db2db44c 100644
--- a/connected_customer/entity_resolution/queries/find_shared_piis_of_two_entities.gsql
+++ b/connected_customer/entity_resolution/queries/find_shared_piis_of_two_entities.gsql
@@ -1,7 +1,23 @@
 USE GRAPH Entity_Resolution
 CREATE OR REPLACE QUERY find_shared_piis_of_two_entities(VERTEX entity_1, VERTEX entity_2) {
-
+  /*
+  Query Name: find_shared_piis_of_two_entities
+  Real-Time PII Linkage Analysis between Two Entities
+
+  Purpose:
+    Given two `Entity` vertices, identify all **PII vertices** that are directly
+    shared between them.
+
+  Key Output Fields:
+    - pii_type - The type/label of the shared PII vertex.
+    - pii_value - The shared PII vertex instance.
+    - degree - How many vertices share the PII.
+
+  Output:
+    A list of tuples (pii_type, pii_value, degree) for each PII vertex that is
+    connected to both entities.
+  */
   TYPEDEF TUPLE<pii_type STRING, pii_value VERTEX, degree INT> pii_info;
   ListAccum<pii_info> @@degrees_of_shared_piis;
@@ -31,4 +47,4 @@ UPDATE DESCRIPTION OF QUERY find_shared_piis_of_two_entities "This query returns
 UPDATE DESCRIPTION OF QUERY_PARAM find_shared_piis_of_two_entities.entity_1 "First Entity vertex used to search for shared PII vertices."
-UPDATE DESCRIPTION OF QUERY_PARAM find_shared_piis_of_two_entities.entity_2 "Second Entity vertex used to search for shared PII vertices."
\ No newline at end of file
+UPDATE DESCRIPTION OF QUERY_PARAM find_shared_piis_of_two_entities.entity_2 "Second Entity vertex used to search for shared PII vertices."
diff --git a/connected_customer/entity_resolution/queries/match_entities.gsql b/connected_customer/entity_resolution/queries/match_entities.gsql
index c586ba36..df44a7dc 100644
--- a/connected_customer/entity_resolution/queries/match_entities.gsql
+++ b/connected_customer/entity_resolution/queries/match_entities.gsql
@@ -19,6 +19,37 @@ CREATE OR REPLACE DISTRIBUTED QUERY match_entities(
   DATETIME compute_entities_after_date = to_datetime("1970-01-01 00:00:00") // All entities that were created in TG after this date will be computed to find similarity edges related to these new entities. The default value is the earliest time possible (great for the first time running this query)
 ) FOR GRAPH Entity_Resolution {
+  /*
+  Query Name: match_entities
+  Batch-Based Weighted & Fuzzy Entity Resolution (All-Pairs Similarity)
+
+  Purpose:
+    Batch job that finds and links similar `Entity` vertices based on shared PII,
+    using exact and fuzzy matching to compute a weighted similarity score.
+    When the score exceeds `threshold`, a `Same_As` edge is created between
+    the matching entities.
+    This operation may be resource intensive.
+
+  Key Concepts:
+    - Each PII type contributes a configurable weight to the final match score.
+    - PIIs like name, email, and phone number are hashed and compared with
+      derived Jaro-Winkler scores.
+    - Highly connected PII vertices are skipped to prevent false positives and hot spots.
+
+  Parameters:
+    customer_has_*_weight
+      Controls how strongly each PII attribute influences the similarity score.
+    num_of_source_batches, num_of_target_batches
+      Control batching strategy and parallelism.
+    threshold
+      Minimum total similarity score required to create a `Same_As` edge.
+    compute_entities_after_date
+      Only `Entity` vertices created after this timestamp are treated as new
+      and considered for similarity edge creation.
+
+  Output:
+    Inserts `Same_As` edges between matched `Entity` pairs and prints run
+    statistics.
+  */
   TYPEDEF TUPLE<v VERTEX<Entity>, str STRING, created_at DATETIME> entity_fuzzy_vertex_info;
   ListAccum<entity_fuzzy_vertex_info> @entity_email_address_list;
   ListAccum<entity_fuzzy_vertex_info> @entity_name_list;
@@ -336,4 +367,4 @@ UPDATE DESCRIPTION OF QUERY_PARAM match_entities.pii_low_connections_limit "Maxi
 UPDATE DESCRIPTION OF QUERY_PARAM match_entities.pii_high_connections_limit "Maximum outdegree of PII vertex considered for 'high connection' matching. The case will be skipped if the same vertex is connected to too many entities. Defaults to 25000."
-UPDATE DESCRIPTION OF QUERY_PARAM match_entities.compute_entities_after_date "All Entity vertices with created_at after this date will be computed to find similarity edges related to these new entities. Defaults to 1970-01-01 00:00:00 (which is the earliest possible created_at and great for the initial running of this query)."
\ No newline at end of file
+UPDATE DESCRIPTION OF QUERY_PARAM match_entities.compute_entities_after_date "All Entity vertices with created_at after this date will be computed to find similarity edges related to these new entities. Defaults to 1970-01-01 00:00:00 (which is the earliest possible created_at and great for the initial running of this query)."
diff --git a/connected_customer/entity_resolution/queries/output_entity_cc_to_file.gsql b/connected_customer/entity_resolution/queries/output_entity_cc_to_file.gsql
index 8971a9ff..2ab66b96 100644
--- a/connected_customer/entity_resolution/queries/output_entity_cc_to_file.gsql
+++ b/connected_customer/entity_resolution/queries/output_entity_cc_to_file.gsql
@@ -1,7 +1,17 @@
 USE GRAPH Entity_Resolution
 CREATE OR REPLACE QUERY output_entity_cc_to_file(STRING output_file_path = "/home/tigergraph/gsql_output/entity_cc_output.csv") {
-
+  /*
+  Query Name: output_entity_cc_to_file
+  Export Entity-to-Community Mapping to File
+
+  Purpose:
+    Generate a CSV file that maps every `Entity` vertex to the
+    `Connected_Component` vertex it is currently assigned to via
+    `Entity_In_Ring` edges. Useful for reporting, audits, and
+    downstream ML or warehousing workflows.
+
+  */
   FILE f (output_file_path);
   f.println("Entity", "Connected_Component");
@@ -19,4 +29,4 @@ CREATE OR REPLACE QUERY output_entity_cc_to_file(STRING output_file_path = "/hom
 UPDATE DESCRIPTION OF QUERY output_entity_cc_to_file "This query outputs a file containing a mapping of all Entity vertices to their respective Connected_Component community vertex."
-UPDATE DESCRIPTION OF QUERY_PARAM output_entity_cc_to_file.output_file_path "File path location containing the output Entity vertex to Connected_Component vertex in CSV format. Defaults to /home/tigergraph/gsql_output/entity_cc_output.csv"
\ No newline at end of file
+UPDATE DESCRIPTION OF QUERY_PARAM output_entity_cc_to_file.output_file_path "File path location containing the output Entity vertex to Connected_Component vertex in CSV format. Defaults to /home/tigergraph/gsql_output/entity_cc_output.csv"
diff --git a/connected_customer/entity_resolution/queries/unify_entities.gsql b/connected_customer/entity_resolution/queries/unify_entities.gsql
index 8c44ae54..ff1ab9e9 100644
--- a/connected_customer/entity_resolution/queries/unify_entities.gsql
+++ b/connected_customer/entity_resolution/queries/unify_entities.gsql
@@ -1,7 +1,27 @@
 USE GRAPH Entity_Resolution
 CREATE OR REPLACE DISTRIBUTED QUERY unify_entities() FOR GRAPH Entity_Resolution {
+  /*
+  Query Name: unify_entities
+  Finalize Entity Resolution: Connected Component Assignment for Similar Entities
+  Purpose:
+    Using the existing `Same_As` similarity edges created by `match_entities`,
+    group all connected `Entity` vertices into communities; each community
+    becomes a `Connected_Component` vertex linked to its members by
+    `Entity_In_Ring` edges.
+
+  Core Logic:
+    1. Each `Entity` vertex is labeled with its own internal ID.
+    2. The smallest ID is iteratively propagated across `Same_As` edges until
+       all vertices in a connected group share the same ID.
+    3. For each final ID, a `Connected_Component` vertex is created and every
+       member `Entity` is connected to it via `Entity_In_Ring`.
+
+  Output:
+    - One `Connected_Component` vertex per connected group of `Entity` vertices.
+    - One `Entity_In_Ring` edge from each `Entity` to its community.
+    - Printed execution time.
+  */
   MinAccum<INT> @cc_id; // Each vertex's tentative component id
   DATETIME start_time = now();
@@ -41,4 +61,4 @@ CREATE OR REPLACE DISTRIBUTED QUERY unify_entities() FOR GRAPH Entity_Resolution
 }
-UPDATE DESCRIPTION OF QUERY unify_entities "This query associates all Entity vertices in the graph to a Connected_Component vertex using Same_As edge previously inserted by match_entities query."
\ No newline at end of file
+UPDATE DESCRIPTION OF QUERY unify_entities "This query associates all Entity vertices in the graph to a Connected_Component vertex using Same_As edge previously inserted by match_entities query."
diff --git a/connected_customer/entity_resolution/readme.md b/connected_customer/entity_resolution/readme.md
deleted file mode 100644
index 52afd951..00000000
--- a/connected_customer/entity_resolution/readme.md
+++ /dev/null
@@ -1,150 +0,0 @@
-# Narratives
-
-Entity Resolution is the cornerstone of our approach to addressing the challenges posed by the vast array of data in today's interconnected digital landscape. Specifically tailored to the Connected Customer use-case, our Entity Resolution solution excels at accurately identifying and linking entities across disparate datasets, focusing on Personally Identifiable Information (PII) such as email addresses, phone numbers, and physical addresses. By seamlessly integrating PII from various sources, businesses can create a unified view of each customer. This comprehensive understanding empowers organizations to personalize marketing campaigns, tailor product recommendations, and optimize customer service interactions, fostering stronger customer relationships and driving business success.
-
-# Components
-
-This repository includes multiple components:
-
-- `data` - Sample data.
-- `load_jobs` - Scripts for data loading tasks.
-- `meta` - Solution Kit metadata.
-- `queries` - Collection of GSQL queries.
-- `schema` - Definition of database schema.
-- `readme.md` - This usage guide.
-- `setup.sh` - Automated setup script.
-
-# Instructions
-
-The `setup.sh` script is designed to streamline the initial setup process by sequentially executing the following steps:
-
-1. 
**Schema Creation**: Initiates the schema creation process with the `schema/add_to_er_graph.gsql` script. -2. **Data Loading**: Load data into the schema by running the data loading jobs with the `loading_job/load_data_to_er_graph.gsql` and `loading_job/load_hash_to_er_graph.gsql` scripts. -3. **Query Installation**: Completes the setup by installing necessary queries through the `queries/install_queries.sh` script. - -## Query Execution Order and Explanations - -### Step 1: Delete Edges of Connected Components - -**Note**: This step **can be skipped** if running the Entity Resolution process for **the first time** on a graph. - -Run the `delete_all_cc_connections` query to delete the `Entity_In_Ring` edges on the graph. - -The default batching parameters allows the user to execute this query in one batch: - - - INT num_of_batches = 1 - - INT batch_id = 0 - -By setting the params differently the user can execute this query in multiple batches: - -To execute this process in 50 hard batches, we need to run this query 50 times while incremeting the `batch_id` value each time. -In that case, we'll set the value of `num_of_batches` to `50` each call, but the value of `batch_id` will initially be `0` and we'll increment it each run until `49`. -Instead of running this query 50 times manually from the UI, we can run this query 50 times as a REST endpoint from Linux shell: - -`for i in {0..49};do curl -X GET "http://:9000/query/ER_2023/delete_all_cc_connections?num_of_batches=50&batch_id=${i}";done;` - -**Note**: By calling this query multiple times and deleting data in hard batches it allows us to delete data in parallel, and make the deltion process run smoother and substantially faster on high scale. - -### Step 2: Insert Edges between Matching Entities - -After loading historical Entity and PII data and configuring wcc weights for matching, run `match_entities` to perform matching on the entire graph. - -The `match_entities` query creates `Same_As` edges (similarity edges) between Entity vertices that match. - -This query has weights for each PII attribute and threshold parameters used by the weighted WCC matching process: - - - FLOAT customer_has_birthdate_weight = 0.5 - - FLOAT customer_has_email_address_weight = 0.5 - - FLOAT customer_has_name_weight = 0.2 - - FLOAT customer_has_phone_weight = 0.5 - - FLOAT customer_has_std_city_weight = 0.2 - - FLOAT customer_has_std_postcode_weight = 0.2 - - FLOAT customer_has_std_state_weight = 0.2 - - FLOAT customer_has_std_street_address_weight = 0.2 - - FLOAT customer_has_tax_id_number_weight = 0.5 - - FLOAT customer_has_source_customer_id_weight = 0.5 - - FLOAT threshold = 1.0 - -When an Entity matches another Entity through shared PII attribute(s) the respective weight(s) will accumulate and any accumulated weights equal or over threshold is considered a match. For example if a phone number has a weight of 1.0 and the threshold is 1.0 then any single matching phone between any Entities will cause those respective Entities to be put into the same Connected_Component community. If phone number has a weight of 0.5 and threshold is 1.0 then it would require at least two shared phones between any Entities to match etc. Default weights and thresholds are assigned to these queries but should be adapted if there are different business rules for matching required to satisfy. 
- -This query also has batching parameters: - - - INT num_of_source_batches = 10 - - INT num_of_target_batches = 1 - -Setting `num_of_source_batches` to a **higher number** can potentially help **reduce memory consumption** at runtime, but the query will take **longer to execute**. -Setting `num_of_target_batches` has the same pros/cons as `num_of_source_batches`, while both pros and cons are to a **greater extent**. - -This query also has degree limitation parameters: - - - INT pii_low_connections_limit = 100 - - INT pii_high_connections_limit = 25000 - -Setting `pii_low_connections_limit` to a **higher number** will allow us to **detect more matches**, but **the risk is an exponential increase in memory consumption and computation time**. -Setting `pii_high_connections_limit` to a **higher number** will allow us to **detect more matches** while there is **no memory consumption penalty**, but the query will take **longer to execute**. - -The logic behind these parameters determines if a pair of entities will be evaluated for a potential match or not, depending on the connectivity of their shared PIIs. If the query finds at least one low-connectivity PII connecting a pair of entities (while the connectivity limit is determined by `pii_low_connections_limit`), the query will then also include all other PIIs shared between the two entities in the decison-making process (as long as these other PIIs' degree does not exeed the value set by `pii_high_connections_limit`). - -**Note**: Fine-tuning of the parameters in this query allows the process to identify matches more accuratly while also taking computation time and memory efficiency into account. - -This query also has a timestamp parameter: - - - DATETIME compute_entities_after_date = to_datetime("1970-01-01 00:00:00") - -The `compute_entities_after_date` defaults to the minimum DATETIME value, which will make this query execute on all the entities in the graph. -If this parameter is set to another timestamp, then this query will calculate similarities only for the newer entities inserted to the system after this time (it will compare these entities to eachother and also compare them to the older entities, but it won't compare older entities to one another). -This parameter's value could be set to the timestamp we got from the `unify_entities` query after we executed Entity Resolution in the previous time. - -### Step 3: Form Communities - -Run the `unify_entities` query to associate all matched Entities from the previous step into a Connected_Component community. - -Subsequent queries can be run following the completion of the aforementioned two. - -This query's output includes a timestamp we can pass into the `match_entities` query as the `compute_entities_after_date` parameter the next time we execute the Entity Resolution process. - -### Step 4 (Optional): Clear Out Remaining Disconnected Nodes of Connected Components - -**Note**: This step **can be skipped** if running the Entity Resolution process for **the first time** on a graph. - -Run the `delete_unused_cc_nodes` query to delete the unused `Connected_Component` vertices that are not connected to anything as a result of re-running the Entity Resolution process. 
- -The default batching parameters allows the user to execute this query in one batch: - - - INT num_of_batches = 1 - - INT batch_id = 0 - -By setting the params differently the user can execute this query in multiple batches: - -To execute this process in 50 hard batches, we need to run this query 50 times while incremeting the `batch_id` value each time. -In that case, we'll set the value of `num_of_batches` to `50` each call, but the value of `batch_id` will initially be `0` and we'll increment it each run until `49`. -Instead of running this query 50 times manually from the UI, we can run this query 50 times as a REST endpoint from Linux shell: - -`for i in {0..49};do curl -X GET "http://:9000/query/ER_2023/delete_unused_cc_nodes?num_of_batches=50&batch_id=${i}";done;` - -**Note**: By calling this query multiple times and deleting data in hard batches it allows us to delete data in parallel, and make the deltion process run smoother and substantially faster on high scale. - -### Additional Queries - - - `results_file`: Takes a file path as a parameters. This query prints out the Entity Resolution results to a CSV file. - - `find_shared_piis_of_two_entities`: Takes two Entity IDs as parameters. This query returns the attribute type, value, and degree for each PII that is shared between the provided entities. - -## Scalability - -The main heavy query in the Entity Resolution process is `match_entities`, as it performs the entire matching computation. -The `unify_entities` query is light, and the other deletion queries can be run in hard batches. - -If the user wishes to execute the Entity Resolution process on high scale and it is expected to find an inourmous amount of pairs of matching entities, it might be wise to consider saving the edges on disk instead of RAM. - -The steps to configure the system to save all edges on disk instead of Ram are as follows: - - - Run the command: `gadmin config entry GPE.EdgeDataMemoryLimit` - - Set the value to `0` - - Run the command: `gadmin config apply -y` - - Restart cluster: `gadmin restart -y` - -**Note**: Additioanl scalability-related information is noted in the query details above. - -## Mock Data - -The `data` folder is populated with sample data files. These files are crafted for testing and demonstration purposes. diff --git a/connected_customer/product_recommendations/README.md b/connected_customer/product_recommendations/README.md new file mode 100644 index 00000000..8376ec6a --- /dev/null +++ b/connected_customer/product_recommendations/README.md @@ -0,0 +1,306 @@ +# Product Recommendations - Retail (TigerGraph Solution Kit) + +A TigerGraph solution kit that builds a graph-powered product recommendation +engine for retail and e-commerce. + +This kit provides a complete graph environment to understand customer behavior +and generate personalized product recommendations based on purchases, +attributes, and shared affinities. + +With this kit you can: + +- Cluster customers by behavior and numeric attributes (e.g., age). +- Discover products that are frequently co-purchased or share similar audiences. +- Build and explore combined "feature" nodes (clusters, styles, attributes). +- Generate ranked product recommendations for one or many customers. 
+
+---
+
+## Contents
+
+- [Overview](#overview)
+- [Features](#features)
+- [Prerequisites](#prerequisites)
+- [Setup Instructions](#setup-instructions)
+- [Query Explanations](#query-explanations)
+- [Run Example Queries](#run-example-queries)
+- [Using Your Own Data](#using-your-own-data)
+- [Resetting the Environment](#resetting-the-environment)
+
+---
+
+## Overview
+
+Product recommendations are about connecting who your customers are with
+what they buy and why they choose certain products - and then turning
+those insights into relevant suggestions in real time. Graph databases like
+TigerGraph are well-suited for this because they can traverse deeply connected
+data (e.g., customers -> purchases -> products -> attributes -> clusters) efficiently.
+
+This solution kit models a retail / e-commerce environment with:
+
+- Customers and their attributes (like age).
+- Products and Product Variants organized under a category hierarchy.
+- Attributes for both customers and products (e.g., style and color).
+- Clusters and Combined_Feature nodes that represent higher-level
+  behavioral or attribute groupings.
+- Interactions capturing purchase events between customers and product variants.
+
+You can use the included queries as building blocks for recommendation, marketing,
+and analytics use cases, or extend the graph with your own entities, attributes,
+and interaction types.
+
+---
+
+## Features
+
+- **Pre-built TigerGraph schema**
+  Vertices and edges for Customers, Products, Product Variants, Product Categories,
+  Attributes, Clusters, Combined_Features, and Interactions, all defined in
+  `schema/schema.gsql`.
+
+- **Sample product recommendation dataset**
+  Mock CSV data for customers, products, categories, styles, and purchases is
+  provided in the `data/` directory for inspection and column mapping. The
+  actual data used by the loading job is pulled from a public S3 bucket, but it
+  contains the same records as the local CSV files.
+
+- **End-to-end loading job**
+  A single loading job in `loading_job/load_jobs.gsql` that:
+  - Defines an S3 data source for the solution-kit dataset.
+  - Loads vertices and edges from:
+    - `product_categories.csv`
+    - `products.csv`
+    - `customers.csv`
+    - `customer_styles.csv`
+    - `customer_purchases.csv`
+
+- **Curated query library**
+  GSQL queries in `queries/queries.gsql` that cover:
+  - `k_means`: clustering vertices using numeric attributes.
+  - `combine_features`: merging high-degree feature nodes into Combined_Feature vertices.
+  - `recommend_products`: generating ranked product recommendations per customer.
+
+- **Reset script**
+  `reset/reset.gsql` cleans up the environment by dropping the loading job,
+  data source, queries, and the `Product_Recommendation` graph itself.
+
+
+## Prerequisites
+
+Before you run this solution kit, make sure you have:
+
+- **A running TigerGraph instance**
+  - TigerGraph installed and running (TigerGraph Server or TigerGraph Cloud).
+  - Permissions to create/drop graphs and run GSQL commands.
+
+- **GSQL client access**
+  - The `gsql` command-line tool available on the same machine/container where you cloned this repo.
+  - Ability to connect to your TigerGraph service.
+
+- **Network access for sample data**
+  - Outbound internet access from the TigerGraph machine to read the sample
+    CSV files from the public S3 bucket used in `loading_job/load_data.gsql`.
+
+- **Shell environment**
+  - A Unix-like shell (Linux, macOS, or WSL) to run `gsql` commands and any helper scripts you may add.
+
+- **(Optional) Python environment for local mock data**
+  - Python 3.x if you want to run `data/data_generator.py` to regenerate local mock CSVs.
+
+    Note that regenerating the data by itself does NOT change the data in the
+    graph. Also take care when loading newly generated mock data into a graph
+    that already contains data.
+
+  - Python packages:
+    - `numpy`
+    - `scipy`
+
+> **Graph name:** This kit creates and uses a graph named `Product_Recommendation`.
+
+---
+
+
+## Setup Instructions
+
+The instructions below assume that:
+
+- You have cloned or copied this repository onto the TigerGraph server.
+- The `gsql` command is available in your shell.
+
+If you don't yet have the `gsql` command available, see the TigerGraph documentation:
+
+- **Local GSQL shell on the server**
+  [The GSQL Shell](https://docs.tigergraph.com/tigergraph-server/4.2/gsql-shell/)
+  (explains how to run `gsql` directly on a TigerGraph server)
+
+- **Remote GSQL client (from your laptop or another machine)**
+  [Using a Remote GSQL Client](https://docs.tigergraph.com/tigergraph-server/4.2/gsql-shell/using-a-remote-gsql-client)
+  (explains how to download the GSQL client JAR, configure SSL, and create a `gsql` alias)
+
+### Run the setup script (recommended)
+
+From the root of the repository, make the setup script executable (once), then run it:
+
+```bash
+chmod +x setup.sh
+./setup.sh
+```
+
+## Query Explanations
+
+This solution kit includes several pre-built queries. Below are the most useful
+queries and what you can get from each of them, grouped by business purpose.
+
+---
+
+### 1. Customer Segmentation & Profiling
+
+**k_means(v_type, attr_set, min_cluster_count, max_cluster_count, cluster_inc, random_iter_count, conv_iter_limit, conv_threshold, random_seed, use_custom_timestamp, custom_timestamp)**
+This query uses the K-Means algorithm to cluster customers (or any vertex type)
+into behaviorally similar groups based on numerical attributes. The query
+normalizes the selected attributes, tries different numbers of clusters, and
+uses the elbow method to pick a good number of clusters. The output is a set of
+Cluster vertices and In_Cluster edges linking every input vertex to its
+assigned cluster.
+
+Use this to build customer segments for look-alike modeling, targeted campaigns,
+A/B tests, or to enrich downstream analytics with stable cluster labels.
+
+---
+
+### 2. Feature Engineering & Graph Compression
+
+**combine_features(hub_v_type, e_type, target_v_type, feature_v_type, hub_threshold, split_threshold)**
+This builds combined feature concepts from frequently co-occurring attributes.
+The query first finds "hub" feature vertices whose degree is above `hub_threshold`.
+It then looks for other feature vertices that share a moderate number of common
+target vertices with each hub vertex, where the shared neighbor count is between 2 and
+`split_threshold`. For each such pair of features, the query:
+ 1. Creates a Combined_Feature vertex representing the pattern.
+ 2. Links the original feature vertices to the Combined_Feature via Linked edges.
+ 3. Links the qualifying target vertices to the Combined_Feature via Has_Attribute edges.
+The query also flags `e.ignore_edge = TRUE` on the original feature edges so that
+future queries can skip them and reduce noise.
+
+Use this to turn very popular but generic features into more informative
+interaction features (e.g. 
"Nike x RunningShoes" instead of just "Nike"), +to enrich recommendation or similarity queries with more meaningful combinations +of customer and product attributes. + +--- + +### 3. Product Recommendations & Personalization + +**recommend_products(src_customer_input, batch_index, num_batches, target_batch, +ignore_threshold, recommendation_count, data_types, edge_importance_factors, +vertex_degree_scales, customer_popularity_scale, item_popularity_scale)** + +This query generates personalized product recommendations for each customer by +combining customer-customer similarity and item-item co-purchasing patterns +over the graph. + +It outputs a ranked list of Product_Variant vertices for each source +customer, with scores that reflect both what similar customers like and which +products are frequently bought together. + +Use this to power e-commerce product recommendations, cross-sell/upsell offers, +and personalized merchandising experiences directly on top of your TigerGraph data. + +--- + +## Run Example Queries + +After the setup is complete and the data is loaded, you can run an example query +to validate that everything is working correctly. + +On TigerGraph Cloud (or locally), open the **GSQL Console** (the interactive `gsql` +shell) for your `Product_Recommendation` graph, then run: + +```gsql +USE GRAPH Product_Recommendation +RUN QUERY recommend_products({}, 0, 1) +``` + +The first parameter (`{}`) leaves `src_customer_input` empty, so the query falls +back to **batch mode** (controlled by `batch_index` and `num_batches`). + +`batch_index = 0` and `num_batches = 1` mean "run for all customers in a single batch," +returning up to the default `recommendation_count` recommendations per customer. + +## Using Your Own Data + +By default, this solution kit loads **sample product recommendation data** from a +TigerGraph-hosted public S3 bucket, using the data source and loading job defined in +`loading_job/load_data.gsql`. The five CSV files it loads are: + +- `product_categories.csv` +- `products.csv` +- `customers.csv` +- `customer_styles.csv` +- `customer_purchases.csv` + +TigerGraph supports multiple loading options. For detailed, up-to-date examples, +please refer to the official documentation: + +- **Data loading overview** - supported sources and general workflow + https://docs.tigergraph.com/tigergraph-server/current/data-loading/data-loading-overview + +- **Data loading entry page** - index of all loading methods (local files, cloud storage, warehouses, Spark, etc.) + https://docs.tigergraph.com/tigergraph-server/current/data-loading/ + +- **Load data from cloud storage** - step-by-step guide for Amazon S3, Google Cloud Storage, and Azure Blob Storage + https://docs.tigergraph.com/tigergraph-server/current/data-loading/load-from-cloud + +At a high level, the steps to use your own data are: + +1. For cloud or external systems, define a `DATA_SOURCE` object that matches your storage system and credentials. + For local files on the TigerGraph server, you can omit `DATA_SOURCE`. +2. Update the `DEFINE FILENAME` lines in `load_all` so they point to your own + URIs (for example, your customer, product, and transaction files). +3. Keep the `LOAD` blocks the same unless your column structure changes-if it + does, adjust the `VALUES(...)` mappings so each vertex/edge attribute + (for example `Customer.id`, `Product_Variant.id`, `Interacted.timestamp`, + `Has_Attribute.score`) receives the correct input column. 
+ +Once those changes are in place, you can reload your data with: + +```bash +gsql loading_job/load_data.gsql +``` + +## Resetting the Environment + +Sometimes you'll want to wipe the graph environment and start fresh - for example, +when switching to a new dataset or after changing the schema, loading logic, or +recommendation queries. + +This repository provides a reset script for this purpose. + +> **Warning** +> All of the commands in this section permanently delete data (and, for a full reset, +> the graph and its queries). Use them only in development or when you are sure +> you want to clear the environment. + +### Full reset using the `reset` script + +If you need a complete reset of the Product Recommendation solution kit - not just +data, but also the loading job, data source, queries, and the graph definition - +you can use the `reset/reset.gsql` script. + +This script performs the following operations on the `Product_Recommendation` graph: + +```gsql +USE GRAPH Product_Recommendation + +-- data loading job & data source +DROP JOB load_all +DROP DATA_SOURCE s1 + +-- clear all installed queries (e.g., recommend_products, k_means, combine_features) +DROP QUERY ALL + +-- clear schema and data +DROP GRAPH Product_Recommendation +``` diff --git a/connected_customer/product_recommendations/meta/Insights_Product_Recommendations.json b/connected_customer/product_recommendations/meta/Insights_Product_Recommendations.json new file mode 100644 index 00000000..5de9f36a --- /dev/null +++ b/connected_customer/product_recommendations/meta/Insights_Product_Recommendations.json @@ -0,0 +1,288 @@ +{ + "defaultGraph": "Product_Recommendation", + "iconURL": "/insights/static/media/atom.14f5dd297b1a450cae3413a44f69a75b.svg", + "id": "t7fsPyC7qKW5V5Rngu9JnB", + "pageConfigSeparated": true, + "pages": [ + { + "chartMap": { + "9ArCu34uUJA22RCnncEKUk": { + "chartSettings": {}, + "graphName": "Product_Recommendation", + "hideWidgetName": false, + "id": "9ArCu34uUJA22RCnncEKUk", + "patternLimit": 5, + "query": "INTERPRET QUERY get_customer_neighborhood()\nFOR GRAPH Product_Recommendation\nSYNTAX V1 {\n\n // Pick a small sample of customers (to keep the graph readable)\n customers =\n SELECT c\n FROM Customer:c\n LIMIT 3;\n\n // Customer --(Interacted)-- Product_Variant (purchases)\n variants =\n SELECT pv\n FROM customers:c -(Interacted:e)- Product_Variant:pv\n WHERE e.interaction_type == \"purchase\";\n\n // Product_Variant --(Belongs_To)-> Product\n prods =\n SELECT p\n FROM variants:pv -(Belongs_To:e)-> Product:p;\n\n // Product --(Belongs_To)-> Product_Category\n cats =\n SELECT cat\n FROM prods:p -(Belongs_To:e)-> Product_Category:cat;\n\n // Customers' style preferences: Customer --(Has_Attribute)-- Customer_Attribute\n cust_attrs =\n SELECT ca\n FROM customers:c -(Has_Attribute:e)- Customer_Attribute:ca;\n\n // Product attributes: Product_Variant --(Has_Attribute)-- Product_Attribute\n prod_attrs =\n SELECT pa\n FROM variants:pv -(Has_Attribute:e)- Product_Attribute:pa;\n\n // Output all vertex sets so Insights can draw them\n PRINT customers;\n // PRINT cust_attrs;\n}\n", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Customer Neighborhood", + "type": "internal-graph", + "version": "1764000506666763971" + } + }, + "globalParameters": {}, + "iconURL": "/insights/static/media/globe-lock.ce36fa55b625acabad473652568e1813.svg", + "id": "hxb4w9c9zu3Lyzmgkhptsm", + "isDetail": true, + "isNew": false, + "layouts": 
{ + "md": [ + { + "h": 32, + "i": "9ArCu34uUJA22RCnncEKUk", + "moved": false, + "static": false, + "w": 8, + "x": 0, + "y": 0 + } + ], + "xs": [ + { + "h": 12, + "i": "9ArCu34uUJA22RCnncEKUk", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 0 + } + ] + }, + "title": "Customer Explorer", + "version": "1763990634442345384", + "weight": 20 + }, + { + "chartMap": { + "2FYeuKde8FNqTMhxYgtkA6": { + "chartSettings": { + "category": [ + { + "id": "v_id", + "type": "string" + } + ], + "tableHeaders": [ + "v_id", + "v_type", + "@purchase_count", + "id" + ], + "tableIndex": 0, + "value": [ + { + "id": "@purchase_count", + "type": "number" + } + ] + }, + "graphName": "Product_Recommendation", + "hideWidgetName": false, + "id": "2FYeuKde8FNqTMhxYgtkA6", + "patternLimit": 5, + "query": "INTERPRET QUERY get_purchases_by_category()\nFOR GRAPH Product_Recommendation\nSYNTAX V1 {\n\n // We’ll reuse this vertex-attached accumulator on multiple vertex types.\n // It will hold:\n // - on Product_Variant: number of purchases for that variant\n // - on Product: total purchases for that product\n // - on Product_Category:total purchases for that category\n SumAccum @purchase_count;\n\n // Start from all customers\n start = {Customer.*};\n\n // Customer --(Interacted)--> Product_Variant\n // Count how many purchase edges hit each variant.\n variants =\n SELECT pv\n FROM start:s -(Interacted:e)- Product_Variant:pv\n WHERE e.interaction_type == \"purchase\"\n ACCUM pv.@purchase_count += 1;\n\n // Product_Variant --(Belongs_To)-> Product\n // Roll up purchase counts from variants to products.\n prods =\n SELECT p\n FROM variants:pv -(Belongs_To:e)-> Product:p\n ACCUM p.@purchase_count += pv.@purchase_count;\n\n // Product --(Belongs_To)-> Product_Category\n // Roll up purchase counts from products to categories.\n categories =\n SELECT cat\n FROM prods:p -(Belongs_To:e)-> Product_Category:cat\n ACCUM cat.@purchase_count += p.@purchase_count;\n\n // Print categories\n PRINT categories;\n}\n", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Purchases By Product Category", + "type": "pie", + "version": "1763984583051629662" + }, + "82qGfNTjt6wBAxcx6qnhmp": { + "chartSettings": { + "category": [ + { + "id": "v_id", + "type": "string" + } + ], + "tableHeaders": [ + "v_id", + "v_type", + "@purchase_count", + "id", + "in_stock", + "price", + "timestamp" + ], + "tableIndex": 0, + "value": [ + { + "id": "@purchase_count", + "type": "number" + } + ] + }, + "graphName": "Product_Recommendation", + "hideWidgetName": false, + "id": "82qGfNTjt6wBAxcx6qnhmp", + "patternLimit": 5, + "query": "INTERPRET QUERY get_top_products_by_purchases()\nFOR GRAPH Product_Recommendation\nSYNTAX V1 {\n\n // Each Product_Variant will store how many times it was purchased.\n SumAccum @purchase_count;\n\n // Start from all customers\n start = {Customer.*};\n\n // Customer --(Interacted)--> Product_Variant\n // Count purchases per variant, then sort & limit in the SELECT.\n variants =\n SELECT pv\n FROM start:s -(Interacted:e)- Product_Variant:pv\n WHERE e.interaction_type == \"purchase\"\n ACCUM pv.@purchase_count += 1\n ORDER BY pv.@purchase_count DESC\n LIMIT 10; \n\n // Print the vertex set for Insights\n PRINT variants;\n}\n", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Top Products By Purchases", + "type": "bar", + "version": "1763985739818153518" + 
}, + "aGXWqVkdrcr1fJD3Nr2PZQ": { + "chartSettings": {}, + "graphName": "Product_Recommendation", + "hideWidgetName": false, + "id": "aGXWqVkdrcr1fJD3Nr2PZQ", + "patternLimit": 5, + "query": "INTERPRET QUERY get_total_purchase_events() FOR GRAPH Product_Recommendation {\n\n SumAccum @@total_purchases;\n\n start = {Customer.*};\n\n start =\n SELECT s FROM start:s -(Interacted:e)- Product_Variant:t\n WHERE e.interaction_type == \"purchase\"\n ACCUM\n @@total_purchases += 1;\n\n PRINT @@total_purchases AS total_purchase_events;\n}\n", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Total Purchases", + "type": "value", + "version": "1763983800972271547" + }, + "qiJvz1R3jtwheHY7n248fh": { + "chartSettings": {}, + "graphName": "Product_Recommendation", + "hideWidgetName": false, + "id": "qiJvz1R3jtwheHY7n248fh", + "patternLimit": 5, + "query": "INTERPRET QUERY get_total_customers() FOR GRAPH Product_Recommendation {\n start = {Customer.*};\n PRINT start.size();\n}\n", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Total Customers", + "type": "value", + "version": "1763983589358074731" + }, + "rtLCUDSijWCnyJ4VZKigVE": { + "chartSettings": {}, + "graphName": "Product_Recommendation", + "hideWidgetName": false, + "id": "rtLCUDSijWCnyJ4VZKigVE", + "patternLimit": 5, + "query": "INTERPRET QUERY get_total_products() FOR GRAPH Product_Recommendation {\n start = {Product.*};\n PRINT start.size();\n}", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Total Products", + "type": "value", + "version": "1763983674813496143" + } + }, + "globalParameters": {}, + "iconURL": "/insights/static/media/badge-info.d87dd45b19b490018f24dc6e82113abc.svg", + "id": "oPpn6XgAzT2Barjjzit6U5", + "isDetail": true, + "isNew": false, + "layouts": { + "md": [ + { + "h": 21, + "i": "2FYeuKde8FNqTMhxYgtkA6", + "moved": false, + "static": false, + "w": 3, + "x": 0, + "y": 9 + }, + { + "h": 36, + "i": "82qGfNTjt6wBAxcx6qnhmp", + "moved": false, + "static": false, + "w": 7, + "x": 3, + "y": 9 + }, + { + "h": 9, + "i": "aGXWqVkdrcr1fJD3Nr2PZQ", + "moved": false, + "static": false, + "w": 2, + "x": 4, + "y": 0 + }, + { + "h": 9, + "i": "qiJvz1R3jtwheHY7n248fh", + "moved": false, + "static": false, + "w": 2, + "x": 0, + "y": 0 + }, + { + "h": 9, + "i": "rtLCUDSijWCnyJ4VZKigVE", + "moved": false, + "static": false, + "w": 2, + "x": 2, + "y": 0 + } + ], + "xs": [ + { + "h": 9, + "i": "qiJvz1R3jtwheHY7n248fh", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 0 + }, + { + "h": 9, + "i": "rtLCUDSijWCnyJ4VZKigVE", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 9 + }, + { + "h": 9, + "i": "aGXWqVkdrcr1fJD3Nr2PZQ", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 18 + }, + { + "h": 21, + "i": "2FYeuKde8FNqTMhxYgtkA6", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 27 + }, + { + "h": 36, + "i": "82qGfNTjt6wBAxcx6qnhmp", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 48 + } + ] + }, + "title": "Overview", + "version": "1763994516788879761", + "weight": 10 + } + ], + "title": "Retail Insights", + "userRoleForApp": "owner", + "version": "1763982709365410423" +} \ No newline at end of file diff --git a/connected_customer/product_recommendations/meta/images/Product_Recommendation_Schema.png 
b/connected_customer/product_recommendations/meta/images/Product_Recommendation_Schema.png
new file mode 100644
index 00000000..914f73ba
Binary files /dev/null and b/connected_customer/product_recommendations/meta/images/Product_Recommendation_Schema.png differ
diff --git a/connected_customer/product_recommendations/queries/combine_features.gsql b/connected_customer/product_recommendations/queries/combine_features.gsql
new file mode 100644
index 00000000..7f3f71b4
--- /dev/null
+++ b/connected_customer/product_recommendations/queries/combine_features.gsql
@@ -0,0 +1,107 @@
+CREATE QUERY combine_features(
+    SET<STRING> hub_v_type,
+    SET<STRING> e_type,
+    STRING target_v_type="Customer",
+    SET<STRING> feature_v_type,
+    INT hub_threshold,
+    INT split_threshold) SYNTAX V2
+{
+  /*
+    Query Name: combine_features
+    Feature Co-Occurrence Grouping
+
+    Purpose:
+      Identify frequently co-occurring feature pairs around high-degree
+      feature vertices and collapse them into `Combined_Feature` vertices.
+
+    High-Level Flow:
+      1. Find hub vertices.
+      2. For each hub-feature pair connected through `target_v_type` vertices
+         (e.g., Customers), count how many targets they share.
+      3. Keep hub-feature pairs whose shared target count lies between 2 and
+         `split_threshold`, treating them as meaningful but not overly common
+         co-occurrences.
+      4. Combined feature creation:
+         For each selected pair and its associated targets:
+         - Create/attach a `Combined_Feature` vertex.
+         - Flag original hub/feature edges as `ignore_edge = TRUE`.
+         - Connect targets to the combined feature via `Has_Attribute`.
+         - Link original feature vertices to the combined feature via `Linked`.
+
+    Configuration:
+      `hub_v_type`, `feature_v_type`, `target_v_type`, and `e_type` define the
+      schema context, while `hub_threshold` and `split_threshold` control how aggressively
+      features are combined.
+
+    Output:
+      Inserts `Combined_Feature` vertices, `Has_Attribute` edges from targets to
+      combined features, `Linked` edges from original features to combined features,
+      and marks affected original edges as ignored.
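+
+    Example:
+      A hypothetical invocation, using vertex/edge type names from this kit's
+      schema; the threshold values are chosen only for illustration:
+        RUN QUERY combine_features(["Product_Attribute"], ["Has_Attribute"],
+                                   "Customer", ["Customer_Attribute"], 100, 10)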
+  */
+  TYPEDEF TUPLE <VERTEX v> Vertex_Tuple;
+  MapAccum<VERTEX, SumAccum<INT>> @intersection_size_map;
+  OrAccum @hub;
+  DATETIME timestamp = now();
+  SetAccum<STRING> @combined_feature_ids;
+  SetAccum<VERTEX> @combined_verts;
+
+  // pick hub vertices
+  hub_verts = {hub_v_type};
+  hub_verts =
+    SELECT s FROM hub_verts:s
+    WHERE s.outdegree(e_type) > hub_threshold
+    POST-ACCUM s.@hub += TRUE;
+
+  // count how many customers each pair of features shares
+  other_verts =
+    SELECT t FROM hub_verts:s -(e_type)- target_v_type -(e_type)- feature_v_type:t
+    WHERE t.@hub == FALSE OR getvid(s) > getvid(t)
+    ACCUM
+      s.@intersection_size_map += (t -> 1);
+
+  // mark (hub, feature) pairs with moderate overlap on each target vertex
+  // This section of the code cannot be made schema-free
+  hub_verts =
+    SELECT s FROM hub_verts:s -(e_type:e1)- target_v_type:v -(e_type:e2)- feature_v_type:t
+    WHERE (t.@hub == FALSE OR getvid(s) > getvid(t)) AND s.@intersection_size_map.get(t) BETWEEN 2 AND split_threshold
+    ACCUM
+      v.@combined_feature_ids += to_string(getvid(s)) + "_" + to_string(getvid(t)),
+      v.@combined_verts += s,
+      v.@combined_verts += t;
+
+  all_verts = hub_verts UNION other_verts;
+
+  // assign combined_feature_ids to each feature in pair (s, t)
+  hub_verts =
+    SELECT s FROM hub_verts:s -(e_type)- target_v_type -(e_type)- feature_v_type:t
+    WHERE (t.@hub == FALSE OR getvid(s) > getvid(t)) AND s.@intersection_size_map.get(t) BETWEEN 2 AND split_threshold
+    PER (s, t)
+    ACCUM
+      s.@combined_feature_ids += to_string(getvid(s)) + "_" + to_string(getvid(t)),
+      t.@combined_feature_ids += to_string(getvid(s)) + "_" + to_string(getvid(t));
+
+  // connect to combined features
+  all_verts =
+    SELECT s FROM all_verts:s -(e_type:e)- target_v_type:t
+    WHERE t.@combined_verts.contains(s)
+    ACCUM
+      e.ignore_edge = TRUE
+    POST-ACCUM (t)
+      FOREACH combined_feature_id IN t.@combined_feature_ids DO
+        INSERT INTO Has_Attribute VALUES (t, combined_feature_id Combined_Feature, timestamp, 1, _)
+      END
+    POST-ACCUM (s)
+      FOREACH combined_feature_id IN s.@combined_feature_ids DO
+        INSERT INTO Linked VALUES (s, combined_feature_id Combined_Feature, timestamp)
+      END;
+}
+
+UPDATE DESCRIPTION OF QUERY combine_features "This query concatenates feature vertices with a degree above a certain threshold. Feature vertex types in this case refer to Cluster, Customer_Attribute, Product_Attribute, and Product_Variant. If two feature vertices are merged, a Combined_Feature vertex gets created and the constituent features get linked to the Combined_Feature via the Linked edge type."
+
+UPDATE DESCRIPTION OF QUERY_PARAM combine_features.hub_v_type "These are the feature vertex types which can be considered for the purposes of combination."
+UPDATE DESCRIPTION OF QUERY_PARAM combine_features.e_type "These are the edge types which should be traversed in the query for the purposes of edge counting and combination."
+UPDATE DESCRIPTION OF QUERY_PARAM combine_features.target_v_type "This is the vertex type which adjoins two different feature vertices for the purpose of combination. Defaults to 'Customer'."
+UPDATE DESCRIPTION OF QUERY_PARAM combine_features.feature_v_type "These are the feature vertex types which the 'hub_v_types' should traverse to via the 'target_v_type' vertex type."
+UPDATE DESCRIPTION OF QUERY_PARAM combine_features.hub_threshold "If a feature vertex has more than this number of edges, it is considered a hub vertex and is eligible to be combined into a Combined_Feature vertex."
+UPDATE DESCRIPTION OF QUERY_PARAM combine_features.split_threshold "This number is the upper limit of neighbors that the source and target feature vertices may share in order to be combined into a Combined_Feature vertex."
+
diff --git a/connected_customer/product_recommendations/queries/k_means.gsql b/connected_customer/product_recommendations/queries/k_means.gsql
new file mode 100644
index 00000000..6cd5aaaa
--- /dev/null
+++ b/connected_customer/product_recommendations/queries/k_means.gsql
@@ -0,0 +1,274 @@
+CREATE QUERY k_means(
+    STRING v_type="Customer",
+    SET<STRING> attr_set,
+    INT min_cluster_count,
+    INT max_cluster_count,
+    INT cluster_inc=1,
+    INT random_iter_count=1,
+    INT conv_iter_limit=25,
+    FLOAT conv_threshold=0.1,
+    INT random_seed=42,
+    BOOL use_custom_timestamp=FALSE,
+    DATETIME custom_timestamp=to_datetime("1970-01-01")) SYNTAX V1
+{
+  /*
+    Query Name: k_means
+    K-Means Clustering over Vertex Attribute Vectors
+
+    Purpose:
+      Cluster vertices of type `v_type` using K-Means on normalized numeric
+      features taken from each vertex's `attr_map`. The query sweeps K from
+      `min_cluster_count` to `max_cluster_count` and picks the "best" cluster
+      count based on changes in sum of squared errors (SSE), approximating the
+      elbow point.
+
+    High-Level Flow:
+      1. Select a subset of numeric keys from `attr_map` and construct per-vertex
+         vectors.
+      2. For each candidate K in [min_cluster_count, max_cluster_count] (step
+         `cluster_inc`), run up to `random_iter_count` random initializations of
+         centroids.
+      3. Assign each vertex to its closest centroid, recompute centroids from
+         assigned members, and repeat until convergence.
+      4. Track SSE across K values and iterations, approximate the elbow via
+         second differences in SSE, and optionally merge nearly identical
+         centroids.
+
+    Configuration:
+      Parameters control the vertex type (`v_type`), feature subset (`attr_set`),
+      K range and convergence behavior (min/max_cluster_count, cluster_inc,
+      random_iter_count, conv_iter_limit, conv_threshold, random_seed), and
+      the timestamp used on `Cluster`/`In_Cluster` (`use_custom_timestamp`,
+      `custom_timestamp`).
+
+    Output:
+      Inserts `Cluster` vertices and `In_Cluster` edges for all clustered
+      vertices, then prints the number of clusters created.
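+
+    Example:
+      A hypothetical invocation that clusters customers on the "age" key of
+      their attr_map, trying 2 through 10 centroids; the remaining arguments
+      simply restate the defaults:
+        RUN QUERY k_means("Customer", ["[0, \"age\"]"], 2, 10, 1, 1, 25, 0.1,
+                          42, FALSE, "1970-01-01 00:00:00")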
+  */
+
+  TYPEDEF TUPLE <FLOAT distance, VERTEX centroid> Centroid_Distance;
+  TYPEDEF TUPLE <FLOAT sse_delta, INT cluster_count, INT iter> Cluster_Count_Tuple;
+  // accum to store the smallest distance
+  HeapAccum<Centroid_Distance> (1, distance ASC) @centroid_distance_heap;
+  MaxAccum<Cluster_Count_Tuple> @@max_sse_delta;
+  MapAccum<INT, MinAccum<Centroid_Distance>> @cluster_assignment;
+  MapAccum<VERTEX, MapAccum<INT, AvgAccum>> @@centroid_write_values;
+  MapAccum<VERTEX, ListAccum<FLOAT>> @@centroid_read_values;
+  MapAccum<INT, MaxAccum<VERTEX>> @@cluster_rep_verts;
+
+  GroupByAccum<INT cluster_count, INT iter, INT cluster_id, ListAccum<FLOAT> centroid_values> @@final_centroid_values;
+
+  MapAccum<INT, MinAccum<FLOAT>> @@cluster_count_sse_map;
+
+  SumAccum<FLOAT> @random_init, @@sse;
+  AvgAccum @avg_value;
+  SumAccum<INT> @random_base;
+  MaxAccum<INT> @cluster_id;
+
+  DATETIME cluster_timestamp = now();
+  IF use_custom_timestamp == TRUE THEN
+    cluster_timestamp = custom_timestamp;
+  END;
+
+  INT _mod, _mult, _inc;
+  _mod = pow(2, 31)-1;
+  _mult = 1664525;
+  _inc = 1013904223;
+
+  verts = {v_type};
+  INT feature_count = attr_set.size();
+
+  // build feature vectors
+  MapAccum<INT, MaxAccum<STRING>> @@attr_map;
+  ListAccum<FLOAT> @attr_values;
+  MaxAccum<FLOAT> @@max_attr_value;
+  MinAccum<FLOAT> @@min_attr_value;
+
+  DATETIME timestamp = now();
+
+  JSONARRAY arr;
+  // parse attr set features and store in arr
+  FOREACH attr IN attr_set DO
+    arr = parse_json_array(attr);
+    @@attr_map += (arr.getInt(0) -> arr.getString(1));
+  END;
+
+  STRING attr_name;
+  // compute normalized features for each vertex
+  FOREACH attr_idx IN RANGE[0, attr_set.size()-1] DO
+    attr_name = @@attr_map.get(attr_idx);
+    PRINT attr_name;
+    verts =
+      SELECT s FROM verts:s
+      ACCUM
+        FLOAT value = 0,
+        IF s.attr_map.containsKey(attr_name) THEN
+          value = s.attr_map.get(attr_name)
+        END,
+        @@max_attr_value += s.attr_map.get(attr_name),
+        @@min_attr_value += s.attr_map.get(attr_name)
+      POST-ACCUM
+        FLOAT denominator = @@max_attr_value - @@min_attr_value,
+        IF denominator <= 0 THEN
+          denominator = 1
+        END,
+        s.@attr_values += (s.attr_map.get(attr_name)-@@min_attr_value) / denominator,
+        s.@random_base = (((getvid(s)+_inc)+_mult*random_seed) % _mod),
+        s.@random_init = s.@random_base / (_mod * 1.0);
+    @@min_attr_value = GSQL_INT_MAX;
+    @@max_attr_value = GSQL_INT_MIN;
+  END;
+
+  // if two clusters converge, merge them and just stop there.
+  FLOAT merge_threshold=0.01;
+  GroupByAccum<INT cluster_count, INT iter, INT cluster_id, MaxAccum<INT> merge_centroid_id> @@merge_map;
+  SumAccum<INT> @@terminate_count;
+  INT terminate_threshold = 5;
+  // Centroid Initialization
+  FOREACH cluster_count IN RANGE[min_cluster_count, max_cluster_count].STEP(cluster_inc) DO
+    IF @@terminate_count >= terminate_threshold THEN
+      BREAK;
+    END;
+    FOREACH iter IN RANGE[0, random_iter_count-1] DO
+      IF @@terminate_count >= terminate_threshold THEN
+        BREAK;
+      END;
+      @@terminate_count = 0;
+      // CENTROID INITIALIZATION
+      centroids =
+        SELECT s FROM verts:s
+        // refresh the random number for each vertex
+        POST-ACCUM
+          s.@random_base = (s.@random_base * _mult + (getvid(s)+_inc)) % _mod,
+          s.@random_init = s.@random_base / (_mod * 1.0)
+        ORDER BY s.@random_init
+        LIMIT cluster_count;
+      // Choose k vertices uniformly at random to be initial centroids
+      centroids =
+        SELECT s FROM centroids:s
+        POST-ACCUM
+          FOREACH i IN RANGE[0, feature_count-1] DO
+            @@centroid_read_values += (s -> s.@attr_values.get(i) + s.@random_init)
+          END;
+
+      // PRINT cluster_count, iter, centroids, @@centroid_read_values;
+      @@sse = 0;
+      FLOAT last_sse = 1;
+      BOOL first_iter = TRUE;
+      // KMEANS ITERATION UNTIL CONVERGENCE
+      WHILE (abs(@@sse - last_sse) > conv_threshold OR first_iter == TRUE) AND last_sse > @@sse LIMIT conv_iter_limit DO
+        first_iter = FALSE;
+        last_sse = @@sse;
+        @@sse = 0;
+        verts =
+          SELECT s FROM verts:s
+          ACCUM
+            FOREACH (centroid, centroid_values) IN @@centroid_read_values DO
+              // compute distance
+              s.@centroid_distance_heap += Centroid_Distance(tg_similarity_accum(s.@attr_values, centroid_values, "EUCLIDEAN"), centroid)
+            END
+          POST-ACCUM
+            FOREACH i IN RANGE[0, feature_count-1] DO
+              @@centroid_write_values += (s.@centroid_distance_heap.top().centroid -> (i -> s.@attr_values.get(i)))
+            END,
+            // compute sum squared errors
+            @@sse += pow(s.@centroid_distance_heap.top().distance, 2);
+        // move the newly computed centroids back to the read values for the next iteration
+        @@centroid_read_values.clear();
+        centroids =
+          SELECT s FROM centroids:s
+          POST-ACCUM
+            FOREACH i IN RANGE[0, feature_count-1] DO
+              @@centroid_read_values += (s -> @@centroid_write_values.get(s).get(i))
+            END;
+        @@centroid_write_values.clear();
+      END;
+      // CLEAN UP
+      centroids =
+        SELECT s FROM centroids:s
+        POST-ACCUM
+          // compare against other centroids
+          // if too close, and self less than other cluster id, add to merge map
+          // for now, even a single overlap results in termination
+          // in the future, we might want to add a threshold for X overlaps
+          BOOL terminate = FALSE,
+          FOREACH (centroid, centroid_values) IN @@centroid_read_values DO
+            IF
+              centroid != s AND
+              getvid(s) < getvid(centroid) AND
+              tg_similarity_accum(@@centroid_read_values.get(s), centroid_values, "EUCLIDEAN") < merge_threshold
+            THEN
+              @@merge_map += (cluster_count, iter, getvid(s) -> getvid(centroid)),
+              @@terminate_count += 1,
+              terminate = TRUE,
+              BREAK
+            END
+          END,
+          IF terminate == FALSE THEN
+            @@final_centroid_values += (cluster_count, iter, getvid(s) -> @@centroid_read_values.get(s))
+          END;
+      @@centroid_read_values.clear();
+
+      verts =
+        SELECT s FROM verts:s
+        POST-ACCUM
+          // remember this vertex's assigned centroid and the corresponding SSE
+          s.@cluster_assignment += (cluster_count -> Centroid_Distance(@@sse, s.@centroid_distance_heap.top().centroid)),
+          s.@centroid_distance_heap.clear();
+      // store the sse for each vertex to compare across k values
+      @@cluster_count_sse_map += (cluster_count -> @@sse);
+      // approximate the elbow with the second difference
+      IF (cluster_count 
- min_cluster_count) > 2*cluster_inc THEN
+        FLOAT prev_sse_delta = @@cluster_count_sse_map.get(cluster_count-(2*cluster_inc)) - @@cluster_count_sse_map.get(cluster_count-(1*cluster_inc));
+        FLOAT next_sse_delta = @@cluster_count_sse_map.get(cluster_count-(1*cluster_inc)) - @@cluster_count_sse_map.get(cluster_count);
+        @@max_sse_delta += Cluster_Count_Tuple(next_sse_delta - prev_sse_delta, cluster_count-(1*cluster_inc), iter);
+      END;
+    END;
+  END;
+
+  rep_verts =
+    SELECT s FROM verts:s
+    ACCUM
+      INT cluster_id = getvid(s.@cluster_assignment.get(@@max_sse_delta.cluster_count).centroid),
+      IF @@merge_map.containsKey(
+          @@max_sse_delta.cluster_count,
+          @@max_sse_delta.iter,
+          cluster_id) THEN
+        cluster_id = @@merge_map.get(@@max_sse_delta.cluster_count, @@max_sse_delta.iter,cluster_id).merge_centroid_id
+      END,
+      s.@cluster_id = cluster_id,
+      @@cluster_rep_verts += (cluster_id -> s)
+    POST-ACCUM
+      INSERT INTO In_Cluster VALUES (s, s.@cluster_id, cluster_timestamp, _)
+    HAVING
+      @@cluster_rep_verts.get(s.@cluster_id) == s;
+
+  rep_verts =
+    SELECT s FROM rep_verts:s
+    POST-ACCUM
+      // for each cluster_id create a cluster vertex using the stored centroid values
+      INSERT INTO Cluster VALUES (
+        s.@cluster_id,
+        @@final_centroid_values.get(
+          @@max_sse_delta.cluster_count,
+          @@max_sse_delta.iter,
+          s.@cluster_id
+        ).centroid_values
+      );
+
+  PRINT "Created Cluster Count:", rep_verts.size();
+}
+
+UPDATE DESCRIPTION OF QUERY k_means "This query clusters vertices according to some subset of their numerical attributes using the KMeans algorithm, which is a vector quantization clustering algorithm. The algorithm iteratively attempts to cluster the vertices using an incremental number of centroids and then selects the centroid assignment which results in the best sum of squared errors, such that the clusters are not under or overfit to the dataset. The algorithm inserts the centroids as Cluster vertices and links all vertices to their respective centroids via the In_Cluster edge type."
+UPDATE DESCRIPTION OF QUERY_PARAM k_means.v_type "This is the target vertex type to cluster. It defaults to Customer."
+UPDATE DESCRIPTION OF QUERY_PARAM k_means.attr_set "This is a SET of JSON-formatted strings where each string represents a tuple containing an index and the key of a numerical value contained in the MAP attribute, 'attr_map', on the target vertex type. Example of ONE of the strings in attr_set: `[0, \"age\"]`"
+UPDATE DESCRIPTION OF QUERY_PARAM k_means.min_cluster_count "This is the minimum number of centroids to try out when clustering."
+UPDATE DESCRIPTION OF QUERY_PARAM k_means.max_cluster_count "This is the maximum number of centroids to try out when clustering. Must be greater than or equal to min_cluster_count."
+UPDATE DESCRIPTION OF QUERY_PARAM k_means.cluster_inc "This is the value by which the centroid count is incremented when moving onto the next centroid count. Defaults to 1."
+UPDATE DESCRIPTION OF QUERY_PARAM k_means.random_iter_count "This is the number of random initializations to attempt for a given centroid count. Defaults to 1."
+UPDATE DESCRIPTION OF QUERY_PARAM k_means.conv_iter_limit "This is the maximum number of iterations to perform for a given centroid count. Defaults to 25."
+UPDATE DESCRIPTION OF QUERY_PARAM k_means.conv_threshold "This is the minimum change in SSE during a clustering attempt before the algorithm considers a clustering to have converged. 
Defaults to 0.1" +UPDATE DESCRIPTION OF QUERY_PARAM k_means.random_seed "This is a seed to pseudorandomize the selection of centroids for a given centroid count." +UPDATE DESCRIPTION OF QUERY_PARAM k_means.use_custom_timestamp "This is the timestamp attributed to the Cluster vertices. If you wish to use a custom timestamp, set this to TRUE. Defaults to FALSE." +UPDATE DESCRIPTION OF QUERY_PARAM k_means.custom_timestamp "If 'use_custom_timestamp' is set to TRUE, then this value will be used as the timestamp for the Cluster vertices. Defaults to 'to_datetime(\"1970-01-01\")'" + diff --git a/connected_customer/product_recommendations/queries/queries.gsql b/connected_customer/product_recommendations/queries/recommend_products.gsql similarity index 56% rename from connected_customer/product_recommendations/queries/queries.gsql rename to connected_customer/product_recommendations/queries/recommend_products.gsql index 44fce1c4..f6a9e6d1 100644 --- a/connected_customer/product_recommendations/queries/queries.gsql +++ b/connected_customer/product_recommendations/queries/recommend_products.gsql @@ -1,294 +1,3 @@ -CREATE QUERY k_means( - STRING v_type="Customer", - SET attr_set, - INT min_cluster_count, - INT max_cluster_count, - INT cluster_inc=1, - INT random_iter_count=1, - INT conv_iter_limit=25, - FLOAT conv_threshold=0.1, - INT random_seed=42, - BOOL use_custom_timestamp=FALSE, - DATETIME custom_timestamp=to_datetime("1970-01-01")) SYNTAX V1 -{ - TYPEDEF TUPLE Centroid_Distance; - TYPEDEF TUPLE Cluster_Count_Tuple; - HeapAccum (1, distance ASC) @centroid_distance_heap; - MaxAccum @@max_sse_delta; - MapAccum> @cluster_assignment; - MapAccum> @@centroid_write_values; - MapAccum> @@centroid_read_values; - MapAccum> @@cluster_rep_verts; - - GroupByAccum centroid_values> @@final_centroid_values; - - MapAccum> @@cluster_count_sse_map; - - ListAccum @attr_values; - SumAccum @random_init, @@sse; - AvgAccum @avg_value; - SumAccum @random_base; - MaxAccum @cluster_id; - MaxAccum @@max_attr_value; - MinAccum @@min_attr_value; - - DATETIME cluster_timestamp = now(); - IF use_custom_timestamp == TRUE THEN - cluster_timestamp = custom_timestamp; - END; - - INT _mod, _mult, _inc; - _mod = pow(2, 31)-1; - _mult = 1664525; - _inc = 1013904223; - - verts = {v_type}; - INT feature_count = attr_set.size(); - - MapAccum> @@attr_map; - - DATETIME timestamp = now(); - - JSONARRAY arr; - FOREACH attr IN attr_set DO - arr = parse_json_array(attr); - @@attr_map += (arr.getInt(0) -> arr.getString(1)); - END; - - STRING attr_name; - FOREACH attr_idx IN RANGE[0, attr_set.size()-1] DO - attr_name = @@attr_map.get(attr_idx); - PRINT attr_name; - verts = - SELECT s FROM verts:s - ACCUM - FLOAT value = 0, - IF s.attr_map.containsKey(attr_name) THEN - value = s.attr_map.get(attr_name) - END, - @@max_attr_value += s.attr_map.get(attr_name), - @@min_attr_value += s.attr_map.get(attr_name) - POST-ACCUM - FLOAT denominator = @@max_attr_value - @@min_attr_value, - IF denominator <= 0 THEN - denominator = 1 - END, - s.@attr_values += (s.attr_map.get(attr_name)-@@min_attr_value) / denominator, - s.@random_base = (((getvid(s)+_inc)+_mult*random_seed) % _mod), - s.@random_init = s.@random_base / (_mod * 1.0); - @@min_attr_value = GSQL_INT_MAX; - @@max_attr_value = GSQL_INT_MIN; - END; - - // if two clusters converge, merge them and just stop there. 
- FLOAT merge_threshold=0.01; - GroupByAccum merge_centroid_id> @@merge_map; - SumAccum @@terminate_count; - INT terminate_threshold = 5; - - FOREACH cluster_count IN RANGE[min_cluster_count, max_cluster_count].STEP(cluster_inc) DO - IF @@terminate_count >= terminate_threshold THEN - BREAK; - END; - FOREACH iter IN RANGE[0, random_iter_count-1] DO - IF @@terminate_count >= terminate_threshold THEN - BREAK; - END; - @@terminate_count = 0; - // CENTROID INITIALIZATION - centroids = - SELECT s FROM verts:s - POST-ACCUM - s.@random_base = (s.@random_base * _mult + (getvid(s)+_inc)) % _mod, - s.@random_init = s.@random_base / (_mod * 1.0) - ORDER BY s.@random_init - LIMIT cluster_count; - centroids = - SELECT s FROM centroids:s - POST-ACCUM - FOREACH i IN RANGE[0, feature_count-1] DO - @@centroid_read_values += (s -> s.@attr_values.get(i) + s.@random_init) - END; - - // PRINT cluster_count, iter, centroids, @@centroid_read_values; - @@sse = 0; - FLOAT last_sse = 1; - BOOL first_iter = TRUE; - // KMEANS ITERATION UNTIL COVNVERGENCE - WHILE (abs(@@sse - last_sse) > conv_threshold OR first_iter == TRUE) AND last_sse > @@sse LIMIT conv_iter_limit DO - first_iter = FALSE; - last_sse = @@sse; - @@sse = 0; - verts = - SELECT s FROM verts:s - ACCUM - FOREACH (centroid, centroid_values) IN @@centroid_read_values DO - // compute distance - s.@centroid_distance_heap += Centroid_Distance(tg_similarity_accum(s.@attr_values, centroid_values, "EUCLIDEAN"), centroid) - END - POST-ACCUM - FOREACH i IN RANGE[0, feature_count-1] DO - @@centroid_write_values += (s.@centroid_distance_heap.top().centroid -> (i -> s.@attr_values.get(i))) - END, - @@sse += pow(s.@centroid_distance_heap.top().distance, 2); - @@centroid_read_values.clear(); - centroids = - SELECT s FROM centroids:s - POST-ACCUM - FOREACH i IN RANGE[0, feature_count-1] DO - @@centroid_read_values += (s -> @@centroid_write_values.get(s).get(i)) - END; - @@centroid_write_values.clear(); - END; - // CLEAN UP - centroids = - SELECT s FROM centroids:s - POST-ACCUM - // compare against other centroids - // if too close, and self less than other cluster id, add to merge map - // for now, even a single overlap results in termination, in the future, we might want to add a threshold for X overlaps - BOOL terminate = FALSE, - FOREACH (centroid, centroid_values) IN @@centroid_read_values DO - IF - centroid != s AND - getvid(s) < getvid(centroid) AND - tg_similarity_accum(@@centroid_read_values.get(s), centroid_values, "EUCLIDEAN") < merge_threshold - THEN - @@merge_map += (cluster_count, iter, getvid(s) -> getvid(centroid)), - @@terminate_count += 1, - terminate = TRUE, - BREAK - END - END, - IF terminate == FALSE THEN - @@final_centroid_values += (cluster_count, iter, getvid(s) -> @@centroid_read_values.get(s)) - END; - @@centroid_read_values.clear(); - verts = - SELECT s FROM verts:s - POST-ACCUM - s.@cluster_assignment += (cluster_count -> Centroid_Distance(@@sse, s.@centroid_distance_heap.top().centroid)), - s.@centroid_distance_heap.clear(); - @@cluster_count_sse_map += (cluster_count -> @@sse); - - IF (cluster_count - min_cluster_count) > 2*cluster_inc THEN - FLOAT prev_sse_delta = @@cluster_count_sse_map.get(cluster_count-(2*cluster_inc)) - @@cluster_count_sse_map.get(cluster_count-(1*cluster_inc)); - FLOAT next_sse_delta = @@cluster_count_sse_map.get(cluster_count-(1*cluster_inc)) - @@cluster_count_sse_map.get(cluster_count); - @@max_sse_delta += Cluster_Count_Tuple(next_sse_delta - prev_sse_delta, cluster_count-(1*cluster_inc), iter); - END; - END; - END; - 
- rep_verts = - SELECT s FROM verts:s - ACCUM - INT cluster_id = getvid(s.@cluster_assignment.get(@@max_sse_delta.cluster_count).centroid), - IF @@merge_map.containsKey( - @@max_sse_delta.cluster_count, - @@max_sse_delta.iter, - cluster_id) THEN - cluster_id = @@merge_map.get(@@max_sse_delta.cluster_count, @@max_sse_delta.iter,cluster_id).merge_centroid_id - END, - s.@cluster_id = cluster_id, - @@cluster_rep_verts += (cluster_id -> s) - POST-ACCUM - INSERT INTO In_Cluster VALUES (s, s.@cluster_id, cluster_timestamp, _) - HAVING - @@cluster_rep_verts.get(s.@cluster_id) == s; - - rep_verts = - SELECT s FROM rep_verts:s - POST-ACCUM - INSERT INTO Cluster VALUES (s.@cluster_id, @@final_centroid_values.get(@@max_sse_delta.cluster_count, @@max_sse_delta.iter, s.@cluster_id).centroid_values); - - PRINT "Created Cluster Count:", rep_verts.size(); -} - -UPDATE DESCRIPTION OF QUERY k_means "This query clusters vertices according to some subset of their numerical attributes using the KMeans algorithm, which is a vector quantization clustering algorithm. The algorithm iteratively attempts to cluster the vertices using an incremental number of centroids and then selects the centroid assignment which results in the best sum of squared errors, such that the clusters are not under or overfit to the dataset. The algorithm inserts the centroids as Cluster vertices and links all vertices to their respective centroids via the In_Cluster edge type." -UPDATE DESCRIPTION OF QUERY_PARAM k_means.v_type "This is the target vertex type to cluster. It defaults to Customer." -UPDATE DESCRIPTION OF QUERY_PARAM k_means.attr_set "This is a JSON-formatted string representing a list of tuples containing an index and the key of a numerical value contained in the MAP attribute, 'attr_map', on the target vertex type. Example: '[0, \"Age\"]'" -UPDATE DESCRIPTION OF QUERY_PARAM k_means.min_cluster_count "This is the minimum number of centroids to try out when clustering." -UPDATE DESCRIPTION OF QUERY_PARAM k_means.max_cluster_count "This is the maximum number of centroids to try out when clustering. Must be greater than or equal to min_cluster_count." -UPDATE DESCRIPTION OF QUERY_PARAM k_means.cluster_inc "This is the value by which the centroid count is incremented when moving onto the next centroid count. Defaults to 1." -UPDATE DESCRIPTION OF QUERY_PARAM k_means.random_iter_count "The is the number of random initializations to attempt for a given centroid count. Defaults to 1." -UPDATE DESCRIPTION OF QUERY_PARAM k_means.conv_iter_limit "This is the maximum number of iterations to perform for a given centroid count. Defaults to 25." -UPDATE DESCRIPTION OF QUERY_PARAM k_means.conv_threshold "This is the minimum change in SSE during a clustering attempt before the algorithm considers a clustering to have converged. Defaults to 0.1" -UPDATE DESCRIPTION OF QUERY_PARAM k_means.random_seed "This is a seed to pseudorandomize the selection of centroids for a given centroid count." -UPDATE DESCRIPTION OF QUERY_PARAM k_means.use_custom_timestamp "This is the timestamp attributed to the Cluster vertices. If you wish to use a custom timestamp, set this to TRUE. Defaults to FALSE." -UPDATE DESCRIPTION OF QUERY_PARAM k_means.custom_timestamp "If 'use_custom_timestamp' is set to TRUE, then this value will be used as the timestamp for the Cluster vertices. 
Defaults to 'to_datetime(\"1970-01-01\")'" - - -CREATE QUERY combine_features( - SET hub_v_type, - SET e_type, - STRING target_v_type="Customer", - SET feature_v_type, - INT hub_threshold, - INT split_threshold) SYNTAX V2 -{ - TYPEDEF TUPLE Vertex_Tuple; - MapAccum> @intersection_size_map; - OrAccum @hub; - DATETIME timestamp = now(); - SetAccum @combined_feature_ids; - SetAccum @combined_verts; - - hub_verts = {hub_v_type}; - hub_verts = - SELECT s FROM hub_verts:s - WHERE s.outdegree(e_type) > hub_threshold - POST-ACCUM s.@hub += TRUE; - - other_verts = - SELECT t FROM hub_verts:s -(e_type)- target_v_type -(e_type)- feature_v_type:t - WHERE t.@hub == FALSE OR getvid(s) > getvid(t) - ACCUM - s.@intersection_size_map += (t -> 1); - - // This section of the code cannot be made schema-free - hub_verts = - SELECT s FROM hub_verts:s -(e_type:e1)- target_v_type:v -(e_type:e2)- feature_v_type:t - WHERE (t.@hub == FALSE OR getvid(s) > getvid(t)) AND s.@intersection_size_map.get(t) BETWEEN 2 AND split_threshold - ACCUM - v.@combined_feature_ids += to_string(getvid(s)) + "_" + to_string(getvid(t)), - v.@combined_verts += s, - v.@combined_verts += t; - - all_verts = hub_verts UNION other_verts; - - hub_verts = - SELECT s FROM hub_verts:s -(e_type)- target_v_type -(e_type)- feature_v_type:t - WHERE (t.@hub == FALSE OR getvid(s) > getvid(t)) AND s.@intersection_size_map.get(t) BETWEEN 2 AND split_threshold - PER (s, t) - ACCUM - s.@combined_feature_ids += to_string(getvid(s)) + "_" + to_string(getvid(t)), - t.@combined_feature_ids += to_string(getvid(s)) + "_" + to_string(getvid(t)); - - all_verts = - SELECT s FROM all_verts:s -(e_type:e)- target_v_type:t - WHERE t.@combined_verts.contains(s) - ACCUM - e.ignore_edge = TRUE - POST-ACCUM (t) - FOREACH combined_feature_id IN t.@combined_verts DO - INSERT INTO Has_Attribute VALUES (t, combined_feature_id Combined_Feature, timestamp, 1, _) - END - POST-ACCUM (s) - FOREACH combined_feature_id IN s.@combined_feature_ids DO - INSERT INTO Linked VALUES (s, combined_feature_id Combined_Feature, timestamp) - END; -} - -UPDATE DESCRIPTION OF QUERY combine_features "This query concatenates feature vertices with a degree above a certain threshold. Feature vertex types in this case refer to Cluster, Customer_Attribute, Product_Attribute, and Product_Variant. If two feature vertices are merged, a Combined_Feature vertex gets created and the constituent features get linked to the Combined_Feature via the Linked edge type." - -UPDATE DESCRIPTION OF QUERY_PARAM combine_features.hub_v_type "These are the feature vertex types which can be considered for the purposes of combination." -UPDATE DESCRIPTION OF QUERY_PARAM combine_features.e_type "These are the edge types which should be traversed in the query for the purposes of edge counting and combination." -UPDATE DESCRIPTION OF QUERY_PARAM combine_features.target_v_type "This is the vertex type which adjoins two different feature vertices for the purpose of combination. Defaults to 'Customer'." -UPDATE DESCRIPTION OF QUERY_PARAM combine_features.feature_v_type "These are the feature vertex types which the 'hub_v_types' should traverse to via the 'target_v_type' vertex type." -UPDATE DESCRIPTION OF QUERY_PARAM combine_features.hub_threshold "If a feature vertex has more than this number of edges, it is considered a hub vertex and is eligible to be combined into a Combined_Feature vertex." 
-UPDATE DESCRIPTION OF QUERY_PARAM combine_features.split_threshold "This number is the upper limit of neighbors that the source and target feature vertices may share in order to be combined into a Combined_Feature vertex." - - CREATE DISTRIBUTED QUERY recommend_products( SET> src_customer_input, INT batch_index, @@ -302,6 +11,36 @@ CREATE DISTRIBUTED QUERY recommend_products( FLOAT customer_popularity_scale = 1, FLOAT item_popularity_scale = 1) SYNTAX V1 { + + /* + Query Name: recommend_products + Hybrid Customer-Item Collaborative Filtering Recommender + + Purpose: + For each `Customer` vertex, generate a ranked list of `Product_Variant` + recommendations using a combination of: + . customer-customer similarity. + . item-item co-occurrence (co-purchasing behavior). + + High-Level Flow: + . Select source customers + . Build customer-customer similarity from feature overlaps, weighted by + interaction strength, edge importance, and feature popularity. + . Derive item popularity per customer from similar customers and + co-occurring items. + . (Optionally reweight by category importance and vertex-degree scaling.) + . Keep up to `recommendation_count` best items per customer. + + Configuration: + Parameters that control which customers are processed (src_customer_input, + batch_index/num_batches), filtering and list length (ignore_threshold, + recommendation_count), and how schema/weights are applied + (data_types, edge_importance_factors, vertex_degree_scales, + customer_popularity_scale, item_popularity_scale). + + Output: + For each `Customer` vertex, returns a list of `Product_Variant` recommendations + */ TYPEDEF TUPLE Item_Tuple; HeapAccum (recommendation_count, score DESC, item DESC) @recommended_items; MapAccum> @sum_intersection_size, @@sum_set_size; @@ -322,6 +61,7 @@ CREATE DISTRIBUTED QUERY recommend_products( JSONARRAY arr = parse_json_array(data_types); JSONARRAY inner_arr; JSONARRAY entry; + // parse types out of json array FOREACH i IN RANGE[0, arr.size()-1] DO entry = arr.getJSONArray(i); // Customer/Item, Feature/Target, Edge/Vertex -> Type @@ -342,7 +82,7 @@ CREATE DISTRIBUTED QUERY recommend_products( END; src_customers (ANY) = {src_customer_input}; - + // select customers if no input was provided IF src_customers.size() == 0 THEN all_customers (ANY) = {Customer.*}; @@ -372,6 +112,7 @@ CREATE DISTRIBUTED QUERY recommend_products( previous_items = SELECT t FROM previous_items:s -(reverse_Belongs_To)- Product_Variant:t ACCUM + // avoid recommendations to the customers since they already bought the variants t.@ignore += s.@temp_set POST-ACCUM s.@temp_set.clear(); @@ -399,12 +140,14 @@ CREATE DISTRIBUTED QUERY recommend_products( SELECT s FROM included_customers:s -(@@e_types:e)- @@v_types:t WHERE @@vertex_degree_scales.contains(t.type) ACCUM + // measure feature popularity @@max_value += (t.type -> t.outdegree(@@data_types.get("Customer", "Target", "Edge").types)), @@min_value += (t.type -> t.outdegree(@@data_types.get("Customer", "Target", "Edge").types)); @@e_types = @@data_types.get("Customer", "Feature", "Edge").types; @@v_types = @@data_types.get("Customer", "Feature", "Vertex").types; features = + // calculate per customer feature weights SELECT t FROM included_customers:s -(@@e_types:e)- @@v_types:t ACCUM FLOAT sim_score = 1, @@ -425,17 +168,20 @@ CREATE DISTRIBUTED QUERY recommend_products( @@e_types = @@data_types.get("Customer", "Target", "Edge").types; @@v_types = @@data_types.get("Customer", "Target", "Vertex").types; + // convert feature overlaps into 
customer-customer similarity other_customers = SELECT t FROM features:s -(@@e_types:e)- @@v_types:t ACCUM t.@sum_intersection_size += s.@sum_similarity POST-ACCUM (t) + //weighted Jaccard similarity between customers FOREACH (tgt, score) IN t.@sum_intersection_size DO FLOAT div = (@@sum_set_size.get(t) + @@sum_set_size.get(tgt) - score), IF div <= 0 THEN div = 1.0 END, + //Jaccard similarity stored here t.@sum_similarity += (tgt -> score*1.0/div) END, t.@sum_intersection_size.clear() @@ -499,11 +245,13 @@ CREATE DISTRIBUTED QUERY recommend_products( @@e_types = @@data_types.get("Item", "Feature", "Edge").types; @@v_types = @@data_types.get("Item", "Feature", "Vertex").types; + // find customers connected to seed items customers (ANY) = SELECT t FROM interacted_items:s -(@@e_types)- @@v_types:t; @@e_types = @@data_types.get("Item", "Target", "Edge").types; @@v_types = @@data_types.get("Item", "Target", "Vertex").types; + // from these customers find other items other_items = SELECT t FROM customers:s -(@@e_types)- @@v_types:t; diff --git a/connected_customer/product_recommendations/setup.sh b/connected_customer/product_recommendations/setup.sh new file mode 100644 index 00000000..8688b726 --- /dev/null +++ b/connected_customer/product_recommendations/setup.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set -e + +# Graph name for this solution kit +GRAPH_NAME="Product_Recommendation" + +echo "=== Creating schema for $GRAPH_NAME ===" +gsql schema/schema.gsql + +echo "=== Loading sample data into $GRAPH_NAME ===" +gsql loading_job/load_jobs.gsql + +echo "=== Installing queries for $GRAPH_NAME ===" +QUERY_DIR="./queries" + +# Run each .gsql query file against the graph +for file in "$QUERY_DIR"/*.gsql; do + echo "Running $file..." + gsql -g "$GRAPH_NAME" "$file" +done + +echo "Installing all queries on $GRAPH_NAME..." +gsql --graph "$GRAPH_NAME" INSTALL QUERY ALL + +echo "=== Setup complete for $GRAPH_NAME ===" +echo "You can now open the GSQL console and run, for example:" +echo " USE GRAPH Product_Recommendation" +echo " RUN QUERY recommend_products({}, 0, 1)" diff --git a/financial_crime/application_fraud/README.md b/financial_crime/application_fraud/README.md new file mode 100644 index 00000000..d213a052 --- /dev/null +++ b/financial_crime/application_fraud/README.md @@ -0,0 +1,719 @@ +# Application Fraud (TigerGraph Solution Kit) + +A TigerGraph solution kit for detecting **application fraud** and uncovering +fraud rings using shared Personally Identifiable Information (PII) and **entity resolution**. + +This project provisions a complete graph environment to analyze how applications are connected via devices, IPs, +addresses, and other identity signals. + +With this kit you can: + +- Link applications into fraud rings based on shared PII. +- Judge a new application's proximity to known fraud. +- Quantify the impact of graph features with uplift metrics and other insights +- Generate rich graph features for downstream fraud ML models. 
+
+---
+
+## Contents
+
+- [Overview](#overview)
+- [Features](#features)
+- [Prerequisites](#prerequisites)
+- [Setup Instructions](#setup-instructions)
+- [Installation Note for Queries](#installation-note-for-queries)
+- [Query Execution Order and Explanations](#query-execution-order-and-explanations)
+- [Run an Example Query](#run-an-example-query)
+- [Using Your Own Data](#using-your-own-data)
+- [Resetting the Environment](#resetting-the-environment)
+- [ML Model and Insights Application](#ml-model-and-insights-application)
+
+---
+
+## Overview
+
+Application fraud detection focuses on spotting suspicious credit applications
+before they are approved, when:
+
+- Multiple applications share the same PII.
+- Applications are part of a larger, tightly connected ring.
+- New applications appear "close" in the graph to known fraud.
+
+Relational databases struggle to track these many-to-many, multi-hop
+relationships at scale. TigerGraph's native graph engine is well suited
+because it offers:
+
+**Fast traversals.** Move from an application through its PII and back to other
+applications in just a few hops.
+
+**Community detection.** Group related applications into communities that
+represent fraud rings or clusters of highly related applications.
+
+**Real-time signals.** Compute graph-based fraud features on demand for
+scoring new applications as they arrive.
+
+In this solution kit, we model a fraud detection environment with:
+
+1. Application vertices
+2. PII vertices such as name, DOB, email, phone, address, IP, ID, device, etc.
+3. Location vertices
+4. Connected_Component vertices representing communities
+5. Product vertices linked to applications.
+
+You can use the included queries as building blocks for fraud analytics,
+operational alerting, and model feature generation.
+
+> **Graph name:** This kit creates and uses a graph named `Application_Fraud`.
+
+---
+
+## Features
+
+- **Application Fraud Graph Environment**
+
+This kit includes a ready-to-use `Application_Fraud` graph schema, sample data,
+and a loading job that pulls this data from a public S3 bucket into TigerGraph.
+The loaded data is identical to the data found in the `data/` folder of the
+kit.
+
+- **Entity resolution pipeline**
+
+Use queries to determine whether different records with possibly different and
+overlapping details actually refer to the same real-world entity - that is, the
+same person, account, or application.
+
+We do this by creating a pipeline that computes similarity between
+applications based on shared PII, groups related applications, and then
+supports incremental matching of new applications in real time.
+
+- **Graph feature and proximity queries**
+
+Queries that generate:
+  - Component-level features (ring size, number of fraud apps, distinct PII).
+  - Distances and paths from a given application to known fraud.
+  - CSV feature files for ML training.
+
+- **Insights analysis**
+
+Queries that take confusion-matrix counts from your ML model and calculate:
+  - Precision, recall, specificity, and NPV.
+  - Detected and missed fraud amounts.
+  - Net financial gain and relative performance uplift.
+
+- **Automated setup script**
+  `setup.sh` creates the graph, loads the data, and installs all queries in one go.
+
+---
+## Prerequisites
+
+Before you run this solution kit, make sure you have:
+
+- **A running TigerGraph instance**
+  - TigerGraph installed and running, or use the prebuilt kit on TG cloud.
+  - You must have permission to create graphs and run GSQL commands.
+
+- **GSQL client access**
+  - The `gsql` command-line tool available on the same machine/container where you cloned this repo.
+  - Ability to connect to your TigerGraph service.
+
+- **Network access for sample data**
+  - Outbound internet access from the TigerGraph machine to read the sample
+    CSV files from the public S3 bucket used in `loading_job/load_data.gsql`.
+
+- **Shell environment**
+  - A Unix-like shell (Linux, macOS, or WSL) to run `setup.sh` and `queries/install_queries.sh`.
+  - Executable permissions for the scripts:
+    ```bash
+    chmod +x setup.sh
+    chmod +x queries/install_queries.sh
+    ```
+
+> **Graph name:** This kit creates and uses a graph named
+> `Application_Fraud`.
+
+# Setup Instructions
+The following instructions assume that you run these scripts on a machine
+where the `gsql` command is installed.
+
+If you don't yet have the `gsql` command available, see the TigerGraph documentation:
+
+- **Local GSQL shell on the server**
+  [The GSQL Shell](https://docs.tigergraph.com/tigergraph-server/4.2/gsql-shell/)
+  (explains how to run `gsql` directly on a TigerGraph server)
+
+- **Remote GSQL client (from your laptop or another machine)**
+  [Using a Remote GSQL Client](https://docs.tigergraph.com/tigergraph-server/4.2/gsql-shell/using-a-remote-gsql-client)
+  (explains how to download the GSQL client JAR, configure SSL, and create a `gsql` alias)
+
+Ensure that the script is executable with:
+```bash
+chmod +x setup.sh
+```
+Then, run the automated script using:
+```bash
+./setup.sh
+```
+This script will create the schema, load all the mock data into the graph,
+and install all available queries.
+
+To verify that all queries were installed successfully, run:
+```bash
+gsql -g Application_Fraud "SHOW QUERY *"
+```
+
+## Installation Note for Queries
+
+When installing the queries, two of them require special attention:
+
+- `match_application_entities` and `incremental_application_match`.
+
+These queries take a weight for each PII attribute, plus a threshold parameter, used by the weighted WCC matching process:
+
+  - FLOAT Application_Full_Name_weight = 0.2
+  - FLOAT Application_DOB_weight = 0.2
+  - FLOAT Application_Email_weight = 0.2
+  - FLOAT Application_Phone_weight = 0.2
+  - FLOAT Application_Address_weight = 0.2
+  - FLOAT Application_IP_weight = 0.2
+  - FLOAT Application_ID_weight = 1.0
+  - FLOAT Application_Device_weight = 1.0
+  - FLOAT Application_Party_weight = 1.0
+  - FLOAT Application_Account_weight = 1.0
+  - FLOAT Application_Card_weight = 1.0
+  - FLOAT threshold = 1.0
+
+When an Application matches another Application through one or more shared PII
+attributes, the corresponding weights accumulate, and any accumulated total at
+or above the threshold is considered a match. For example, if device ID has a
+weight of 1.0 and the threshold is 1.0, then a single matching device ID
+between two Applications will place those Applications in the same
+Connected_Component community. If device ID instead has a weight of 0.5 and
+the threshold is 1.0, then at least two shared device IDs would be required
+for those Applications to match. Default weights and thresholds are assigned
+to these queries, but you should adapt them if your business rules for
+matching differ.
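+
+To make the arithmetic concrete, here is a small worked sketch (the two
+applications and the attributes they share are hypothetical; the weights are
+the defaults listed above):
+
+```
+Applications A and B share:           score contribution
+  email                               +0.2  (Application_Email_weight)
+  phone                               +0.2  (Application_Phone_weight)
+  address                             +0.2  (Application_Address_weight)
+                                      ----
+  accumulated score = 0.6 < threshold (1.0)   ->  no match
+
+If A and B also share a device ID:    +1.0  (Application_Device_weight)
+  accumulated score = 1.6 >= threshold (1.0)  ->  Same_Application edge created
+```
+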
+## Query Execution Order and Explanations
+
+### Step 1: Insert Edges between Matching Applications
+
+After loading the historical Application and PII data and configuring the WCC weights for matching, run `match_application_entities` to perform matching across the entire graph.
+
+### Step 2: Form Communities
+
+Run `unify_application_entities` to associate all matched Applications from Step 1 into a Connected_Component community.
+
+The remaining queries should be run only after these two have completed.
+
+### Step 3: Feature Engineering Queries
+
+The feature engineering queries generate feature values used to train and test the downstream ML model. These queries aggregate the following graph features into bulk CSV output for all Applications in the graph:
+
+`batch_application_cc_features`
+
+Application, Application.is_fraud, Application Connected_Component, Application nodes in Connected_Component, fraud Application nodes in Connected_Component, distinct PII nodes in Connected_Component, number of Application nodes in Connected_Component connected by PII, and number of Application nodes in Connected_Component connected only by PII. PII includes Name, DOB, Email, Phone, Address, IP, ID, Device, Party, Account, and Card vertices.
+
+`batch_application_distance_and_path`
+
+Application, Application.is_fraud, Application Connected_Component, Fraud Application, Fraud Application Connected_Component, Degree of Connection from Application to Fraud Application, and Path of Connection from Application to Fraud Application.
+
+### Step 4: Add New Applications and Detect Fraud in Real-Time
+
+As new Applications are submitted, they can be added to the graph and associated with a Connected_Component community in near real time using the `incremental_application_match` query, which accepts a JSON payload containing all Application and PII data, including Name, DOB, Email, Phone, Address, IP, ID, Device, Party, Account, and Card vertices, as shown in the raw request example below. Use the same weight and threshold settings here as in `match_application_entities` so that matching behavior stays consistent.
+
+**Raw Request Example**
+
+```http
+GET http://18.217.164.69:14240/restpp/query/Application_Fraud/incremental_application_match
+```
+
+The JSON payload below is passed, URL-encoded, as the query's `input` parameter:
+
+```json
+{
+  "application": "6f9e1ca5-e60c-48fd-a1c1-03d13348f888",
+  "created_at": "2024-04-17 11:15:26.076265",
+  "status": "740e989a-3614-427b-b3b0-21fc96ce2fc4",
+  "line_of_credit": "9884.82",
+  "annual_percentage_rate": "36.06",
+  "fraud": true,
+  "product": "414882cd-3fcb-42ef-8ee6-0423ce526124",
+  "name": "3a58b7ce-33cb-46c2-ab65-527e1334927b",
+  "dob": "a1d4472b-0f3c-4465-82a0-e2f889138989",
+  "email": "18baaf68-12de-4291-8fae-8df7f2bf6026",
+  "phone_numbers": [
+    {
+      "type": "mobile",
+      "id": "5ed411af-e38c-4e97-b657-25017695148a"
+    },
+    {
+      "type": "landline",
+      "id": "60669163-67e7-4e0e-a46b-2c25bdd0ee6b"
+    }
+  ],
+  "addresses": [
+    {
+      "type": "mailing",
+      "line_1": "61d2bd45-57f1-4b3c-98a9-265cbcefdead",
+      "line_2": "ed7020ae-35ef-4aae-a13d-3513222fcc50",
+      "city": "fd1e10fb-91e4-4ecf-9ec1-afb2280dfbb5",
+      "state": "694d083b-7006-4561-a2c1-913d80831ba7",
+      "zipcode": "5b8eafe4-01c0-4b1e-8024-595ac6969a7c",
+      "county": "fb4b12a7-bc0e-47cc-852f-ff7da7e61d66",
+      "country": "US"
+    },
+    {
+      "type": "physical",
+      "line_1": "8f1a04ed-ef4a-4324-8321-be3fe85f17da",
+      "line_2": "e9a58e29-68cd-4e35-a43b-f28914c56e5a",
+      "city": "13cf8f10-814b-4db6-930d-c0a162ef24f1",
+      "state": "d8d7e179-5bb7-4cde-8fb5-97f0e9e55bde",
+      "zipcode": "ff1456dd-b2f8-4c55-9dcc-e2b13b42d05b",
+      "county": "6ba0eec9-4e72-45f1-91aa-1fd246d7400c",
+      "country": "US"
+    }
+  ],
+  "ip_address": "83f7dc2f-928c-48a7-b157-0c50572071aa",
+  "ids": [
+    {
+      "type": "Passport",
+      "id": "36f26ab2-35ba-4361-b719-964ae23283b4"
+    },
+    {
+      "type": "Driver's License Number",
+      "id": "8cefe5da-9f83-4013-a1e8-2e59108726c7"
+    }
+  ],
+  "device_id": "7a1f0167-1004-4d6c-8411-e38e414a6129",
+  "party": "00cb01dc-e3e3-492d-b1fe-7c225ba3dde1",
+  "accounts": [
+    {
+      "type": "savings",
+      "id": "fb08e35b-b428-4840-8800-c0f841e7650e"
+    },
+    {
+      "type": "checking",
+      "id": "3f41189c-b55b-4ec5-b073-182f9f87ceee"
+    }
+  ],
+  "cards": [
+    {
+      "type": "credit_card",
+      "id": 241556
+    },
+    {
+      "type": "debit_card",
+      "id": 278916
+    }
+  ]
+}
+```
+
+**Response**
+
+```
+{'Application': '6f9e1ca5-e60c-48fd-a1c1-03d13348f888', 'entity_resolution': True}
+```
+
+If the `entity_resolution` field of the response is `True`, the incoming Application was successfully matched against the historical Application dataset using the matching logic.
+The `distance_and_path_to_fraud_application` and `get_application_cc_features` queries can then be called to retrieve the graph features that augment the downstream ML decision model in near real time.
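+
+For reference, here is one way to send the raw request above from the command
+line (a sketch: the host and port come from the example, `payload.json` is a
+hypothetical file holding the JSON payload shown above, and authentication is
+omitted):
+
+```bash
+# Send the JSON payload as the URL-encoded `input` parameter of the query.
+curl -s -G \
+  'http://18.217.164.69:14240/restpp/query/Application_Fraud/incremental_application_match' \
+  --data-urlencode "input@payload.json"
+```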
+**Check Distance to Fraud** + +```http +GET http://18.217.164.69:14240/restpp/query/Application_Fraud/distance_and_path_to_fraud_application +``` + +**Response** + +```json +{ + "version": { + "edition": "enterprise", + "api": "v2", + "schema": 59 + }, + "error": false, + "message": "", + "results": [ + { + "result": [ + { + "v_id": "31aeadde-d12a-486d-b2b4-a217daa9404d", + "v_type": "Application", + "attributes": { + "input_application_id": "fa881c0a-adff-42d5-adf5-19de0cbe3547", + "input_application_id_cc": "675282944", + "degree_of_connection": 2, + "is_fraud": true, + "application_id": "31aeadde-d12a-486d-b2b4-a217daa9404d", + "application_id_cc": "675282944", + "path_of_connection": [ + "Device" + ] + } + } + ] + } + ] +} +``` + +**Get Application Connected Component Features** + +```http +GET http://18.217.164.69:14240/restpp/query/Application_Fraud/get_application_cc_features +``` + +**Response** + +```json +{ + "version": { + "edition": "enterprise", + "api": "v2", + "schema": 59 + }, + "error": false, + "message": "", + "results": [ + { + "result": [ + { + "v_id": "31aeadde-d12a-486d-b2b4-a217daa9404d", + "v_type": "Application", + "attributes": { + "connectedComponentID": 643825664, + "total_cc_nodes": 2, + "fraud_applications_in_cc": 2, + "name_distinct_in_cc": 2, + "name_number_nodes": 0, + "name_number_nodes_only_connected": 0, + "dob_distinct_in_cc": 2, + "dob_number_nodes": 0, + "dob_number_nodes_only_connected": 0, + "email_distinct_in_cc": 2, + "email_number_nodes": 0, + "email_number_nodes_only_connected": 0, + "phone_distinct_in_cc": 4, + "phone_number_nodes": 0, + "phone_number_nodes_only_connected": 0, + "address_distinct_in_cc": 4, + "address_number_nodes": 0, + "address_number_nodes_only_connected": 0, + "ip_distinct_in_cc": 2, + "ip_number_nodes": 0, + "ip_number_nodes_only_connected": 0, + "id_distinct_in_cc": 4, + "id_number_nodes": 0, + "id_number_nodes_only_connected": 0, + "device_distinct_in_cc": 1, + "device_number_nodes": 0, + "device_number_nodes_only_connected": 0, + "party_distinct_in_cc": 0, + "party_number_nodes": 0, + "party_number_nodes_only_connected": 0, + "account_distinct_in_cc": 4, + "account_number_nodes": 0, + "account_number_nodes_only_connected": 0, + "card_distinct_in_cc": 4, + "card_number_nodes": 0, + "card_number_nodes_only_connected": 0 + } + }, + { + "v_id": "fa881c0a-adff-42d5-adf5-19de0cbe3547", + "v_type": "Application", + "attributes": { + "connectedComponentID": 643825664, + "total_cc_nodes": 2, + "fraud_applications_in_cc": 2, + "name_distinct_in_cc": 2, + "name_number_nodes": 0, + "name_number_nodes_only_connected": 0, + "dob_distinct_in_cc": 2, + "dob_number_nodes": 0, + "dob_number_nodes_only_connected": 0, + "email_distinct_in_cc": 2, + "email_number_nodes": 0, + "email_number_nodes_only_connected": 0, + "phone_distinct_in_cc": 4, + "phone_number_nodes": 0, + "phone_number_nodes_only_connected": 0, + "address_distinct_in_cc": 4, + "address_number_nodes": 0, + "address_number_nodes_only_connected": 0, + "ip_distinct_in_cc": 2, + "ip_number_nodes": 0, + "ip_number_nodes_only_connected": 0, + "id_distinct_in_cc": 4, + "id_number_nodes": 0, + "id_number_nodes_only_connected": 0, + "device_distinct_in_cc": 1, + "device_number_nodes": 0, + "device_number_nodes_only_connected": 0, + "party_distinct_in_cc": 0, + "party_number_nodes": 0, + "party_number_nodes_only_connected": 0, + "account_distinct_in_cc": 4, + "account_number_nodes": 0, + "account_number_nodes_only_connected": 0, + "card_distinct_in_cc": 4, + "card_number_nodes": 0, 
+ "card_number_nodes_only_connected": 0 + } + } + ] + } + ] +} +``` + +If entity_resolution is False the incoming Application was not able to be matched with the historical Application dataset and the same respective Application should be sent to `incremental_unify_entity` query so it's eligible for matching against future Applications in near real time. + + + Here is a write-up of +some of the queries along with their uses: + +--- + +### 1. Entity Resolution & Fraud Ring Construction + +**match_application_entities(...)** +Creates a `Same_Application` similarity edge if the computed score of +application similarity exceeds a given threshold by comparing applications +using weighted matches on shared PII. + +--- + +**unify_application_entities()** +Scans the applications that are linked by `Same_Application` similarity edges, +and groups them into a shared `Connected_Component` vertex, and connects each +member via `Application_In_Ring`. Pairwise similarity links are turned into +"fraud ring" clusters. + +--- + +**incremental_application_match(input JSON, weights..., threshold)** +Scores a *single* new or updated application in near real time. It parses a +JSON payload containing application attributes + PII, upserts it into the graph, and +compares it against existing applications to see if it is part of the +`Connected_Component` or an existing fraud ring. + +--- + +**incremental_application_unify(SET\)** +Given a set of Application vertices, this query walks their existing +Same_Application (if any) links to determine which applications belong together, +ensures there is a Connected_Component vertex for each group, and creates the +corresponding Application_In_Ring edges. + +--- + +**delete_all_application_cc_connections(num_of_batches, batch_id)** +Deletes all `Application_In_Ring` edges in batches. Use this before re-running +a full entity-resolution cycle so that you can rebuild communities from +scratch without dropping the whole graph. + +--- + +**delete_unused_cc_nodes(num_of_batches, batch_id)** +Cleans up `Connected_Component` vertices that no longer have any edges. +This keeps the graph lean and avoids clutter from obsolete components. + +--- + +**output_application_cc_to_file(output_file_path)** +Exports a simple mapping of `Application` -> `Connected_Component` into a CSV +file. This is useful for joining graph-based communities back into downstream +systems such as your warehouse, feature store, or BI dashboards. + +--- + +### 2. Graph Features for Machine Learning + +**batch_application_cc_features(connections, output_file_path)** +Generates a CSV of connected-component features for every `Application`. +For each application's community, it counts distinct PII nodes and how many +applications are connected via each PII type. Use this to build offline +training datasets capturing the structure and density of each fraud ring. + +--- + +**batch_application_distance_and_path(depth, output_file_path)** +For every `Application`, finds the closest fraud application reachable via +shared PII within a maximum hop distance, and writes out features such as: +degree of connection (number of hops), the specific fraud counterpart, and a +path description. These features can further be used for ML training. 
+
+---
+
+**get_application_cc_features(application, connections)**
+Computes graph features for the connected component of a single Application.
+Returns a structured result you can use directly at scoring time, with
+`connections` acting as a cap to filter out very high-degree PII hubs.
+
+---
+
+**get_application_fraud_status(SET\<VERTEX\<Application\>\> applications)**
+Given a set of application vertices, returns the `is_fraud` boolean label for
+each one.
+
+---
+
+**set_application_fraud_status(application, fraud_status)**
+Updates the `is_fraud` attribute for a specific `Application`. Use this to push
+investigation outcomes or model decisions back into the graph so that future
+graph features and proximity queries are based on the latest fraud labels.
+
+---
+
+### 3. Fraud Proximity & Case Investigation
+
+**distance_and_path_to_fraud_application(application, depth)**
+Starting from a single `Application`, this query traverses through shared PII
+to find any reachable fraud applications within a given hop limit. It returns
+the distance in hops, the matching fraud applications, their communities, and
+the path of connection.
+
+---
+
+**distance_and_path_to_fraud_application_vis(application, min_depth, max_depth)**
+Finds fraud applications reachable from the input Application between min_depth
+and max_depth hops, then collects the vertices and edges on those paths.
+Returns a subgraph that can be rendered in TigerGraph visualization.
+
+---
+
+**find_shared_piis_of_two_applications(...)**
+Highlights the PII elements shared between two selected applications,
+such as a common device, IP, phone, or address.
+
+---
+
+**get_top_k_connected_components_by_num_applications(top_k)**
+Returns the `Connected_Component` communities with the most applications attached,
+along with their member applications. Use this to find your largest rings.
+
+---
+
+### 4. Portfolio & Product Insights
+
+**get_num_applications_by_app_status()**
+Counts applications by their status (e.g., PENDING, APPROVED, DECLINED).
+Use this to view your onboarding pipeline and see how many applications sit in each decision bucket.
+
+---
+
+**get_num_applications_by_fraud_status()**
+Returns the total number of fraudulent versus legitimate applications.
+
+---
+
+**get_top_k_products_by_num_applications(top_k, input_application_fraud_status, input_application_status)**
+Returns the top products ranked by number of connected applications,
+optionally filtered by application fraud status and/or application status.
+
+---
+
+**get_top_k_products_by_num_applications_with_other(top_k, input_application_fraud_status, input_application_status)**
+Provides a "top-k + Other" breakdown: the top products by number of applications
+plus an aggregated "Other Products" bucket.
+
+---
+
+### 5. Business Impact & Uplift Analytics
+
+These queries work together to quantify how much value you get from using graph
+features (TigerGraph) versus your baseline ML model or rules. They take
+confusion-matrix counts *with* and *without* graph features and turn them
+into uplift and dollar amounts.
+
+---
+
+**insights_get_application_count_by_binary_classification(...)**
+Takes confusion-matrix counts for two model runs (a baseline model and a model
+augmented with graph features) and returns a side-by-side table plus totals.
+
+---
+
+**insights_get_binary_classification_ratios(...)**
+Computes standard model performance metrics (precision, recall, specificity,
+negative predictive value) for both scenarios and expresses them as percentages.
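+
+For reference, the ratios are computed with the standard confusion-matrix
+formulas (these match the `utils_get_*` helper queries listed below):
+
+```
+precision                 = TP / (TP + FP)
+recall (sensitivity)      = TP / (TP + FN)
+specificity               = TN / (TN + FP)
+negative predictive value = TN / (TN + FN)
+```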
+
+---
+
+**insights_get_detected_fraud_amount(no_tg_true_positive, tg_true_positive, avg_loss_per_app_fraud)**
+Given an average monetary loss per fraudulent application, this query converts
+true positives into "detected fraud dollars" with and without graph features.
+
+---
+
+**insights_get_missed_fraud_amount(no_tg_false_negative, tg_false_negative, avg_loss_per_app_fraud)**
+Calculates how much fraud slips through undetected in each scenario. This
+highlights the reduction in missed fraud when graph features improve recall.
+
+---
+
+**insights_get_net_gain_numbers_and_percentages(...)**
+Computes the *counts* and *percentages* of: additional fraud blocked, net
+reduction in missed fraud, and net reduction in false positives.
+
+---
+
+### 6. Metric Utility Functions
+
+**utils_get_precision(true_positive, false_positive)**
+**utils_get_recall(true_positive, false_negative)**
+**utils_get_specificity(true_negative, false_positive)**
+**utils_get_negative_predictive_value(true_negative, false_negative)**
+
+---
+
+## Run an Example Query
+
+Once the schema and data are loaded, you can inspect graph-based features for a single application using `get_application_cc_features`.
+
+1. Pick an `Application` ID from `data/application_fraud.csv` (application IDs are UUIDs, such as the one used below).
+2. From a shell where `gsql` is available, run:
+
+```bash
+gsql -g Application_Fraud 'RUN QUERY get_application_cc_features("7da7048b-16f3-46f0-bdf4-a1ddd5c8005a", 25000)'
+```
+
+This returns a JSON-style record with the application's connected-component
+ID, fraud count in its ring, and PII-based graph features that you can feed
+directly into an ML model or rules engine.
+
+## Using Your Own Data
+
+By default, this solution kit loads **mock application and product data** from a
+TigerGraph-hosted public S3 bucket, using the loading job defined in
+`loading_job/load_data.gsql`. The two CSV files it loads are:
+
+- `application_fraud.csv`
+- `products.csv`
+
+TigerGraph supports multiple loading options. For detailed, up-to-date examples,
+please refer to the official documentation:
+
+- **Data loading overview** - supported sources and general workflow
+  https://docs.tigergraph.com/tigergraph-server/4.2/data-loading/data-loading-overview
+
+- **Data loading entry page** - index of all loading methods (local files, cloud storage, warehouses, Spark, etc.)
+  https://docs.tigergraph.com/tigergraph-server/4.2/data-loading/
+
+- **Load data from cloud storage** - step-by-step guide for Amazon S3, Google Cloud Storage, and Azure Blob Storage
+  https://docs.tigergraph.com/tigergraph-server/4.2/data-loading/load-from-cloud
+
+## Resetting the Environment
+
+Sometimes you'll want to wipe the graph data and start fresh - for example, when
+switching to a new dataset or after updating the schema and loading logic.
+
+This repository provides a utility query to delete all data, and a separate
+reset script for more advanced workflows.
+
+To delete all data, run:
+
+```bash
+gsql reset/reset.gsql
+```
+
+## ML Model and Insights Application
+
+The Insights applications are available as JSON files in the `meta` folder.
+ diff --git a/financial_crime/application_fraud/queries/batch_application_cc_features.gsql b/financial_crime/application_fraud/queries/batch_application_cc_features.gsql index 3976d124..c9f20ad2 100644 --- a/financial_crime/application_fraud/queries/batch_application_cc_features.gsql +++ b/financial_crime/application_fraud/queries/batch_application_cc_features.gsql @@ -1,5 +1,16 @@ CREATE OR REPLACE DISTRIBUTED QUERY batch_application_cc_features(INT connections=25000, STRING output_file_path = "/home/tigergraph/gsql_output/batch_application_cc_features.csv") { - +/* + Description: + Runs a batch export of connected-component PII connectivity + features for each Application vertex and writes them to a CSV + file for downstream fraud analytics. + + Parameters: + connections: + Maximum outdegree for a PII vertex when counting shared links; + output_file_path: + CSV Output file path +*/ MapAccum @@connect_component, @@distinct_name, @@distinct_dob, @@distinct_email, @@distinct_phone, @@distinct_address, @@distinct_ip, @@distinct_id, @@distinct_device, @@distinct_party, @@distinct_account, @@distinct_card, @@connected_via_name, @@connected_via_dob, @@connected_via_email, @@connected_via_phone, @@connected_via_address, @@connected_via_ip, @@connected_via_id, @@connected_via_device, @@connected_via_party, @@connected_via_account, @@connected_via_card, @@only_connected_via_name, @@only_connected_via_dob, @@only_connected_via_email, @@only_connected_via_phone, @@only_connected_via_address, @@only_connected_via_ip, @@only_connected_via_id, @@only_connected_via_device, @@only_connected_via_party, @@only_connected_via_account, @@only_connected_via_card; OrAccum @connected_via_name, @connected_via_dob, @connected_via_email, @connected_via_phone, @connected_via_address, @connected_via_ip, @connected_via_id, @connected_via_device, @connected_via_party, @connected_via_account, @connected_via_card; SetAccum @middle_type; diff --git a/financial_crime/application_fraud/queries/batch_application_distance_and_path.gsql b/financial_crime/application_fraud/queries/batch_application_distance_and_path.gsql index 8c4aa344..0fa2729e 100644 --- a/financial_crime/application_fraud/queries/batch_application_distance_and_path.gsql +++ b/financial_crime/application_fraud/queries/batch_application_distance_and_path.gsql @@ -1,5 +1,20 @@ CREATE DISTRIBUTED QUERY batch_application_distance_and_path(INT depth=5, STRING output_file_path = "/home/tigergraph/gsql_output/batch_application_distance_and_path_features.csv") { +/* + Description: + For each Application, exports to CSV the shortest-hop distance + and PII-based connection path to any reachable fraudulent + Applications, along with their connected-component ids. + Parameters: + depth: + Maximum number of hops from the source Application when + traversing via PII links. + output_file_path: + Absolute path of the CSV output file. 
+ + Output: + CSV of app-fraud distances and PII paths +*/ MapAccum, VERTEX> @@cc_map; MaxAccum @dis; MaxAccum @visited; diff --git a/financial_crime/application_fraud/queries/delete_all_application_cc_connections.gsql b/financial_crime/application_fraud/queries/delete_all_application_cc_connections.gsql index 3ef93cd8..56d9a61c 100644 --- a/financial_crime/application_fraud/queries/delete_all_application_cc_connections.gsql +++ b/financial_crime/application_fraud/queries/delete_all_application_cc_connections.gsql @@ -1,5 +1,20 @@ CREATE OR REPLACE DISTRIBUTED QUERY delete_all_application_cc_connections(INT num_of_batches = 1, INT batch_id = 0) { +/* + Description: + Deletes Application_In_Ring edges linking Application vertices to + Connected_Component vertices in batches. + Parameters: + num_of_batches: + Total number of partitions to divide Application vertices into. + The query must be executed once per batch_id from 0 to + num_of_batches - 1 to cover all vertices. + batch_id: + Index of the current batch to process (0-based). + Output: + A summary message indicating how many Application_In_Ring + edges were deleted for the current batch. +*/ SumAccum @@count; temp = SELECT t diff --git a/financial_crime/application_fraud/queries/delete_unused_cc_nodes.gsql b/financial_crime/application_fraud/queries/delete_unused_cc_nodes.gsql index 04be56a2..e57b295f 100644 --- a/financial_crime/application_fraud/queries/delete_unused_cc_nodes.gsql +++ b/financial_crime/application_fraud/queries/delete_unused_cc_nodes.gsql @@ -1,5 +1,20 @@ CREATE OR REPLACE DISTRIBUTED QUERY delete_unused_cc_nodes(INT num_of_batches = 1, INT batch_id = 0) { - +/* + Description: + Deletes unused Connected_Component vertices in batches + + Parameters: + num_of_batches: + Total number of partitions to divide Connected_Component vertices + into. The query must be executed once per batch_id from 0 to + num_of_batches - 1 to cover all vertices. + batch_id: + Index of the current batch to process (0-based). + + Output: + Prints a summary message indicating how many unused + Connected_Component vertices were deleted for the current batch. +*/ SumAccum @@count; start = {Connected_Component.*}; diff --git a/financial_crime/application_fraud/queries/distance_and_path_to_fraud_application.gsql b/financial_crime/application_fraud/queries/distance_and_path_to_fraud_application.gsql index 88956c59..0b3dff1f 100644 --- a/financial_crime/application_fraud/queries/distance_and_path_to_fraud_application.gsql +++ b/financial_crime/application_fraud/queries/distance_and_path_to_fraud_application.gsql @@ -1,6 +1,22 @@ CREATE OR REPLACE QUERY distance_and_path_to_fraud_application(VERTEX input, INT depth=5) { +/* + Description: + From an input Application, finds fraudulent Applications reachable + within a given hop limit and returns distance, PII-based path, + and connected-component ids. - MapAccum> @@cc_map; + Parameters: + input: + Source Application vertex. + depth: + Max hop distance from input (default 5). + + Output: + One row per reachable fraud Application with + ids, hop distance, and PII path from input. 
+*/ + + MapAccum> @@cc_map; MinAccum @dis; OrAccum @visited; SetAccum> @@fraud_app; diff --git a/financial_crime/application_fraud/queries/distance_and_path_to_fraud_application_vis.gsql b/financial_crime/application_fraud/queries/distance_and_path_to_fraud_application_vis.gsql index 32182b2d..d1e986b9 100644 --- a/financial_crime/application_fraud/queries/distance_and_path_to_fraud_application_vis.gsql +++ b/financial_crime/application_fraud/queries/distance_and_path_to_fraud_application_vis.gsql @@ -1,4 +1,22 @@ CREATE OR REPLACE DISTRIBUTED QUERY distance_and_path_to_fraud_application_vis(VERTEX input, INT min_depth = 2, INT max_depth = 5) { + +/* + Description: + From an input Application, finds fraudulent Applications within a hop + range and returns both summary rows and a subgraph for visualization. + + Parameters: + input: + Source Application vertex. + min_depth: + Min hop distance for fraud matches. + max_depth: + Max hop distance from input (default 5). + + Output: + Result sets: fraud rows plus vertices/edges + for visualization. +*/ TYPEDEF TUPLE Edge_Info; MapAccum> @@cc_map; MinAccum @dis; diff --git a/financial_crime/application_fraud/queries/find_shared_piis_of_two_applications.gsql b/financial_crime/application_fraud/queries/find_shared_piis_of_two_applications.gsql index e4f92322..07b3e73c 100644 --- a/financial_crime/application_fraud/queries/find_shared_piis_of_two_applications.gsql +++ b/financial_crime/application_fraud/queries/find_shared_piis_of_two_applications.gsql @@ -1,4 +1,18 @@ CREATE OR REPLACE QUERY find_shared_piis_of_two_applications(VERTEX application_1, VERTEX application_2) { +/* + Description: + Given two Applications, finds shared PII vertices and returns their + type, value, and degree (outdegree). + + Parameters: + application_1: + First Application vertex used to search for shared PII vertices. + application_2: + Second Application vertex used to search for shared PII vertices. + + Output: + Result set of (pii_type, pii_value, degree) for PII shared by both apps. +*/ TYPEDEF TUPLE pii_info; ListAccum @@degrees_of_shared_piis; diff --git a/financial_crime/application_fraud/queries/get_application_cc_features.gsql b/financial_crime/application_fraud/queries/get_application_cc_features.gsql index 5eea27fb..9f64f58d 100644 --- a/financial_crime/application_fraud/queries/get_application_cc_features.gsql +++ b/financial_crime/application_fraud/queries/get_application_cc_features.gsql @@ -1,5 +1,19 @@ CREATE OR REPLACE QUERY get_application_cc_features(VERTEX application, INT connections=25000) { - +/* + Description: + For an input Application, computes connected-component-level PII + connectivity features for use in near real-time fraud analysis. + + Parameters: + application: + Source Application whose CC features are + being computed. + connections: + Max PII vertex outdegree when traversing; + + Output: + Summary of application CC size, fraud count, and PII connectivity. 
+ */ MapAccum @@connect_component, @@distinct_name, @@distinct_dob, @@distinct_email, @@distinct_phone, @@distinct_address, @@distinct_ip, @@distinct_id, @@distinct_device, @@distinct_party, @@distinct_account, @@distinct_card, @@connected_via_name, @@connected_via_dob, @@connected_via_email, @@connected_via_phone, @@connected_via_address, @@connected_via_ip, @@connected_via_id, @@connected_via_device, @@connected_via_party, @@connected_via_account, @@connected_via_card, @@only_connected_via_name, @@only_connected_via_dob, @@only_connected_via_email, @@only_connected_via_phone, @@only_connected_via_address, @@only_connected_via_ip, @@only_connected_via_id, @@only_connected_via_device, @@only_connected_via_party, @@only_connected_via_account, @@only_connected_via_card; OrAccum @connected_via_name, @connected_via_dob, @connected_via_email, @connected_via_phone, @connected_via_address, @connected_via_ip, @connected_via_id, @connected_via_device, @connected_via_party, @connected_via_account, @connected_via_card; SetAccum @middle_type; diff --git a/financial_crime/application_fraud/queries/get_num_applications_by_app_status.gsql b/financial_crime/application_fraud/queries/get_num_applications_by_app_status.gsql index 6039ab0a..e491f5a1 100644 --- a/financial_crime/application_fraud/queries/get_num_applications_by_app_status.gsql +++ b/financial_crime/application_fraud/queries/get_num_applications_by_app_status.gsql @@ -1,4 +1,15 @@ CREATE OR REPLACE DISTRIBUTED QUERY get_num_applications_by_app_status () { +/* + Description: + Counts Applications grouped by status + (e.g. PENDING, APPROVED). + + Parameters: + (None) + + Output: + Summary of application counts by status value. +*/ TYPEDEF TUPLE App_Status_Info; ListAccum @@app_status_info_list; MapAccum> @@app_status_count_map; diff --git a/financial_crime/application_fraud/queries/get_num_applications_by_fraud_status.gsql b/financial_crime/application_fraud/queries/get_num_applications_by_fraud_status.gsql index 4b0697c0..38131b2b 100644 --- a/financial_crime/application_fraud/queries/get_num_applications_by_fraud_status.gsql +++ b/financial_crime/application_fraud/queries/get_num_applications_by_fraud_status.gsql @@ -1,4 +1,9 @@ CREATE OR REPLACE DISTRIBUTED QUERY get_num_applications_by_fraud_status () { +/* + Description: + Counts all Application vertices by fraud status and returns the + total number of fraudulent and legitimate applications. +*/ TYPEDEF TUPLE Fraud_Status_Info; ListAccum @@fraud_status_info_list; SumAccum @@num_fraud_applications; diff --git a/financial_crime/application_fraud/queries/get_top_k_connected_components_by_num_applications.gsql b/financial_crime/application_fraud/queries/get_top_k_connected_components_by_num_applications.gsql index 6eaabf47..7cf0358c 100644 --- a/financial_crime/application_fraud/queries/get_top_k_connected_components_by_num_applications.gsql +++ b/financial_crime/application_fraud/queries/get_top_k_connected_components_by_num_applications.gsql @@ -1,6 +1,19 @@ CREATE OR REPLACE DISTRIBUTED QUERY get_top_k_connected_components_by_num_applications ( INT top_k = 10 ) { +/* + Description: + Finds top-k Connected_Component vertices by number of Applications and + returns components, their applications, and edges for visualization. + + Parameters: + top_k: + Number of top components to return + (default 10). + + Output: + Summary of top-k components, apps, and CC edges. 
+*/ TYPEDEF TUPLE cc_vertex, INT cc_id, INT num_applications> Connected_Component_Info; HeapAccum(top_k, num_applications DESC, cc_id ASC) @@top_cc_heap; SetAccum> @@top_k_connected_components_set; @@ -40,4 +53,4 @@ CREATE OR REPLACE DISTRIBUTED QUERY get_top_k_connected_components_by_num_applic UPDATE DESCRIPTION OF QUERY get_top_k_connected_components_by_num_applications "This query returns the top top_k connected components and their applications, ranked by the number of applications it connects to." -UPDATE DESCRIPTION OF QUERY_PARAM get_top_k_connected_components_by_num_applications.top_k "The number of top connected components we want to return for this query." \ No newline at end of file +UPDATE DESCRIPTION OF QUERY_PARAM get_top_k_connected_components_by_num_applications.top_k "The number of top connected components we want to return for this query." diff --git a/financial_crime/application_fraud/queries/get_top_k_products_by_num_applications.gsql b/financial_crime/application_fraud/queries/get_top_k_products_by_num_applications.gsql index e2664adb..0889d3fb 100644 --- a/financial_crime/application_fraud/queries/get_top_k_products_by_num_applications.gsql +++ b/financial_crime/application_fraud/queries/get_top_k_products_by_num_applications.gsql @@ -3,6 +3,23 @@ CREATE OR REPLACE DISTRIBUTED QUERY get_top_k_products_by_num_applications ( STRING input_application_fraud_status = "", STRING input_application_status = "" ) { +/* + Description: + Finds top-k Products by number of linked Applications, with optional + filters on application fraud status and status. + + Parameters: + top_k: + Number of top Products to return + (default 10). + input_application_fraud_status: + Optional fraud-status filter for + input_application_status: + Optional Application.status filter + + Output: + Summary of top-k Products with application counts and basic attributes. +*/ TYPEDEF TUPLE < VERTEX product_vertex, STRING product_id, @@ -86,4 +103,4 @@ UPDATE DESCRIPTION OF QUERY get_top_k_products_by_num_applications "This query r UPDATE DESCRIPTION OF QUERY_PARAM get_top_k_products_by_num_applications.top_k "The number of top products we want to return for this query." UPDATE DESCRIPTION OF QUERY_PARAM get_top_k_products_by_num_applications.input_application_fraud_status "(Optional) Filter for fraud status of an application. Defaults to '' (no filter)." -UPDATE DESCRIPTION OF QUERY_PARAM get_top_k_products_by_num_applications.input_application_status "(Optional) Filter for the status of an application (case-sensitive). Defaults to '' (no filter)." \ No newline at end of file +UPDATE DESCRIPTION OF QUERY_PARAM get_top_k_products_by_num_applications.input_application_status "(Optional) Filter for the status of an application (case-sensitive). Defaults to '' (no filter)." 
diff --git a/financial_crime/application_fraud/queries/get_top_k_products_by_num_applications_with_other.gsql b/financial_crime/application_fraud/queries/get_top_k_products_by_num_applications_with_other.gsql index 96bc34c9..c373924e 100644 --- a/financial_crime/application_fraud/queries/get_top_k_products_by_num_applications_with_other.gsql +++ b/financial_crime/application_fraud/queries/get_top_k_products_by_num_applications_with_other.gsql @@ -3,6 +3,22 @@ CREATE OR REPLACE DISTRIBUTED QUERY get_top_k_products_by_num_applications_with_ STRING input_application_fraud_status = "", STRING input_application_status = "" ) { +/* + Description: + Finds top-k Products by number of linked Applications and adds an + aggregated "Other" bucket for all remaining Products. + + Parameters: + top_k: + Number of top Products to return + input_application_fraud_status: + Optional fraud-status filter + input_application_status: + Optional Application.status filter + + Output: + Summary of top-k Products and application counts. +*/ TYPEDEF TUPLE < VERTEX product_vertex, STRING product_id, @@ -112,4 +128,4 @@ UPDATE DESCRIPTION OF QUERY get_top_k_products_by_num_applications_with_other "T UPDATE DESCRIPTION OF QUERY_PARAM get_top_k_products_by_num_applications_with_other.top_k "The number of top products we want to return for this query." UPDATE DESCRIPTION OF QUERY_PARAM get_top_k_products_by_num_applications_with_other.input_application_fraud_status "(Optional) Filter for fraud status of an application. Defaults to '' (no filter)." -UPDATE DESCRIPTION OF QUERY_PARAM get_top_k_products_by_num_applications_with_other.input_application_status "(Optional) Filter for the status of an application (case-sensitive). Defaults to '' (no filter)." \ No newline at end of file +UPDATE DESCRIPTION OF QUERY_PARAM get_top_k_products_by_num_applications_with_other.input_application_status "(Optional) Filter for the status of an application (case-sensitive). Defaults to '' (no filter)." diff --git a/financial_crime/application_fraud/queries/incremental_application_match.gsql b/financial_crime/application_fraud/queries/incremental_application_match.gsql index c68829d4..ab7320ba 100644 --- a/financial_crime/application_fraud/queries/incremental_application_match.gsql +++ b/financial_crime/application_fraud/queries/incremental_application_match.gsql @@ -13,7 +13,25 @@ CREATE OR REPLACE QUERY incremental_application_match( FLOAT Application_Card_weight = 1.0, FLOAT threshold = 1.0 // if a total matching score between two entities exceed the threshold, a similarity edge will be created between the matching entities ) { - +/* + Description: + Parses a JSON payload for one Application, upserts its PII into the + graph, and uses weighted PII similarity to match it into an + existing Connected_Component (if any). + + Parameters: + input: + JSON string for the Application and its PII attributes. + Application_*_weight: + Per-PII weight used when scoring matches + threshold: + Minimum total similarity score required to link into an existing + component. + + Output: + Summary of Application id and whether it was matched into an existing + component. 
+*/ TYPEDEF TUPLE ver> Connected_Component_heap_tuple; HeapAccum(1, score DESC) @@Connected_Component_ID_heap; SetAccum @@edge_type_set, @@application_vertex, @@name, @@dob, @@email, @@phone_numbers, @@addresses, @@ip, @@ids, @@device, @@party, @@accounts, @@cards; diff --git a/financial_crime/application_fraud/queries/incremental_application_unify.gsql b/financial_crime/application_fraud/queries/incremental_application_unify.gsql index 51566034..b34400f7 100644 --- a/financial_crime/application_fraud/queries/incremental_application_unify.gsql +++ b/financial_crime/application_fraud/queries/incremental_application_unify.gsql @@ -1,4 +1,14 @@ CREATE OR REPLACE QUERY incremental_application_unify(SET> applications) { +/* + Description: + Unifies a given set of Applications into connected components via + Same_Application, updating Connected_Component and Application_In_Ring + in near real time. + + Parameters: + applications: + Set of Application vertices to group into Connected_Components. +*/ MinAccum @cc_id; // Each vertex's tentative component id diff --git a/financial_crime/application_fraud/queries/insights_get_application_count_by_binary_classification.gsql b/financial_crime/application_fraud/queries/insights_get_application_count_by_binary_classification.gsql index a3dd4fde..ce305d1f 100644 --- a/financial_crime/application_fraud/queries/insights_get_application_count_by_binary_classification.gsql +++ b/financial_crime/application_fraud/queries/insights_get_application_count_by_binary_classification.gsql @@ -8,6 +8,11 @@ CREATE OR REPLACE QUERY insights_get_application_count_by_binary_classification INT tg_false_negative, INT tg_false_positive ) { +/* + Description: + Builds a comparison table of binary classification counts for + scenarios without vs. with TigerGraph. 
+*/
 TYPEDEF TUPLE Table_Entry;
 ListAccum @@application_count_stats;
 SumAccum @@total_applications_tg;
@@ -30,4 +35,4 @@ CREATE OR REPLACE QUERY insights_get_application_count_by_binary_classification
   @@application_count_stats += Table_Entry("TOTAL", @@total_applications_no_tg, @@total_applications_tg);
   PRINT @@application_count_stats;
-}
\ No newline at end of file
+}
diff --git a/financial_crime/application_fraud/queries/insights_get_binary_classification_ratios.gsql b/financial_crime/application_fraud/queries/insights_get_binary_classification_ratios.gsql
index 1545a882..acec4814 100644
--- a/financial_crime/application_fraud/queries/insights_get_binary_classification_ratios.gsql
+++ b/financial_crime/application_fraud/queries/insights_get_binary_classification_ratios.gsql
@@ -8,6 +8,10 @@ CREATE OR REPLACE QUERY insights_get_binary_classification_ratios (
   INT tg_false_negative,
   INT tg_false_positive
 ) {
+/*
+  Description:
+    Computes binary classification metrics for the insights application.
+*/
 TYPEDEF TUPLE Table_Entry;
 ListAccum @@performance_metrics_stats;
@@ -26,4 +30,4 @@ CREATE OR REPLACE QUERY insights_get_binary_classification_ratios (
   @@performance_metrics_stats += Table_Entry("Negative Predictive Value", no_tg_npv * 100, tg_npv * 100);
   @@performance_metrics_stats += Table_Entry("True Negative Rate (Specificity)", no_tg_specificity * 100, tg_specificity * 100);
   PRINT @@performance_metrics_stats;
-}
\ No newline at end of file
+}
diff --git a/financial_crime/application_fraud/queries/insights_get_detected_fraud_amount.gsql b/financial_crime/application_fraud/queries/insights_get_detected_fraud_amount.gsql
index 30463e6e..b63c9172 100644
--- a/financial_crime/application_fraud/queries/insights_get_detected_fraud_amount.gsql
+++ b/financial_crime/application_fraud/queries/insights_get_detected_fraud_amount.gsql
@@ -3,6 +3,11 @@ CREATE OR REPLACE QUERY insights_get_detected_fraud_amount (
   INT tg_true_positive,
   DOUBLE avg_loss_per_app_fraud
 ) {
+/*
+  Description:
+    Estimates the total detected fraud amount for the insights application.
+
+*/
 TYPEDEF TUPLE Table_Entry;
 ListAccum @@detected_fraud_stats;
@@ -12,4 +17,4 @@ CREATE OR REPLACE QUERY insights_get_detected_fraud_amount (
   @@detected_fraud_stats += Table_Entry("Total Detected Fraud (with TigerGraph)", tg_detected_fraud_amount, "Detected Fraud");
   @@detected_fraud_stats += Table_Entry("Total Detected Fraud (w/o TigerGraph)", no_tg_detected_fraud_amount, "Detected Fraud");
   PRINT @@detected_fraud_stats;
-}
\ No newline at end of file
+}
diff --git a/financial_crime/application_fraud/queries/insights_get_missed_fraud_amount.gsql b/financial_crime/application_fraud/queries/insights_get_missed_fraud_amount.gsql
index 20641644..5eaaa98d 100644
--- a/financial_crime/application_fraud/queries/insights_get_missed_fraud_amount.gsql
+++ b/financial_crime/application_fraud/queries/insights_get_missed_fraud_amount.gsql
@@ -3,6 +3,10 @@ CREATE OR REPLACE QUERY insights_get_missed_fraud_amount (
   INT tg_false_negative,
   DOUBLE avg_loss_per_app_fraud
 ) {
+/*
+  Description:
+    Estimates the total missed fraud loss (in dollars) for the insights application.
+*/
 TYPEDEF TUPLE Table_Entry;
 ListAccum @@missed_fraud_stats;
@@ -12,4 +16,4 @@ CREATE OR REPLACE QUERY insights_get_missed_fraud_amount (
   @@missed_fraud_stats += Table_Entry("Total Missed Fraud (with TG)", tg_missed_fraud_amount, "Missed Fraud");
   @@missed_fraud_stats += Table_Entry("Total Missed Fraud (w/o TG)", no_tg_missed_fraud_amount, "Missed Fraud");
   PRINT @@missed_fraud_stats;
-}
\ No newline at end of file
+}
diff --git a/financial_crime/application_fraud/queries/insights_get_net_benefit_amount.gsql b/financial_crime/application_fraud/queries/insights_get_net_benefit_amount.gsql index fc1a757f..ef477ffc 100644 --- a/financial_crime/application_fraud/queries/insights_get_net_benefit_amount.gsql +++ b/financial_crime/application_fraud/queries/insights_get_net_benefit_amount.gsql @@ -5,6 +5,12 @@ CREATE OR REPLACE QUERY insights_get_net_benefit_amount ( INT tg_true_positive, DOUBLE avg_loss_per_app_fraud ) { +/* + Description: + Estimates the net financial benefit of using TigerGraph features + for fraud detection by comparing detected and missed fraud losses + Used in insights application +*/ TYPEDEF TUPLE Table_Entry; ListAccum @@net_benefit_stats; @@ -16,4 +22,4 @@ CREATE OR REPLACE QUERY insights_get_net_benefit_amount ( @@net_benefit_stats += Table_Entry("Net Deduction in Missed Fraud", net_deduction_missed_fraud_amount, "Net Benefit"); @@net_benefit_stats += Table_Entry("Total Net Gain", net_gain_amount, "Net Benefit"); PRINT @@net_benefit_stats; -} \ No newline at end of file +} diff --git a/financial_crime/application_fraud/queries/insights_get_net_gain_numbers_and_percentages.gsql b/financial_crime/application_fraud/queries/insights_get_net_gain_numbers_and_percentages.gsql index e4913641..6f671bdb 100644 --- a/financial_crime/application_fraud/queries/insights_get_net_gain_numbers_and_percentages.gsql +++ b/financial_crime/application_fraud/queries/insights_get_net_gain_numbers_and_percentages.gsql @@ -6,6 +6,14 @@ CREATE OR REPLACE QUERY insights_get_net_gain_numbers_and_percentages ( INT tg_false_positive, INT tg_false_negative ) { +/* + Description: + Compares fraud detection outcomes between scenarios without and + with TigerGraph features and reports: additional fraud blocked, + net reduction in missed fraud, and net reduction in false positives, + both as counts and as percentage improvements. + Used in insights application +*/ INT additional_fraud_blocked = tg_true_positive - no_tg_true_positive; DOUBLE additional_fraud_blocked_pct = (100.0 * additional_fraud_blocked) / no_tg_true_positive; STRING additional_fraud_blocked_pct_str = to_string(round(additional_fraud_blocked_pct, 3)) + " %"; @@ -26,4 +34,4 @@ CREATE OR REPLACE QUERY insights_get_net_gain_numbers_and_percentages ( net_deduction_false_positive, net_deduction_false_positive_pct_str ; -} \ No newline at end of file +} diff --git a/financial_crime/application_fraud/queries/insights_get_relative_precision_recall_gain.gsql b/financial_crime/application_fraud/queries/insights_get_relative_precision_recall_gain.gsql index 1e152630..f1985e7e 100644 --- a/financial_crime/application_fraud/queries/insights_get_relative_precision_recall_gain.gsql +++ b/financial_crime/application_fraud/queries/insights_get_relative_precision_recall_gain.gsql @@ -6,6 +6,14 @@ CREATE OR REPLACE QUERY insights_get_relative_precision_recall_gain ( INT tg_false_positive, INT tg_false_negative ) { +/* + Description: + Computes the relative percentage gain in recall and precision + when using TigerGraph features versus not using them, based on + binary classification outcomes. 
+ Used in insights application + +*/ DOUBLE no_tg_recall = utils_get_recall(no_tg_true_positive, no_tg_false_negative); DOUBLE tg_recall = utils_get_recall(tg_true_positive, tg_false_negative); DOUBLE relative_recall_gain = (tg_recall - no_tg_recall) / no_tg_recall * 100.0; @@ -20,4 +28,4 @@ CREATE OR REPLACE QUERY insights_get_relative_precision_recall_gain ( relative_recall_gain_str, relative_precision_gain_str ; -} \ No newline at end of file +} diff --git a/financial_crime/application_fraud/queries/match_application_entities.gsql b/financial_crime/application_fraud/queries/match_application_entities.gsql index d13eb6eb..b24191ff 100644 --- a/financial_crime/application_fraud/queries/match_application_entities.gsql +++ b/financial_crime/application_fraud/queries/match_application_entities.gsql @@ -17,6 +17,15 @@ CREATE OR REPLACE DISTRIBUTED QUERY match_application_entities( INT pii_high_connections_limit = 25000, // maximum out degree of an attribute. The case will be skipped if the same attribute is connected to too many entities DATETIME compute_entities_after_date = to_datetime("1970-01-01 00:00:00") // All entities that were created in TG after this date, will be computed to find similarity edges related to these new entities. The default value is the earliest time possible (great for the first time running this query) ) { +/* + Description: + Runs batched entity resolution on all Applications using weighted PII + similarity and inserts Same_Application edges for pairs whose score + exceeds a threshold. + + Output: + Summary of matched pairs and score statistics plus total execution time. +*/ ListAccum> @entity_list; MapAccum,FLOAT> @entity_map; diff --git a/financial_crime/application_fraud/queries/output_application_cc_to_file.gsql b/financial_crime/application_fraud/queries/output_application_cc_to_file.gsql index 919a1f83..dc2dc0f9 100644 --- a/financial_crime/application_fraud/queries/output_application_cc_to_file.gsql +++ b/financial_crime/application_fraud/queries/output_application_cc_to_file.gsql @@ -1,4 +1,13 @@ CREATE OR REPLACE DISTRIBUTED QUERY output_application_cc_to_file(STRING output_file_path = "/home/tigergraph/gsql_output/application_cc_output.csv") { +/* + Description: + Exports a CSV mapping each Application vertex to its + Connected_Component via Application_In_Ring edges. + + Output: + CSV file listing Application id and Connected_Component id + for all mapped Applications, plus a status message. 
+*/ FILE f (output_file_path); diff --git a/financial_crime/application_fraud/queries/set_application_fraud_status.gsql b/financial_crime/application_fraud/queries/set_application_fraud_status.gsql index 21ad83eb..844680db 100644 --- a/financial_crime/application_fraud/queries/set_application_fraud_status.gsql +++ b/financial_crime/application_fraud/queries/set_application_fraud_status.gsql @@ -1,5 +1,5 @@ CREATE OR REPLACE QUERY set_application_fraud_status(VERTEX application, BOOL fraud_status) { - + start = {application}; start = select s from start:s post-accum s.is_fraud = fraud_status; diff --git a/financial_crime/application_fraud/queries/unify_application_entities.gsql b/financial_crime/application_fraud/queries/unify_application_entities.gsql index 7be94454..da503ca0 100644 --- a/financial_crime/application_fraud/queries/unify_application_entities.gsql +++ b/financial_crime/application_fraud/queries/unify_application_entities.gsql @@ -1,5 +1,20 @@ CREATE OR REPLACE DISTRIBUTED QUERY unify_application_entities() { - +/* + Description: + Groups all Application vertices in the graph into connected + components via Same_Application edges, creating or updating the + corresponding Connected_Component vertices and Application_In_Ring + edges. + + Parameters: + (None) + + Output: + A single row with: + - execution_time_in_seconds : total query runtime in seconds + - timestamp : end-time timestamp for the unify + operation. +*/ MinAccum @cc_id; // Each vertex's tentative component id DATETIME start_time = now(); @@ -7,13 +22,13 @@ CREATE OR REPLACE DISTRIBUTED QUERY unify_application_entities() { start = {Application.*}; - # Initialize: Label each vertex with its own internal ID + // Initialize: Label each vertex with its own internal ID S = SELECT x FROM start:x POST-ACCUM x.@cc_id = getvid(x) ; - # Propagate smaller internal IDs until no more ID changes can be done + // Propagate smaller internal IDs until no more ID changes can be done WHILE (S.size()>0) DO S = SELECT t FROM S:s -(Same_Application:e)- :t diff --git a/financial_crime/application_fraud/queries/utils_get_negative_predictive_value.gsql b/financial_crime/application_fraud/queries/utils_get_negative_predictive_value.gsql index dbf33092..b2e985f3 100644 --- a/financial_crime/application_fraud/queries/utils_get_negative_predictive_value.gsql +++ b/financial_crime/application_fraud/queries/utils_get_negative_predictive_value.gsql @@ -3,4 +3,4 @@ CREATE OR REPLACE QUERY utils_get_negative_predictive_value ( INT false_negative ) RETURNS (DOUBLE) { RETURN (true_negative * 1.0) / (true_negative + false_negative); -} \ No newline at end of file +} diff --git a/financial_crime/application_fraud/readme.md b/financial_crime/application_fraud/readme.md deleted file mode 100644 index e776e5be..00000000 --- a/financial_crime/application_fraud/readme.md +++ /dev/null @@ -1,101 +0,0 @@ -# Narratives -Application fraud detection identifies and prevents unauthorized or deceptive financial applications in real-time. It analyzes shared PII attributes between Applications to detect anomalies and suspicious activity indicative of fraud. TigerGraph models complex relationships and patterns among entities such as shared Name, DOB, Email, Phone, Address, IP, ID, Device, Party, Account, and Card. This enables detection of fraudulent networks and patterns that may be difficult to uncover using traditional relational databases. 
Using graph algorithms enable organizations to detect and respond to fraudulent transactions quickly and efficiently, ultimately reducing financial losses and protecting consumers from fraudulent activity. - -# Components -This repository includes multiple components: - -- `data` - Sample data. -- `load_jobs` - Scripts for data loading tasks. -- `meta` - Solution Kit metadata. -- `queries` - Collection of GSQL queries. -- `schema` - Definition of database schema. -- `readme.md` - This usage guide. -- `setup.sh` - Automated setup script. - -# Instructions - -The `setup.sh` script is designed to streamline the initial setup process by sequentially executing the following steps: - -1. **Schema Creation**: Initiates the schema creation process using the `schema/create_schema.gsql` script. This schema is a subset of the Super Schema in financial_crime/library. -2. **Data Loading**: Load data into the schema by running the data loading job with the `loading_job/load_data.gsql` script. -3. **Query Installation**: Completes the setup by installing necessary queries through the `queries/install_queries.sh` script. - -## Installation Note for Queries - -When installing the queries there are **queries** that require special attention: - -- `match_application_entities` and `incremental_match_application_entities`. - -These queries have weights for each PII attribute and threshold parameters used by the weighted WCC matching process. - - - FLOAT Application_Full_Name_weight = 0.2 - - FLOAT Application_DOB_weight = 0.2 - - FLOAT Application_Email_weight = 0.2 - - FLOAT Application_Phone_weight = 0.2 - - FLOAT Application_Address_weight = 0.2 - - FLOAT Application_IP_weight = 0.2 - - FLOAT Application_ID_weight = 1.0 - - FLOAT Application_Device_weight = 1.0 - - FLOAT Application_Party_weight = 1.0 - - FLOAT Application_Account_weight = 1.0 - - FLOAT Application_Card_weight = 1.0 - - FLOAT threshold = 1.0 - -When an Application matches another Application through shared PII attribute(s) the respective weight(s) will accumulate and any accumulated weights equal or over threshold is considered a match. For example if device id has a weight of 1.0 and the threshold is 1.0 then any single matching device id between any Applications will cause those respective Applications to be put into the same Connected_Component community. If device id has a weight of 0.5 and threshold is 1.0 then it would require at least two shared device ids between any Applications to match etc. Default weights and thresholds are assigned to these queries but should be adapted if there are different business rules for matching required to satisfy. - -## Query Execution Order and Explanations - -### Step 1: Insert Edges between Matching Applications - -After loading historical Application and PII data and configuring wcc weights for matching run `match_application_entities` to perform matching on the entire graph. - -### Step 2: Form Communities - -Run `unify_application_entities` to associate all matched Applications from Step 1 into a Connected_Component community. - -Subsequent queries can be run following the completion of the aforementioned two. - -### Step 3: Feature Engineering Queries - -The feature engineering queries generate feature values to test and train the downstream ML model. 
These queries aggregate the following graph features as bulk CSV file output for all Applications in the graph: - -`batch_application_cc_features` - -Application, Application.is_fraud, Application Connected_Component, Application nodes in Connected_Component, Fraud Application Nodes in Connected_Component, distinct PII nodes in Connected_Component, number of Application nodes in Connected_Component connected by PII, number of Application nodes in Connected_Component only connected by PII. PII includes Name, DOB, Email, Phone, Address, IP, ID, Device, Party, Account, and Card vertices. - -`batch_application_distance_and_path` - -Application, Application.is_fraud, Application Connected_Component, Fraud Application, Fraud Application Connected_Component, Degree of Connection from Application to Fraud Application, Path of Connection from Application to Fraud Application. - -### Step 4: Add New Applications and Detect Fraud in Real-Time - -As new Applications get submitted these can be added to the graph and associated to a Connected_Component community in near real time using `incremental_application_match` query accepting a JSON payload containing all Application and PII data including Name, DOB, Email, Phone, Address, IP, ID, Device, Party, Account, and Card vertices. It's typical to have consistent weight and threshold settings between this and the former `match_application_entities` query for consistent matching behavior. - -Raw Request Example - - http://18.217.164.69:14240/restpp/query/Application_Fraud/incremental_application_match - - input={"application": "6f9e1ca5-e60c-48fd-a1c1-03d13348f888", "created_at": "2024-04-17 11:15:26.076265", "status": "740e989a-3614-427b-b3b0-21fc96ce2fc4", "line_of_credit": "9884.82", "annual_percentage_rate": "36.06", "fraud": true, "product": "414882cd-3fcb-42ef-8ee6-0423ce526124", "name": "3a58b7ce-33cb-46c2-ab65-527e1334927b", "dob": "a1d4472b-0f3c-4465-82a0-e2f889138989", "email": "18baaf68-12de-4291-8fae-8df7f2bf6026", "phone_numbers": [{"type": "mobile", "id": "5ed411af-e38c-4e97-b657-25017695148a"}, {"type": "landline", "id": "60669163-67e7-4e0e-a46b-2c25bdd0ee6b"}], "addresses": [{"type": "mailing", "line_1": "61d2bd45-57f1-4b3c-98a9-265cbcefdead", "line_2": "ed7020ae-35ef-4aae-a13d-3513222fcc50", "city": "fd1e10fb-91e4-4ecf-9ec1-afb2280dfbb5", "state": "694d083b-7006-4561-a2c1-913d80831ba7", "zipcode": "5b8eafe4-01c0-4b1e-8024-595ac6969a7c", "county": "fb4b12a7-bc0e-47cc-852f-ff7da7e61d66", "country": "US"}, {"type": "physical", "line_1": "8f1a04ed-ef4a-4324-8321-be3fe85f17da", "line_2": "e9a58e29-68cd-4e35-a43b-f28914c56e5a", "city": "13cf8f10-814b-4db6-930d-c0a162ef24f1", "state": "d8d7e179-5bb7-4cde-8fb5-97f0e9e55bde", "zipcode": "ff1456dd-b2f8-4c55-9dcc-e2b13b42d05b", "county": "6ba0eec9-4e72-45f1-91aa-1fd246d7400c", "country": "US"}], "ip_address": "83f7dc2f-928c-48a7-b157-0c50572071aa", "ids": [{"type": "Passport", "id": "36f26ab2-35ba-4361-b719-964ae23283b4"}, {"type": "Driver's License Number", "id": "8cefe5da-9f83-4013-a1e8-2e59108726c7"}], "device_id": "7a1f0167-1004-4d6c-8411-e38e414a6129", "party": "00cb01dc-e3e3-492d-b1fe-7c225ba3dde1", "accounts": [{"type": "savings", "id": "fb08e35b-b428-4840-8800-c0f841e7650e"}, {"type": "checking", "id": "3f41189c-b55b-4ec5-b073-182f9f87ceee"}], "cards": [{"type": "credit_card", "id": 241556}, {"type": "debit_card", "id": 278916}]} - -Response - - {'Application': '6f9e1ca5-e60c-48fd-a1c1-03d13348f888', 'entity_resolution': True} - -If the entity_resolution response is True the incoming 
Application was successfully able to be matched with the historical Application dataset using the matching logic. The `distance_and_path_to_fraud_account` and `get_application_cc_features` queries can be called to retrieve the graph features to augment the downstream ML decision model in near real time. - - http://18.217.164.69:14240/restpp/query/Application_Fraud/distance_and_path_to_fraud_application - - {"version":{"edition":"enterprise","api":"v2","schema":59},"error":false,"message":"","results":[{"result":[{"v_id":"31aeadde-d12a-486d-b2b4-a217daa9404d","v_type":"Application","attributes":{"input_application_id":"fa881c0a-adff-42d5-adf5-19de0cbe3547","input_application_id_cc":"675282944","degree_of_connection":2,"is_fraud":true,"application_id":"31aeadde-d12a-486d-b2b4-a217daa9404d","application_id_cc":"675282944","path_of_connection":["Device"]}}]}]} - - http://18.217.164.69:14240/restpp/query/Application_Fraud/get_application_cc_features - - {"version":{"edition":"enterprise","api":"v2","schema":59},"error":false,"message":"","results":[{"result":[{"v_id":"31aeadde-d12a-486d-b2b4-a217daa9404d","v_type":"Application","attributes":{"connectedComponentID":643825664,"total_cc_nodes":2,"fraud_applications_in_cc":2,"name_distinct_in_cc":2,"name_number_nodes":0,"name_number_nodes_only_connected":0,"dob_distinct_in_cc":2,"dob_number_nodes":0,"dob_number_nodes_only_connected":0,"email_distinct_in_cc":2,"email_number_nodes":0,"email_number_nodes_only_connected":0,"phone_distinct_in_cc":4,"phone_number_nodes":0,"phone_number_nodes_only_connected":0,"address_distinct_in_cc":4,"address_number_nodes":0,"address_number_nodes_only_connected":0,"ip_distinct_in_cc":2,"ip_number_nodes":0,"ip_number_nodes_only_connected":0,"id_distinct_in_cc":4,"id_number_nodes":0,"id_number_nodes_only_connected":0,"device_distinct_in_cc":1,"device_number_nodes":0,"device_number_nodes_only_connected":0,"party_distinct_in_cc":0,"party_number_nodes":0,"party_number_nodes_only_connected":0,"account_distinct_in_cc":4,"account_number_nodes":0,"account_number_nodes_only_connected":0,"card_distinct_in_cc":4,"card_number_nodes":0,"card_number_nodes_only_connected":0}},{"v_id":"fa881c0a-adff-42d5-adf5-19de0cbe3547","v_type":"Application","attributes":{"connectedComponentID":643825664,"total_cc_nodes":2,"fraud_applications_in_cc":2,"name_distinct_in_cc":2,"name_number_nodes":0,"name_number_nodes_only_connected":0,"dob_distinct_in_cc":2,"dob_number_nodes":0,"dob_number_nodes_only_connected":0,"email_distinct_in_cc":2,"email_number_nodes":0,"email_number_nodes_only_connected":0,"phone_distinct_in_cc":4,"phone_number_nodes":0,"phone_number_nodes_only_connected":0,"address_distinct_in_cc":4,"address_number_nodes":0,"address_number_nodes_only_connected":0,"ip_distinct_in_cc":2,"ip_number_nodes":0,"ip_number_nodes_only_connected":0,"id_distinct_in_cc":4,"id_number_nodes":0,"id_number_nodes_only_connected":0,"device_distinct_in_cc":1,"device_number_nodes":0,"device_number_nodes_only_connected":0,"party_distinct_in_cc":0,"party_number_nodes":0,"party_number_nodes_only_connected":0,"account_distinct_in_cc":4,"account_number_nodes":0,"account_number_nodes_only_connected":0,"card_distinct_in_cc":4,"card_number_nodes":0,"card_number_nodes_only_connected":0}}]}]} - -If entity_resolution is False the incoming Application was not able to be matched with the historical Application dataset and the same respective Application should be sent to `incremental_unify_entity` query so it's eligible for matching against future Applications in near real 
time. - -## Mock Data - -The `data` folder is populated with sample data files. These files are crafted for testing and demonstration purposes. - -## ML Model and Insights Application -You can find the instructions for training the ML model and its performance metrics within the documentation located in the model folder. The Insights Applications are available as JSON files in the meta folder. diff --git a/financial_crime/application_fraud/schema/local_schema.txt b/financial_crime/application_fraud/schema/local_schema.txt new file mode 100644 index 00000000..11cb39c7 --- /dev/null +++ b/financial_crime/application_fraud/schema/local_schema.txt @@ -0,0 +1,171 @@ +USE GLOBAL + +// (PII) +CREATE VERTEX Phone ( + PRIMARY_ID phone_number STRING +) WITH primary_id_as_attribute="true" + +CREATE VERTEX Device ( + PRIMARY_ID id STRING, + is_blocked BOOL +) WITH primary_id_as_attribute="true" + +CREATE VERTEX Full_Name ( + PRIMARY_ID name STRING +) WITH primary_id_as_attribute="true" + +CREATE VERTEX Card ( + PRIMARY_ID card_number INT, + is_fraud INT, + pagerank FLOAT, + c_id INT, + c_size INT, + occupation STRING +) WITH primary_id_as_attribute="true" + +CREATE VERTEX DOB ( + PRIMARY_ID dob STRING +) WITH primary_id_as_attribute="true" + +// Products (cards, loans, etc.) +CREATE VERTEX Product ( + PRIMARY_ID id STRING, + name STRING, + description STRING, + type_of STRING +) WITH primary_id_as_attribute="true" + +// Connected component / fraud ring ID +CREATE VERTEX Connected_Component ( + PRIMARY_ID id INT +) WITH primary_id_as_attribute="true" + +// Account entity +CREATE VERTEX Account ( + PRIMARY_ID id STRING, + create_Time DATETIME, + is_fraud INT, + account_type STRING, + account_level STRING, + com_size INT, + pagerank FLOAT, + shortest_path_length INT, + ip_collision INT, + fraud_ip INT, + device_collision INT, + fraud_device INT, + trans_in_mule_ratio FLOAT, + trans_out_mule_ratio FLOAT, + mule_cnt INT, + com_id INT +) WITH primary_id_as_attribute="true" + +CREATE VERTEX State ( + PRIMARY_ID id STRING +) WITH primary_id_as_attribute="true" + +CREATE VERTEX IP ( + PRIMARY_ID id STRING, + is_blocked BOOL +) WITH primary_id_as_attribute="true" + +CREATE VERTEX Address ( + PRIMARY_ID address STRING +) WITH primary_id_as_attribute="true" + +CREATE VERTEX Email ( + PRIMARY_ID email STRING +) WITH primary_id_as_attribute="true" + +CREATE VERTEX Country ( + PRIMARY_ID country STRING +) WITH primary_id_as_attribute="true" + +CREATE VERTEX City ( + PRIMARY_ID id STRING, + city STRING, + population INT +) WITH primary_id_as_attribute="true" + +CREATE VERTEX County ( + PRIMARY_ID id STRING +) WITH primary_id_as_attribute="true" + +CREATE VERTEX Zipcode ( + PRIMARY_ID id STRING +) WITH primary_id_as_attribute="true" + +// Party (person / organization) +CREATE VERTEX Party ( + PRIMARY_ID id STRING, + is_fraud INT, + gender STRING, + dob DATETIME, + party_type STRING, + name STRING, + created_at DATETIME +) WITH primary_id_as_attribute="true" + +CREATE VERTEX ID ( + PRIMARY_ID id STRING, + id_type STRING +) WITH primary_id_as_attribute="true" + +// Credit application +CREATE VERTEX Application ( + PRIMARY_ID id STRING, + created_at DATETIME, + status STRING, + line_of_credit FLOAT, + annual_percentage_rate FLOAT, + is_fraud BOOL +) WITH primary_id_as_attribute="true" + + +CREATE UNDIRECTED EDGE Has_Address (FROM Party, TO Address) +CREATE UNDIRECTED EDGE Has_ID (FROM Party, TO ID) +CREATE UNDIRECTED EDGE Has_IP (FROM Party, TO IP) +CREATE UNDIRECTED EDGE Has_Device (FROM Party, TO Device) +CREATE
UNDIRECTED EDGE Has_Phone (FROM Party, TO Phone) +CREATE UNDIRECTED EDGE Has_Email (FROM Party, TO Email) +CREATE UNDIRECTED EDGE Has_DOB (FROM Party, TO DOB) +CREATE UNDIRECTED EDGE Has_Full_Name (FROM Party, TO Full_Name) + +CREATE UNDIRECTED EDGE Application_Has_Full_Name (FROM Application, TO Full_Name) +CREATE UNDIRECTED EDGE Application_Has_Phone (FROM Application, TO Phone) +CREATE UNDIRECTED EDGE Application_Has_Email (FROM Application, TO Email) +CREATE UNDIRECTED EDGE Application_Has_DOB (FROM Application, TO DOB) +CREATE UNDIRECTED EDGE Application_Has_ID (FROM Application, TO ID) +CREATE UNDIRECTED EDGE Application_Has_Device (FROM Application, TO Device) +CREATE UNDIRECTED EDGE Application_Has_IP (FROM Application, TO IP) +CREATE UNDIRECTED EDGE Application_Has_Address (FROM Application, TO Address) +CREATE UNDIRECTED EDGE Application_Has_Product (FROM Application, TO Product) +CREATE UNDIRECTED EDGE Application_Has_Account (FROM Application, TO Account) +CREATE UNDIRECTED EDGE Application_Has_Party (FROM Party, TO Application) +CREATE UNDIRECTED EDGE Application_Has_Card (FROM Application, TO Card) + + +CREATE UNDIRECTED EDGE Assigned_To (FROM Address, TO Zipcode | FROM Zipcode, TO City) +CREATE UNDIRECTED EDGE Assigned_To_County (FROM Zipcode, TO County) +CREATE UNDIRECTED EDGE Located_In (FROM Address, TO City | FROM City, TO State) +CREATE UNDIRECTED EDGE Located_In_State (FROM County, TO State) +CREATE UNDIRECTED EDGE Located_In_Country (FROM State, TO Country) + + +CREATE UNDIRECTED EDGE Application_In_Ring (FROM Application, TO Connected_Component) +CREATE UNDIRECTED EDGE Entity_In_Ring (FROM Connected_Component, TO Party) +CREATE UNDIRECTED EDGE Same_As (FROM Party, TO Party, score DOUBLE) +CREATE UNDIRECTED EDGE Same_Application (FROM Application, TO Application, score DOUBLE) + + +CREATE GRAPH Application_Fraud(Phone, Email, Address, IP, Device, City, +Country,State, Full_Name, Zipcode, County, ID, Party, Connected_Component, DOB, +Application, Product, Account, Card, Has_Address, Has_ID, Has_IP, Has_Device, +Has_Phone, Has_Email, Located_In, Assigned_To_County, Located_In_State, +Located_In_Country, Same_As, Entity_In_Ring, Has_DOB, Has_Full_Name, +Application_Has_Full_Name, Application_Has_Phone, Application_Has_Email, +Application_Has_DOB, Application_Has_ID, Application_Has_Device, +Application_Has_IP, Same_Application, Application_Has_Address, +Application_In_Ring, Application_Has_Product, Application_Has_Account, +Application_Has_Party, Assigned_To, Application_Has_Card) +set exit_on_error = "false" diff --git a/financial_crime/entity_resolution_kyc/README.md b/financial_crime/entity_resolution_kyc/README.md new file mode 100644 index 00000000..99c4601f --- /dev/null +++ b/financial_crime/entity_resolution_kyc/README.md @@ -0,0 +1,128 @@ +# Entity Resolution (ER) and Know Your Customer (KYC) +This solution kit provides a comprehensive graph-based approach to Entity Resolution (ER) +and Know Your Customer (KYC) compliance. It enables the creation of a Single Customer +View (SCV) by linking scattered records through weighted attribute matching, +detecting fraud rings in real-time, and generating powerful graph features for +downstream machine learning models.
+ +--- + +## Contents + +- [Overview](#overview) +- [Components](#components) +- [ML Features](#ml-features-er-derived-features-for-risk-scoring) +- [Instructions](#instructions) + - [Setup and Installation](#setup-and-installation) + - [Query Execution Order and Explanations](#query-execution-order-and-explanations) + +--- + +## Overview + +Accurate **Entity Resolution (ER)** is the foundation of effective Know Your Customer (KYC) compliance and financial crime detection. Traditional systems often fail to link scattered customer records, leading to gaps in risk profiles. This TigerGraph solution addresses this by: + +* **Establishing a Single Customer View (SCV):** By clustering all PII-linked records (via the **`Same_As`** edge) into a single **`Connected_Component`**, the solution creates an authoritative view of the customer, crucial for compliance. +* **Weighted Weakly Connected Components (WCC):** We use an approximate **weighted matching algorithm** where shared attributes (Email, Phone, Name, etc.) contribute a customizable **score** to the total match. This provides a high-fidelity, tunable mechanism for linking parties. +* **Real-Time Fraud Linkage:** Any new Party is instantly matched against existing entities using `incremental_party_match`. If a match is found and the existing entity belongs to a component linked to a known fraudulent party, the new entity is immediately flagged, drastically reducing risk exposure. +* **Feature Engineering for Downstream ML:** The resulting `Connected_Component` structure allows for powerful feature extraction, providing machine learning models with metrics like "Count of Parties in Component" or "Shortest Path to Known Fraudulent Party." + +--- + +## Components + +This repository is structured to provide a comprehensive, ready-to-deploy graph solution. + +| Component | Description | +| :--- | :--- | +| `Schema` | Definition of the database schema, focusing on the **`Party`** vertex, PII, and the `Connected_Component` structure (18 core vertices). | +| `Queries` | A collection of **GSQL queries** for matching, unification, feature engineering, and real-time lookup. | +| `Mock Data` | Sample data for Party, PII, and relationship loading. | +| `Loading Jobs` | Two loading jobs: one for cloud integration and the other for on-prem local deployment. | +| `README.md` | This usage guide. | +| `setup.sh` | Automated setup script for schema creation, loading, and query installation. | + +--- + +## ML Features: ER-Derived Features for Risk Scoring + +The structure created by the Entity Resolution process generates powerful features for risk scoring and identifying high-risk entities. + +1. **Community Size:** + * **Feature:** The total count of **`Party`** vertices belonging to the same **`Connected_Component`**. + * **Insight:** Larger components may indicate potential fraud rings, as numerous distinct records share PII/attributes. + * **Query Source:** `batch_party_cc_features` or `get_party_cc_features`. + +2. **Shortest Path Length to Fraud:** + * **Feature:** The shortest path length from a given **`Party`** or **`Connected_Component`** to any known **fraudulent Party** (`Party.is_fraud = 1`). + * **Insight:** Direct proximity (short path) is a primary risk metric for linkage analysis. + * **Query Source:** `batch_party_distance_and_path` or `distance_and_path_to_fraud_party`. + +3. **PII Overlap Count:** + * **Feature:** The accumulated weighted score of matching PII attributes (Full Name, DOB, Email, etc.)
between a target Party and other Parties in the graph. + * **Insight:** Quantifies the **strength** and **type** of common links that resulted in the entity match. + * **Query Source:** Calculated within `incremental_party_match` and `match_party_entities`. + +4. **PII Multi-Party Degree:** + * **Feature:** The out-degree of a critical PII vertex (e.g., `Email`, `Phone`) that is shared by multiple distinct parties. + * **Insight:** High-degree PIIs (like an IP address used by many accounts) are strong indicators of shared risk or a single fraud operation. + +5. **Linked Asset Count:** + * **Feature:** The number of unique `Account`, `Card`, and `Application` vertices linked to the Party's **`Connected_Component`**. + * **Insight:** Assesses the total scope and exposure of the consolidated customer view. + +--- + +## Instructions + +### Setup and Installation + +1. **Schema Creation:** Run the general schema creation script, followed by the specific ER/KYC schema script. +2. **Data Loading:** Load data into the schema by running the data loading job script (`local_loading_job.gsql`). +3. **Query Installation:** Install all **15 GSQL queries** by executing the `install_queries.sh` script. + +### Query Execution Order and Explanations + +The solution requires sequential execution of the batch queries to populate the core ER structure. + +#### Step 1: Graph Cleanup (Maintenance) + +*(Optional, but highly recommended before a fresh batch run)* + +| Query | Purpose | +| :--- | :--- | +| **`delete_all_party_cc_connections`** | Deletes old `Entity_In_Ring` edges to clear previous clustering results. | +| **`delete_unused_cc_nodes`** | Removes `Connected_Component` vertices that no longer link to any `Party`. | + +#### Step 2: Batch Entity Resolution (Matching and Unification) + +This is the core process to establish the links and the clusters. + +| Query | Purpose | Resulting Edges/Vertices | +| :--- | :--- | :--- | +| **`match_party_entities`** | Calculates the weighted similarity score between all potential Party pairs and inserts the **`Same_As`** edge if the score exceeds the `threshold`. | Inserts **`Same_As`** edges | +| **`unify_party_entities`** | Runs the Connected Components algorithm based on all existing `Same_As` edges. | Creates **`Connected_Component`** vertices; Inserts **`Entity_In_Ring`** edges | + +#### Step 3: Feature Engineering (Batch Calculation) + +These queries run after Step 2 is complete. They calculate the ML features and output the results. + +| Query | Purpose | Output/Target | +| :--- | :--- | :--- | +| **`batch_party_cc_features`** | Calculates structural features (e.g., CC size) for every Party in the graph. | Writes results to output file. | +| **`batch_party_distance_and_path`** | Finds the shortest path and minimum distance from all parties to known fraud (`Party.is_fraud = 1`). | Writes results to output file. | +| **`output_party_cc_to_file`** | Exports the final Party-to-Connected\_Component mapping. | Writes mapping to CSV file. | + +#### Step 4: Real-Time / Incremental Queries + +These queries are used for real-time lookups and transactional updates. + +| Query | Purpose | Type | +| :--- | :--- | :--- | +| **`incremental_party_match`** | Inserts a new Party and attempts to match it to an existing `Connected_Component`. | Real-Time Insert/Match | +| **`incremental_party_unify`** | Creates a new `Connected_Component` for a Party that did not match any existing component.
| Real-Time Unification | +| **`set_party_fraud_status`** | Updates the `is_fraud` attribute for a given Party. | Real-Time Update | +| **`get_party_cc_features`** | Retrieves pre-computed features for a single Party. | Real-Time Lookup | +| **`distance_and_path_to_fraud_party`** | Retrieves the path to fraud for a single Party. | Investigation/Lookup | +| **`find_shared_piis_of_two_parties`** | Finds the specific PII attributes that connect any two parties. | Investigation/Lookup | + diff --git a/financial_crime/entity_resolution_kyc/meta/Insights_Application_Entity_Resolution_kyc.json b/financial_crime/entity_resolution_kyc/meta/Insights_Application_Entity_Resolution_kyc.json new file mode 100644 index 00000000..f2167dce --- /dev/null +++ b/financial_crime/entity_resolution_kyc/meta/Insights_Application_Entity_Resolution_kyc.json @@ -0,0 +1,488 @@ +{ + "defaultGraph": "Entity_Resolution_KYC", + "iconURL": "/insights/static/media/atom.14f5dd297b1a450cae3413a44f69a75b.svg", + "id": "arDCiwHPNbgkqU51x1tBNY", + "pageConfigSeparated": true, + "pages": [ + { + "chartMap": { + "2g1ThadQg96eQw9vWvjWKc": { + "chartSettings": { + "rulesByType": { + "Party": [ + { + "condition": "=", + "conditionEndValue": 1, + "conditionStartValue": 0, + "conditionValue": "1", + "fieldName": "is_fraud", + "fieldType": "number", + "palateName": "", + "styleEndLabel": "", + "styleEndValue": "", + "styleKey": "background-color", + "styleLabel": "Vertex color", + "styleStartLabel": "", + "styleStartValue": "", + "styleType": "color", + "styleValue": "#ff0000" + } + ] + } + }, + "graphName": "Entity_Resolution_KYC", + "hideWidgetName": false, + "id": "2g1ThadQg96eQw9vWvjWKc", + "patternLimit": 5, + "query": "INTERPRET QUERY(VERTEX componentId) FOR GRAPH Entity_Resolution_KYC {\n src = {componentId};\n SetAccum @@edge;\n SetAccum @@node;\n res = \n SELECT s FROM\n src:s - (:e1) - Party:p - (:e2) - (Address|Phone):t \n ACCUM @@edge += e1,@@edge += e2,@@node += s,@@node += p,@@node += t\n ;\n PRINT @@edge;\n \n nodes = {@@node};\n PRINT nodes;\n}", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Connected Component Graph", + "type": "internal-graph", + "version": "1764696139513871285" + }, + "2zfNGnX5qw4YvjKuguE776": { + "chartSettings": { + "values": [ + { + "fontColor": "#000000", + "fontSize": 32, + "iconColor": "#000000", + "iconPostion": "", + "iconSize": 24, + "id": "h48iV6XssVA3mGqRSn3dVa", + "key": "party", + "label": "Total Parties", + "labelColor": "#000000", + "labelPostion": "", + "labelSize": 20, + "styleRule": [] + }, + { + "fontColor": "#000000", + "fontSize": 32, + "iconColor": "#000000", + "iconPostion": "", + "iconSize": 24, + "id": "1D9rXhSo9qv3jjqj6S7Z59", + "key": "active_address", + "label": "Active Address", + "labelColor": "#000000", + "labelPostion": "", + "labelSize": 20, + "styleRule": [] + }, + { + "fontColor": "#000000", + "fontSize": 32, + "iconColor": "#000000", + "iconPostion": "", + "iconSize": 24, + "id": "jJhtv54FHCFSaoUZD7Bcy7", + "key": "connected_component", + "label": "Connected Component", + "labelColor": "#000000", + "labelPostion": "", + "labelSize": 20, + "styleRule": [] + }, + { + "fontColor": "#000000", + "fontSize": 32, + "iconColor": "#000000", + "iconPostion": "", + "iconSize": 24, + "id": "uh2uw457meuyBNT7TnVhyW", + "key": "risk_alert", + "label": "Risk Alert", + "labelColor": "#000000", + "labelPostion": "", + "labelSize": 20, + "styleRule": [] + } + ] + }, + "graphName": 
"Entity_Resolution_KYC", + "hideWidgetName": false, + "id": "2zfNGnX5qw4YvjKuguE776", + "patternLimit": 5, + "query": "INTERPRET QUERY() FOR GRAPH Entity_Resolution_KYC {\n res = \n SELECT s from Party:s\n ;\n PRINT res.size() as party;\n \n res = SELECT s from\n Address:s - () - Party:p;\n PRINT res.size() as active_address;\n \n res = SELECT s FROM Connected_Component:s - () - Party:p;\n PRINT res.size() as connected_component;\n \n res = SELECT s FROM Connected_Component:s - () - Party:p WHERE p.is_fraud > 0;\n PRINT res.size() as risk_alert;\n}", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "", + "type": "value", + "version": "1764693452951940468" + }, + "6Yj4WGLfhs5iobubJJAgza": { + "chartSettings": { + "category": [ + { + "id": "typ", + "type": "string" + } + ], + "radius": [ + 0, + 71 + ], + "tableHeaders": [ + "cnt", + "typ" + ], + "tableIndex": 0, + "value": [ + { + "id": "cnt", + "type": "number" + } + ] + }, + "graphName": "Entity_Resolution_KYC", + "hideWidgetName": false, + "id": "6Yj4WGLfhs5iobubJJAgza", + "patternLimit": 5, + "query": "INTERPRET QUERY() FOR GRAPH Entity_Resolution_KYC {\n GroupByAccum cnt> @@grp;\n SumAccum @connCount;\n \n parties = SELECT p FROM Party:p\n ACCUM p.@connCount = p.outdegree()\n POST-ACCUM\n CASE \n WHEN p.@connCount <= 1 THEN @@grp += (\"Single Entity\"->1)\n WHEN p.@connCount <= 5 THEN @@grp += (\"2-5 Connections\"->1)\n WHEN p.@connCount <= 10 THEN @@grp += (\"6-10 Connections\"->1)\n ELSE @@grp += (\"11+ Connections\"->1)\n END;\n \n PRINT @@grp;\n}", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Connection Distribution", + "type": "pie", + "version": "1764696420610182024" + }, + "aZsdFDfkYXbLqore5CeS77": { + "chartSettings": { + "category": [ + { + "id": "entity", + "type": "string" + } + ], + "tableHeaders": [ + "cnt", + "entity" + ], + "tableIndex": 0, + "value": [ + { + "id": "cnt", + "type": "number" + } + ] + }, + "graphName": "Entity_Resolution_KYC", + "hideWidgetName": false, + "id": "aZsdFDfkYXbLqore5CeS77", + "patternLimit": 5, + "query": "INTERPRET QUERY() FOR GRAPH Entity_Resolution_KYC {\n GroupByAccum cnt> @@grp;\n \n partySrc = \n SELECT s FROM Party:s ACCUM @@grp += (\"Party\"->1)\n ;\n res = \n SELECT t from partySrc:s - () - (Address|Phone|Email|Device|IP):t \n POST-ACCUM @@grp += (t.type->1)\n ;\n PRINT @@grp;\n}", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Entity Type Distribution", + "type": "bar", + "version": "1764695406058758862" + }, + "kSo7qKu14XSMP6mQCjryv1": { + "chartSettings": { + "showColumns": [ + { + "isChecked": false, + "name": "v_id" + }, + { + "isChecked": false, + "name": "v_type" + }, + { + "isChecked": true, + "name": "Address" + }, + { + "isChecked": true, + "name": "ComponentId" + }, + { + "isChecked": true, + "name": "Parties" + }, + { + "isChecked": true, + "name": "Phone" + }, + { + "isChecked": true, + "name": "Risk" + } + ], + "sortedColumns": [ + "ComponentId", + "Parties", + "Address", + "Phone", + "Risk", + "Action" + ], + "tableActions": [ + { + "pageID": "viZkpMCa5wFYgruGR9JSWy", + "params": [ + { + "isCreatable": false, + "name": "componentId", + "paramGlobalInput": "", + "value": "ComponentId" + } + ], + "text": "click", + "url": "", + "urlName": "New Page" + } + ], + "tableColumns": [ + { + "id": "Matched 
pattern", + "isChecked": "true", + "label": "" + }, + { + "children": [ + { + "children": [ + { + "id": "phone_number", + "isChecked": "true", + "type": "vertex" + } + ], + "id": "Phone", + "isChecked": "true", + "type": "vertex" + }, + { + "children": [ + { + "id": "address", + "isChecked": "true", + "type": "vertex" + } + ], + "id": "Address", + "isChecked": "true", + "type": "vertex" + } + ], + "id": "Vertices", + "isChecked": "true", + "isExpanded": true, + "label": "Vertices" + }, + { + "children": [], + "id": "Edges", + "isChecked": "true", + "isExpanded": false, + "label": "Edges" + } + ], + "tableHeaders": [ + "v_id", + "v_type", + "Address", + "ComponentId", + "Parties", + "Phone", + "Risk" + ] + }, + "graphName": "Entity_Resolution_KYC", + "hideWidgetName": false, + "id": "kSo7qKu14XSMP6mQCjryv1", + "patternLimit": 5, + "query": "INTERPRET QUERY() FOR GRAPH Entity_Resolution_KYC {\n SumAccum @parties,@address,@phone,@risk;\n res = \n SELECT s FROM\n Connected_Component:s - () - Party:p - () - (Address|Phone):t\n ACCUM\n CASE WHEN t.type == \"Address\" THEN s.@address += 1 ELSE t.@phone += 1 END\n //CASE WHEN p.is_fraud > 0 THEN s.@risk += 1 END\n POST-ACCUM s.@parties += s.outdegree(\"Entity_In_Ring\"),s.@risk += sum(s.neighborAttribute( \"Entity_In_Ring\", \"Party\", \"is_fraud\" ))\n\n\n \n ORDER BY s.@parties DESC\n ;\n PRINT res[res.id as ComponentId,res.@parties as Parties,res.@address as Address,res.@phone as Phone,res.@risk as Risk];\n}", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [ + { + "data": "Top_Connected_Component", + "id": "Top_Connected_Component", + "type": "QUERY" + } + ], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Top Connected Component", + "type": "table", + "version": "1764697101443443105" + }, + "pWVjbPx54GYQyYYTHe8Qpj": { + "chartSettings": { + "inputStates": [ + { + "dataType": "vertex", + "id": "input_rjFCLCFNBMSX1tGBx9rHuK", + "name": "componentId", + "selectedGraph": "Entity_Resolution_KYC", + "settings": { + "fixVertexType": true + }, + "vertexType": "Connected_Component", + "widgetType": "Input" + } + ] + }, + "graphName": "Entity_Resolution_KYC", + "hideWidgetName": false, + "id": "pWVjbPx54GYQyYYTHe8Qpj", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "New Widget", + "type": "Inputs", + "version": "1764696049224336494" + } + }, + "globalParameters": { + "componentId": { + "id": "input_rjFCLCFNBMSX1tGBx9rHuK", + "name": "componentId", + "type": "VERTEX", + "value": { + "vertexID": "1678770176", + "vertexType": "Connected_Component" + } + } + }, + "iconURL": "/insights/static/media/library.d3f7f207c6bb1d7be8e64045a19991b2.svg", + "id": "viZkpMCa5wFYgruGR9JSWy", + "isDetail": true, + "isNew": false, + "layouts": { + "md": [ + { + "h": 33, + "i": "2g1ThadQg96eQw9vWvjWKc", + "moved": false, + "static": false, + "w": 7, + "x": 5, + "y": 28 + }, + { + "h": 8, + "i": "2zfNGnX5qw4YvjKuguE776", + "moved": false, + "static": false, + "w": 12, + "x": 0, + "y": 0 + }, + { + "h": 20, + "i": "6Yj4WGLfhs5iobubJJAgza", + "moved": false, + "static": false, + "w": 5, + "x": 0, + "y": 8 + }, + { + "h": 20, + "i": "aZsdFDfkYXbLqore5CeS77", + "moved": false, + "static": false, + "w": 7, + "x": 5, + "y": 8 + }, + { + "h": 33, + "i": "kSo7qKu14XSMP6mQCjryv1", + "moved": false, + "static": false, + "w": 5, + "x": 0, + "y": 28 + }, + { + "h": 12, + "i": "pWVjbPx54GYQyYYTHe8Qpj", + "moved": false, + "static": 
false, + "w": 4, + "x": 0, + "y": 61 + } + ], + "xs": [ + { + "h": 8, + "i": "2zfNGnX5qw4YvjKuguE776", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 0 + }, + { + "h": 20, + "i": "6Yj4WGLfhs5iobubJJAgza", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 8 + }, + { + "h": 20, + "i": "aZsdFDfkYXbLqore5CeS77", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 28 + }, + { + "h": 33, + "i": "kSo7qKu14XSMP6mQCjryv1", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 48 + }, + { + "h": 25, + "i": "2g1ThadQg96eQw9vWvjWKc", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 81 + }, + { + "h": 12, + "i": "pWVjbPx54GYQyYYTHe8Qpj", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 106 + } + ] + }, + "title": "Entity Network Dashboard", + "version": "1764696199602489941", + "weight": 20 + } + ], + "title": "Entity_Resolution_KYC", + "userRoleForApp": "owner", + "version": "1764673492229672678" +} \ No newline at end of file diff --git a/financial_crime/entity_resolution_kyc/queries/batch_party_cc_features.gsql b/financial_crime/entity_resolution_kyc/queries/batch_party_cc_features.gsql index f4acba91..5c679747 100644 --- a/financial_crime/entity_resolution_kyc/queries/batch_party_cc_features.gsql +++ b/financial_crime/entity_resolution_kyc/queries/batch_party_cc_features.gsql @@ -1,5 +1,24 @@ CREATE OR REPLACE DISTRIBUTED QUERY batch_party_cc_features(INT connections=25000, STRING output_file_path = "/home/tigergraph/gsql_output/batch_party_cc_features.csv") { + /* + Query Name: batch_party_cc_features + + Purpose: Generate Machine Learning Features for Fraud Detection + This query traverses the graph to extract aggregated features from the Connected Component (CC) + that each 'Party' belongs to, focusing on shared Personally Identifiable Information (PII) + and other related entities (Accounts, Cards, etc.). + + Key Use Cases: + . Feature Engineering: Create robust, graph-based variables to train a fraud prediction model. + . Ring Detection: Identify 'Parties' that are linked via shared PII, often indicative of fraud rings. + . Risk Profiling: Assess the risk of a Party based on the size and fraudulence of its community. + + Parameters: + connections: Upper limit for the out-degree of PII/Entity nodes (e.g., limit a shared IP address + to prevent overly broad, non-specific connections, often called 'Super-Node' filtering). + output_file_path: Location to save the resulting CSV file with all features. 
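+ + Example invocation (a sketch; run from the GSQL shell after the query is installed, with both parameter defaults shown explicitly): + RUN QUERY batch_party_cc_features(25000, "/home/tigergraph/gsql_output/batch_party_cc_features.csv")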
+ */ + MapAccum @@connect_component, @@distinct_name, @@distinct_dob, @@distinct_email, @@distinct_phone, @@distinct_address, @@distinct_ip, @@distinct_id, @@distinct_device, @@distinct_party, @@distinct_account, @@distinct_card, @@connected_via_name, @@connected_via_dob, @@connected_via_email, @@connected_via_phone, @@connected_via_address, @@connected_via_ip, @@connected_via_id, @@connected_via_device, @@connected_via_party, @@connected_via_account, @@connected_via_card, @@only_connected_via_name, @@only_connected_via_dob, @@only_connected_via_email, @@only_connected_via_phone, @@only_connected_via_address, @@only_connected_via_ip, @@only_connected_via_id, @@only_connected_via_device, @@only_connected_via_party, @@only_connected_via_account, @@only_connected_via_card; OrAccum @connected_via_name, @connected_via_dob, @connected_via_email, @connected_via_phone, @connected_via_address, @connected_via_ip, @connected_via_id, @connected_via_device, @connected_via_party, @connected_via_account, @connected_via_card; SetAccum @middle_type; @@ -126,4 +145,4 @@ UPDATE DESCRIPTION OF QUERY batch_party_cc_features "This query outputs a file c UPDATE DESCRIPTION OF QUERY_PARAM batch_party_cc_features.connections "Maximum outdegree of each PII vertex allowed in search used to filter out hub nodes. Defaults to 25000." -UPDATE DESCRIPTION OF QUERY_PARAM batch_party_cc_features.output_file_path "File path location to containing the output graph features in CSV format. Defaults to /home/tigergraph/gsql_output/batch_party_cc_features.csv" \ No newline at end of file +UPDATE DESCRIPTION OF QUERY_PARAM batch_party_cc_features.output_file_path "File path location to containing the output graph features in CSV format. Defaults to /home/tigergraph/gsql_output/batch_party_cc_features.csv" diff --git a/financial_crime/entity_resolution_kyc/queries/batch_party_distance_and_path.gsql b/financial_crime/entity_resolution_kyc/queries/batch_party_distance_and_path.gsql index 2c1ff471..f61d1194 100644 --- a/financial_crime/entity_resolution_kyc/queries/batch_party_distance_and_path.gsql +++ b/financial_crime/entity_resolution_kyc/queries/batch_party_distance_and_path.gsql @@ -1,5 +1,25 @@ CREATE DISTRIBUTED QUERY batch_party_distance_and_path(INT depth=5, STRING output_file_path = "/home/tigergraph/gsql_output/batch_party_distance_and_path_features.csv") { + /* + Query Name: batch_party_distance_and_path + Fraudulent Party Linkage Analysis (Shortest Path & Distance) + + Purpose: Graph Linkage Feature Generation + This query is run for *every* 'Party' in the graph (a batch process) to: + 1. Find the shortest path and distance (in hops) to any other 'Party' that is marked as **fraudulent**. + 2. Record the precise path of shared PII/Entities that connects the two. + 3. Generate a feature set that shows the degree of separation between a Party and known fraud. + + Key Use Cases: + . Feature Engineering: Create "Distance-to-Fraud" and "Path-Type" features for ML models. + . Risk Prioritization: Identify non-fraudulent parties (is_fraud=0) that are only 1-2 hops away from known fraud. + . Investigation: Quickly trace the PII/Entity linkage for high-risk individuals. + + Parameters: + depth: Maximum traversal depth (number of hops) for the search. Limits search time and focuses on close connections. + output_file_path: Location to save the resulting CSV file. 
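+ + Example invocation (a sketch; run from the GSQL shell after the query is installed, with both parameter defaults shown explicitly): + RUN QUERY batch_party_distance_and_path(5, "/home/tigergraph/gsql_output/batch_party_distance_and_path_features.csv")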
+ */ + MapAccum, VERTEX> @@cc_map; MaxAccum @dis; MaxAccum @visited; @@ -73,4 +93,4 @@ UPDATE DESCRIPTION OF QUERY batch_party_distance_and_path "This query outputs a UPDATE DESCRIPTION OF QUERY_PARAM batch_party_distance_and_path.depth "Maximum number of hops from source Party vertex to traverse in the graph. Defaults to 5." -UPDATE DESCRIPTION OF QUERY_PARAM batch_party_distance_and_path.output_file_path "File path location to containing the output graph features in CSV format. Defaults to /home/tigergraph/gsql_output/batch_party_distance_and_path_features.csv" \ No newline at end of file +UPDATE DESCRIPTION OF QUERY_PARAM batch_party_distance_and_path.output_file_path "File path location to containing the output graph features in CSV format. Defaults to /home/tigergraph/gsql_output/batch_party_distance_and_path_features.csv" diff --git a/financial_crime/entity_resolution_kyc/queries/delete_all_party_cc_connections.gsql b/financial_crime/entity_resolution_kyc/queries/delete_all_party_cc_connections.gsql index 8c9cb643..47dcee6d 100644 --- a/financial_crime/entity_resolution_kyc/queries/delete_all_party_cc_connections.gsql +++ b/financial_crime/entity_resolution_kyc/queries/delete_all_party_cc_connections.gsql @@ -1,5 +1,25 @@ CREATE OR REPLACE DISTRIBUTED QUERY delete_all_party_cc_connections(INT num_of_batches = 1, INT batch_id = 0) { + /* + Query Name: delete_all_party_cc_connections + Graph Maintenance: Clear Connected Component (CC) Assignments + + Purpose: Preparation for Community Detection + This query's sole purpose is to efficiently **delete** all existing `Entity_In_Ring` edges + that link `Party` vertices to their associated `Connected_Component` vertices. This action is + a required precursor before running a fresh community detection algorithm (like a new Connected + Component run) to ensure the graph features are calculated based on the latest community structure. + + Key Concept: Batch Processing + The query is designed to run in distributed, manageable batches (`num_of_batches` and `batch_id`) + to prevent memory/resource exhaustion when deleting a massive number of edges (millions or billions) + in a single transaction. + + Parameters: + num_of_batches: Total number of jobs required to process all `Party` vertices. + batch_id: The specific partition of `Party` vertices to process in the current run (0 to num_of_batches - 1). + */ + SumAccum @@count; temp = SELECT t @@ -18,4 +38,4 @@ UPDATE DESCRIPTION OF QUERY delete_all_party_cc_connections "This query deletes UPDATE DESCRIPTION OF QUERY_PARAM delete_all_party_cc_connections.num_of_batches "Number of batches to partition the deletions for all Party vertices in the graph. This query must be called the same number of times as batch_num with incrementing batch_id 0 through batch_num - 1 to process all Party vertices in the graph. Defaults to 1." -UPDATE DESCRIPTION OF QUERY_PARAM delete_all_party_cc_connections.batch_id "Current batch partition of Party vertices to process and must be called with batch_id 0 through batch_num - 1. Defaults to 0." \ No newline at end of file +UPDATE DESCRIPTION OF QUERY_PARAM delete_all_party_cc_connections.batch_id "Current batch partition of Party vertices to process and must be called with batch_id 0 through batch_num - 1. Defaults to 0." 
diff --git a/financial_crime/entity_resolution_kyc/queries/delete_unused_cc_nodes.gsql b/financial_crime/entity_resolution_kyc/queries/delete_unused_cc_nodes.gsql index 98705745..75810467 100644 --- a/financial_crime/entity_resolution_kyc/queries/delete_unused_cc_nodes.gsql +++ b/financial_crime/entity_resolution_kyc/queries/delete_unused_cc_nodes.gsql @@ -1,4 +1,24 @@ CREATE OR REPLACE DISTRIBUTED QUERY delete_unused_cc_nodes(INT num_of_batches = 1, INT batch_id = 0) { + /* + Query Name: delete_unused_cc_nodes + Graph Maintenance: Prune Unused Connected Component (CC) Vertices + + Purpose: Cleanup and Optimization + This query is designed to clean up the graph by **deleting** any `Connected_Component` (CC) + vertex that no longer serves a purpose. After running a community detection algorithm and then + deleting the assignment edges (e.g., using `delete_all_party_cc_connections`), some CC nodes + might remain but have no connections to any `Party` vertices. Deleting these "orphaned" nodes + reduces graph size, improves query performance, and maintains graph integrity. + + Key Concept: Orphaned Node Removal + A CC node is considered 'unused' or 'orphaned' if its **out-degree is zero** (it has no outgoing edges). + Since the CC node is only supposed to have outgoing edges to `Party` vertices, an out-degree of zero + means it is not linked to any entities and can be safely removed. + + Parameters: + num_of_batches: Total number of jobs required to process all `Connected_Component` vertices. + batch_id: The specific partition of `Connected_Component` vertices to process in the current run. + */ SumAccum @@count; @@ -20,4 +40,4 @@ UPDATE DESCRIPTION OF QUERY delete_unused_cc_nodes "This query deletes all unuse UPDATE DESCRIPTION OF QUERY_PARAM delete_unused_cc_nodes.num_of_batches "Number of batches to partition the deletions for all Connected_Component vertices in the graph. This query must be called the same number of times as batch_num with incrementing batch_id 0 through batch_num - 1 to process all Connected_Component vertices in the graph. Defaults to 1." -UPDATE DESCRIPTION OF QUERY_PARAM delete_unused_cc_nodes.batch_id "Current batch partition of Connected_Component vertices to process and must be called with batch_id 0 through batch_num - 1. Defaults to 0." \ No newline at end of file +UPDATE DESCRIPTION OF QUERY_PARAM delete_unused_cc_nodes.batch_id "Current batch partition of Connected_Component vertices to process and must be called with batch_id 0 through batch_num - 1. Defaults to 0." diff --git a/financial_crime/entity_resolution_kyc/queries/distance_and_path_to_fraud_party.gsql b/financial_crime/entity_resolution_kyc/queries/distance_and_path_to_fraud_party.gsql index 5dee6779..9f3e48b1 100644 --- a/financial_crime/entity_resolution_kyc/queries/distance_and_path_to_fraud_party.gsql +++ b/financial_crime/entity_resolution_kyc/queries/distance_and_path_to_fraud_party.gsql @@ -1,4 +1,23 @@ CREATE OR REPLACE QUERY distance_and_path_to_fraud_party(VERTEX input, INT depth=5) { + /* + Query Name: distance_and_path_to_fraud_party + Real-Time Shortest Path to Fraudulent Parties + + Purpose: Real-Time Risk Assessment + This query is designed for near **real-time execution**, performing a single Breadth-First Search (BFS) + starting from a specific `Party` (e.g., a new applicant). Its goal is to quickly find the shortest + distance and the precise chain of shared PII/Entities linking the input Party to any known + `Fraud Party` within a specified hop `depth`. + + Key Use Cases: + . 
Transaction Monitoring: Identify if a customer is closely linked to fraud before approving a transaction. + . New Application Scoring: Immediately flag new applicants connected to high-risk communities. + . Investigation: Quickly map the shortest connection path between a suspect and known fraud entities. + + Parameters: + input: The specific Party vertex to start the search from (the target of the risk assessment). + depth: Maximum number of hops allowed. Keeps the search fast and focused on close-range risk. + */ MinAccum<INT> @dis; OrAccum @visited; @@ -48,4 +67,4 @@ UPDATE DESCRIPTION OF QUERY distance_and_path_to_fraud_party "This query returns UPDATE DESCRIPTION OF QUERY_PARAM distance_and_path_to_fraud_party.input "Source Party vertex from which to begin traversal searching for closely connected Fraud Party vertices." -UPDATE DESCRIPTION OF QUERY_PARAM distance_and_path_to_fraud_party.depth "Maximum number of hops from source Party vertex to traverse in the graph. Defaults to 5." \ No newline at end of file +UPDATE DESCRIPTION OF QUERY_PARAM distance_and_path_to_fraud_party.depth "Maximum number of hops from source Party vertex to traverse in the graph. Defaults to 5." diff --git a/financial_crime/entity_resolution_kyc/queries/expand_connected_component_entities_as_same_as_edge.gsql b/financial_crime/entity_resolution_kyc/queries/expand_connected_component_entities_as_same_as_edge.gsql new file mode 100644 index 00000000..096451b4 --- /dev/null +++ b/financial_crime/entity_resolution_kyc/queries/expand_connected_component_entities_as_same_as_edge.gsql @@ -0,0 +1,51 @@ +CREATE OR REPLACE DISTRIBUTED QUERY expand_connected_component_entities_same_as_edge ( + VERTEX<Connected_Component> input_connected_component +) { + /* + Query: expand_connected_component_entities_same_as_edge + Description: + This query gives a holistic view of the given input Connected_Component vertex by following Same_As edges. + + input: Connected_Component : `1678770178` + + output: connected edges + */ + OrAccum @visited; + SetAccum<EDGE> @@edges_to_display; + input_connected_component_set = {input_connected_component}; + + curr_linked_parties = SELECT t + FROM input_connected_component_set:s -(Entity_In_Ring:e)- Party:t + WHERE getvid(t) == s.id + ACCUM @@edges_to_display += e + + ; + + WHILE curr_linked_parties.size() > 0 DO + curr_linked_parties = SELECT t + FROM curr_linked_parties:s -(Same_As:e)- Party:t + ACCUM @@edges_to_display += e + HAVING NOT t.@visited + ; + + curr_linked_parties = SELECT s + FROM curr_linked_parties:s + POST-ACCUM s.@visited += True + ; + END; + + all_related_parties = SELECT s + FROM Party:s + WHERE s.@visited + ; + + PRINT + input_connected_component + ; + + PRINT + input_connected_component_set, + all_related_parties + ; + PRINT @@edges_to_display; +} diff --git a/financial_crime/entity_resolution_kyc/queries/expand_connected_components_to_entities_vis.gsql b/financial_crime/entity_resolution_kyc/queries/expand_connected_components_to_entities_vis.gsql new file mode 100644 index 00000000..d5a386e5 --- /dev/null +++ b/financial_crime/entity_resolution_kyc/queries/expand_connected_components_to_entities_vis.gsql @@ -0,0 +1,64 @@ +CREATE OR REPLACE DISTRIBUTED QUERY expand_connected_components_to_entities_vis ( + VERTEX<Connected_Component> input_connected_component, + BOOL show_only_shared_features = False +) { + + /* + Query: expand_connected_components_to_entities_vis + Description: + This query gives a holistic view of all connected edges for the given Connected_Component vertex. + input: Connected_Component :
`1678770178` + + output: connected edges + */ + SumAccum<INT> @num_parties_linked_to_features; + SetAccum<EDGE> @@edges_to_display; + input_connected_component_set = {input_connected_component}; + + SetAccum<STRING> @@edge_type_set = ( + "Has_Full_Name", + "Has_DOB", + "Has_Email", + "Has_Phone", + "Has_Address", + "Has_IP", + "Has_ID", + "Has_Device", + "Application_Has_Party", + "Party_Has_Account", + "Party_Has_Card" + ); + + linked_parties = SELECT t + FROM input_connected_component_set:s -(Entity_In_Ring:e)- Party:t + ACCUM @@edges_to_display += e + ; + + selected_features_to_display = SELECT t + FROM linked_parties:s -(@@edge_type_set:e)- :t + ACCUM t.@num_parties_linked_to_features += 1 + ; + + IF show_only_shared_features THEN + selected_features_to_display = SELECT s + FROM selected_features_to_display:s + WHERE s.@num_parties_linked_to_features > 1 + ; + END; + + selected_features_to_display = SELECT s + FROM selected_features_to_display:s -(@@edge_type_set:e)- linked_parties:t + ACCUM @@edges_to_display += e + ; + + PRINT + input_connected_component, + show_only_shared_features + ; + PRINT + input_connected_component_set, + linked_parties, + selected_features_to_display + ; + PRINT @@edges_to_display; +} diff --git a/financial_crime/entity_resolution_kyc/queries/explore_er_steps.gsql b/financial_crime/entity_resolution_kyc/queries/explore_er_steps.gsql new file mode 100644 index 00000000..c290126f --- /dev/null +++ b/financial_crime/entity_resolution_kyc/queries/explore_er_steps.gsql @@ -0,0 +1,67 @@ +CREATE OR REPLACE DISTRIBUTED QUERY explore_er_steps ( + SET<VERTEX<Party>> party_to_explore, + BOOL explore_step_01, + BOOL explore_step_02, + BOOL explore_step_03 +) { + // The visualization for this query will be more intuitive + // if all vertices in party_to_explore are in the same vertex community + + // Step 01 = explore attribute edges - before match_party_entities + // Step 02 = status after match_party_entities was run + // Step 03 = status after unify_party_entities was run + SetAccum<EDGE> @@edges_to_display; + SetAccum<STRING> @@attr_edges; + OrAccum @is_start_set; + + @@attr_edges += ( + "Has_Full_Name", + "Has_DOB", + "Has_Email", + "Has_Phone", + "Has_Address", + "Has_IP", + "Has_ID", + "Has_Device", + "Application_Has_Party", + "Party_Has_Account", + "Party_Has_Card" + ); + + party_to_explore_set = {party_to_explore}; + + party_to_explore_set = SELECT s + FROM party_to_explore_set:s + POST-ACCUM s.@is_start_set += TRUE + ; + + IF explore_step_01 THEN + attributes_explore_step_01 = SELECT t + FROM party_to_explore_set:s -(@@attr_edges:e)- :t + ACCUM @@edges_to_display += e + ; + + PRINT attributes_explore_step_01; + END; + + IF explore_step_02 THEN + party_with_same_as_edges = SELECT s + FROM party_to_explore_set:s -(Same_As:e)- Party:t + WHERE t.@is_start_set == TRUE + ACCUM @@edges_to_display += e + ; + END; + + IF explore_step_03 THEN + curr_party_cc_step_03 = SELECT t + FROM party_to_explore_set:s -(Entity_In_Ring:e)- Connected_Component:t + ACCUM @@edges_to_display += e + ; + + PRINT curr_party_cc_step_03; + END; + + + PRINT party_to_explore_set; + PRINT @@edges_to_display; +} diff --git a/financial_crime/entity_resolution_kyc/queries/find_shared_piis_of_two_parties.gsql b/financial_crime/entity_resolution_kyc/queries/find_shared_piis_of_two_parties.gsql index 80b69d73..ece20ce2 100644 --- a/financial_crime/entity_resolution_kyc/queries/find_shared_piis_of_two_parties.gsql +++ b/financial_crime/entity_resolution_kyc/queries/find_shared_piis_of_two_parties.gsql @@ -1,5 +1,26 @@ CREATE OR REPLACE QUERY
find_shared_piis_of_two_parties(VERTEX party_1, VERTEX party_2) { + /* + Query Name: find_shared_piis_of_two_parties + Real-Time PII Linkage Analysis between Two Parties + + Purpose: Investigative Link Discovery + The query's goal is to identify all **Personally Identifiable Information (PII)** nodes + (like Name, Address, IP, Device, etc.) that are **directly shared** by the two input + `Party` vertices. This immediately reveals the underlying connection, which is a critical + step in verifying potential fraud rings or suspicious collusion. + + Key Output Features: + . PII Type (e.g., 'Full_Name'): What type of entity is shared. + . PII Value (Vertex ID): The ID of the shared entity itself. + . Degree (Out-degree): How many *other* parties are also connected to this shared PII. + A high degree indicates a high-risk 'hub' entity. + + Parameters: + party_1: The first Party vertex ID. + party_2: The second Party vertex ID. + */ + TYPEDEF TUPLE pii_info; ListAccum @@degrees_of_shared_piis; @@ -29,4 +50,4 @@ UPDATE DESCRIPTION OF QUERY find_shared_piis_of_two_parties "This query returns UPDATE DESCRIPTION OF QUERY_PARAM find_shared_piis_of_two_parties.party_1 "First Party vertex used to search for shared PII vertices." -UPDATE DESCRIPTION OF QUERY_PARAM find_shared_piis_of_two_parties.party_2 "Second Party vertex used to search for shared PII vertices." \ No newline at end of file +UPDATE DESCRIPTION OF QUERY_PARAM find_shared_piis_of_two_parties.party_2 "Second Party vertex used to search for shared PII vertices." diff --git a/financial_crime/entity_resolution_kyc/queries/get_party_cc_features.gsql b/financial_crime/entity_resolution_kyc/queries/get_party_cc_features.gsql index 8c99f0a7..6e260b6d 100644 --- a/financial_crime/entity_resolution_kyc/queries/get_party_cc_features.gsql +++ b/financial_crime/entity_resolution_kyc/queries/get_party_cc_features.gsql @@ -1,5 +1,24 @@ CREATE OR REPLACE QUERY get_party_cc_features(VERTEX party, INT connections=25000) { + /* + Query Name: get_party_cc_features + Real-Time Connected Component (CC) Feature Extraction for a Single Party + + Purpose: Real-Time Feature Engineering for Single Entity Risk + This query quickly calculates detailed, graph-based features for a specific `Party` by analyzing + the aggregated properties of the entire `Connected_Component` (CC) it belongs to. This is ideal + for calculating a risk score for a single entity (e.g., a new loan application or login attempt). + + Key Features Calculated (for the CC): + . Size and Composition: Total Party nodes, distinct PII nodes (Name, IP, Device, etc.). + . Linkage Strength: Number of Parties connected via shared PII. + . Specificity: Number of Parties connected *only* by a single type of shared PII (e.g., only by DOB). + + Parameters: + party: The specific Party vertex ID to analyze. + connections: PII Super-Node filter. Maximum out-degree allowed for PII nodes to be considered valid connectors. 
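+ + Example invocation (a sketch; "P-1001" is a hypothetical Party id, and 25000 is the default super-node filter): + RUN QUERY get_party_cc_features("P-1001", 25000)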
+ */ + MapAccum @@connect_component, @@distinct_name, @@distinct_dob, @@distinct_email, @@distinct_phone, @@distinct_address, @@distinct_ip, @@distinct_id, @@distinct_device, @@distinct_application, @@distinct_account, @@distinct_card, @@connected_via_name, @@connected_via_dob, @@connected_via_email, @@connected_via_phone, @@connected_via_address, @@connected_via_ip, @@connected_via_id, @@connected_via_device, @@connected_via_application, @@connected_via_account, @@connected_via_card, @@only_connected_via_name, @@only_connected_via_dob, @@only_connected_via_email, @@only_connected_via_phone, @@only_connected_via_address, @@only_connected_via_ip, @@only_connected_via_id, @@only_connected_via_device, @@only_connected_via_application, @@only_connected_via_account, @@only_connected_via_card; OrAccum @connected_via_name, @connected_via_dob, @connected_via_email, @connected_via_phone, @connected_via_address, @connected_via_ip, @connected_via_id, @connected_via_device, @connected_via_application, @connected_via_account, @connected_via_card; SetAccum @middle_type; @@ -166,4 +185,4 @@ UPDATE DESCRIPTION OF QUERY get_party_cc_features "This query returns the follow UPDATE DESCRIPTION OF QUERY_PARAM get_party_cc_features.party "Source Party vertex for generating graph features." -UPDATE DESCRIPTION OF QUERY_PARAM get_party_cc_features.connections "Maximum outdegree of each PII vertex allowed in search used to filter out hub nodes. Defaults to 25000." \ No newline at end of file +UPDATE DESCRIPTION OF QUERY_PARAM get_party_cc_features.connections "Maximum outdegree of each PII vertex allowed in search used to filter out hub nodes. Defaults to 25000." diff --git a/financial_crime/entity_resolution_kyc/queries/get_party_fraud_status.gsql b/financial_crime/entity_resolution_kyc/queries/get_party_fraud_status.gsql index 5b75a5a0..5408d647 100644 --- a/financial_crime/entity_resolution_kyc/queries/get_party_fraud_status.gsql +++ b/financial_crime/entity_resolution_kyc/queries/get_party_fraud_status.gsql @@ -1,4 +1,23 @@ CREATE DISTRIBUTED QUERY get_party_fraud_status(SET> parties) { + + /* + Query Name: get_party_fraud_status + Bulk Retrieval of Party Fraud Status + + Purpose: Data Lookup and Status Verification + This query takes a set of specific `Party` vertices as input and quickly returns + the value of their `is_fraud` attribute. It is typically used for external systems + to retrieve known labels for downstream analysis, model evaluation, or auditing. + It performs no graph traversal, making it extremely fast. + + Key Use Cases: + . Model Evaluation: Retrieve ground truth labels for a set of Parties for model performance testing. + . System Integration: Fetch current fraud flags for multiple entities efficiently. + . Audit and Reporting: Generate a quick report on the current status of specific accounts. + + Parameters: + parties: A set of Party vertex IDs whose fraud status needs to be retrieved. + */ MapAccum, INT> @@fraud_status; @@ -13,4 +32,4 @@ CREATE DISTRIBUTED QUERY get_party_fraud_status(SET> parties) { UPDATE DESCRIPTION OF QUERY get_party_fraud_status "This query returns the integer fraud status of each provided Party vertex." -UPDATE DESCRIPTION OF QUERY_PARAM get_party_fraud_status.parties "Party vertices for obtaining fraud status as stored in is_fraud integer attribute." \ No newline at end of file +UPDATE DESCRIPTION OF QUERY_PARAM get_party_fraud_status.parties "Party vertices for obtaining fraud status as stored in is_fraud integer attribute." 
diff --git a/financial_crime/entity_resolution_kyc/queries/incremental_party_match.gsql b/financial_crime/entity_resolution_kyc/queries/incremental_party_match.gsql
index 67bfc9bc..22d48e58 100644
--- a/financial_crime/entity_resolution_kyc/queries/incremental_party_match.gsql
+++ b/financial_crime/entity_resolution_kyc/queries/incremental_party_match.gsql
@@ -14,6 +14,25 @@ CREATE OR REPLACE QUERY incremental_party_match(
     FLOAT threshold = 1.0 // if a total matching score between two entities exceed the threshold, a similarity edge will be created between the matching entities
 ) {
+  /*
+  Query Name: incremental_party_match
+  Real-Time Weighted Entity Resolution and Matching
+
+  Purpose: Incremental Graph Update and De-duplication
+  This query handles the full lifecycle of a new (or updated) Party record:
+  1. **Data Ingestion:** Parses a JSON payload and inserts the new Party and all associated PII/Entity nodes (like Address, IP, Account) into the graph.
+  2. **Scoring:** Calculates a **weighted match score** between the new Party and *all existing Parties* by summing the weights of their shared PII/Entities.
+  3. **Resolution:** If the score to an existing Party's community exceeds a defined `threshold`, the new Party is linked to that highest-scoring `Connected_Component` (CC), effectively merging it into an existing group.
+
+  Key Concept: Weighted Matching
+  Different PII/Entity types carry different evidential weight for matching (e.g., a shared Device ID is a strong match, a shared Name is weaker). This is controlled by the input `weight` parameters.
+
+  Parameters:
+  input: The JSON string containing the new Party and PII data.
+  Customer_[Entity]_weight: Weights defining the contribution of each shared PII/Entity type to the total score.
+  threshold: The minimum cumulative score required to declare a match and perform the merge.
+  */
+
 TYPEDEF TUPLE <FLOAT score, VERTEX<Connected_Component> ver> Connected_Component_heap_tuple;
 HeapAccum<Connected_Component_heap_tuple>(1, score DESC) @@Connected_Component_ID_heap;
 SetAccum @@edge_type_set, @@party_vertex, @@name, @@dob, @@email, @@phone_numbers, @@addresses, @@ip, @@ids, @@device, @@applications, @@accounts, @@cards;
@@ -459,4 +478,4 @@ UPDATE DESCRIPTION OF QUERY_PARAM incremental_party_match.Customer_Account_weigh

 UPDATE DESCRIPTION OF QUERY_PARAM incremental_party_match.Customer_Card_weight "Weight accumulation for matching Card PII vertices between provided input Party and every other Party in the graph. Defaults to 1.0."

-UPDATE DESCRIPTION OF QUERY_PARAM incremental_party_match.threshold "Accumulated weight threshold required for matching Party vertices into a Connected_Component community using the provided weight for each PII vertex type. Defaults to 1.0."
\ No newline at end of file
+UPDATE DESCRIPTION OF QUERY_PARAM incremental_party_match.threshold "Accumulated weight threshold required for matching Party vertices into a Connected_Component community using the provided weight for each PII vertex type. Defaults to 1.0."
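To make the weighted-matching idea concrete, here is a minimal, self-contained
sketch of the scoring hop only (not the kit's implementation: the JSON ingestion
and merge steps are omitted, and the weights, the 25000 super-node cutoff, and
the two-bucket CASE are illustrative assumptions; the anonymous-edge pattern
style follows the kit's own Top_Connected_Component query):

```gsql
CREATE QUERY weighted_match_sketch(VERTEX<Party> new_party,
    FLOAT device_weight = 5.0, FLOAT default_weight = 1.0,
    FLOAT threshold = 1.0) FOR GRAPH Entity_Resolution_KYC {
  SumAccum<FLOAT> @score;                      // accumulated match score per candidate

  Start = {new_party};
  // Hop out to shared PII/entity vertices, then back to other parties,
  // adding a per-type weight for every shared vertex along the way.
  Candidates =
    SELECT o
    FROM Start:p - () - :pii - () - Party:o
    WHERE o != p AND pii.outdegree() <= 25000  // skip super-node PII
    ACCUM CASE pii.type
            WHEN "Device" THEN o.@score += device_weight
            ELSE               o.@score += default_weight
          END
    HAVING o.@score >= threshold;              // keep only plausible matches

  PRINT Candidates[Candidates.@score];
}
```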
diff --git a/financial_crime/entity_resolution_kyc/queries/incremental_party_unify.gsql b/financial_crime/entity_resolution_kyc/queries/incremental_party_unify.gsql
index ab029e0b..31a1bb59 100644
--- a/financial_crime/entity_resolution_kyc/queries/incremental_party_unify.gsql
+++ b/financial_crime/entity_resolution_kyc/queries/incremental_party_unify.gsql
@@ -1,5 +1,27 @@
 CREATE OR REPLACE DISTRIBUTED QUERY incremental_party_unify(SET<VERTEX<Party>> parties) {
+  /*
+  Query Name: incremental_party_unify
+  Incremental Community Initialization (Connected Component Assignment)
+
+  Purpose: Graph Structure Completion
+  This query takes a set of newly inserted `Party` vertices that were not successfully
+  matched to an existing `Connected_Component` (CC) community by the matching logic
+  (e.g., `incremental_party_match`). It then assigns each of these unmatched Parties
+  to a **new, unique CC** of its own.
+
+  Core Logic:
+  1. **Self-Assignment:** Initially treat each Party (or small group linked by `Same_As` edges)
+     as its own CC.
+  2. **Propagation (Optional):** If the input Parties are linked by `Same_As` edges (a rare scenario
+     in a strictly incremental flow, but accounted for), this step uses the standard Connected
+     Component algorithm (specifically, label propagation) to group them.
+  3. **Creation:** Inserts a new `Connected_Component` vertex and links the Party to it.
+
+  Parameters:
+  parties: The set of new Party vertices that need to be assigned to a CC.
+  */
+
 MinAccum @cc_id; // Each vertex's tentative component id
 DATETIME start_time = now();
@@ -41,4 +63,4 @@ CREATE OR REPLACE DISTRIBUTED QUERY incremental_party_unify(SET<VERTEX<Party>> parties) {

 UPDATE DESCRIPTION OF QUERY incremental_party_unify "This query associates Party vertices unable to be matched to any existing Party using incremental_party_match into its own respective Connected_Component community vertex so it can be eligible for weighted weakly connected components matching in near real time."

-UPDATE DESCRIPTION OF QUERY_PARAM incremental_party_unify.parties "Party vertices to put into a Connected_Component community."
\ No newline at end of file
+UPDATE DESCRIPTION OF QUERY_PARAM incremental_party_unify.parties "Party vertices to put into a Connected_Component community."
diff --git a/financial_crime/entity_resolution_kyc/queries/match_party_entities.gsql b/financial_crime/entity_resolution_kyc/queries/match_party_entities.gsql
index 3f69c364..942aa444 100644
--- a/financial_crime/entity_resolution_kyc/queries/match_party_entities.gsql
+++ b/financial_crime/entity_resolution_kyc/queries/match_party_entities.gsql
@@ -18,6 +18,28 @@ CREATE OR REPLACE DISTRIBUTED QUERY match_party_entities(
     DATETIME compute_entities_after_date = to_datetime("1970-01-01 00:00:00") // All entities that were created in TG after this date, will be computed to find similarity edges related to these new entities. The default value is the earliest time possible (great for the first time running this query)
 ) {
+  /*
+  Query Name: match_party_entities
+  Batch-Based Weighted Entity Resolution (All-Pairs Similarity)
+
+  Purpose: Full Graph De-Duplication
+  This query is an **offline batch job** designed to find and link similar `Party` vertices
+  across the entire graph based on shared PII/Entities. It calculates a weighted score
+  for every pair of parties connected by a shared PII, and if the score exceeds the
+  `threshold`, it creates a `Same_As` edge between them.
+
+  Key Concepts:
+  . 
**Weighted Similarity:** Each shared PII type contributes a different weight to the final score. + . **Nested Batching:** The graph is divided into `source` and `target` batches to manage memory and parallelize the all-pairs comparison efficiently. + . **Super-Node Filtering:** PII nodes (e.g., a common IP address shared by millions of users) are filtered out using `pii_low_connections_limit` and `pii_high_connections_limit` to prevent false positives and poor performance. + + Parameters: + Customer_[Entity]_weight: Weights for each PII type defining match strength. + num_of_source_batches/num_of_target_batches: Control the size and number of batches for distributed processing. + threshold: Score required to create a `Same_As` link. + compute_entities_after_date: Incremental processing date filter, useful for re-running the job on recent data only. + */ + ListAccum> @entity_list; MapAccum,FLOAT> @entity_map; SumAccum @@count; @@ -265,4 +287,4 @@ UPDATE DESCRIPTION OF QUERY_PARAM match_party_entities.pii_low_connections_limit UPDATE DESCRIPTION OF QUERY_PARAM match_party_entities.pii_high_connections_limit "Maximum outdegree of PII vertex considered for 'high connection' matching. The case will be skipped if the same vertex is connected to too many entities. Defaults to 25000." -UPDATE DESCRIPTION OF QUERY_PARAM match_party_entities.compute_entities_after_date "All Party vertices with created_at after this date will be computed to find similarity edges related to these new entities. Defaults to 1970-01-01 00:00:00 (which is the earliest possible created_at and great for the initial running of this query)." \ No newline at end of file +UPDATE DESCRIPTION OF QUERY_PARAM match_party_entities.compute_entities_after_date "All Party vertices with created_at after this date will be computed to find similarity edges related to these new entities. Defaults to 1970-01-01 00:00:00 (which is the earliest possible created_at and great for the initial running of this query)." diff --git a/financial_crime/entity_resolution_kyc/queries/output_party_cc_to_file.gsql b/financial_crime/entity_resolution_kyc/queries/output_party_cc_to_file.gsql index 32543849..01919618 100644 --- a/financial_crime/entity_resolution_kyc/queries/output_party_cc_to_file.gsql +++ b/financial_crime/entity_resolution_kyc/queries/output_party_cc_to_file.gsql @@ -1,5 +1,24 @@ CREATE OR REPLACE QUERY output_party_cc_to_file(STRING output_file_path = "/home/tigergraph/gsql_output/party_cc_output.txt") { - + /* + Query Name: output_party_cc_to_file + Export Party-to-Community Mapping to File + + Purpose: Data Export and Downstream Integration + The primary goal of this query is to **generate a plain text or CSV file** that maps + every `Party` vertex in the graph to the `Connected_Component` community vertex it + currently belongs to. This output is critical for: + . **Model Training:** Providing ground truth labels (community IDs) for machine learning models. + . **Reporting/Auditing:** Creating a comprehensive record of the current entity resolution state. + . **External System Integration:** Loading the community assignments into a data warehouse or external database. + + Key Concepts: + . FILE Object: TigerGraph's utility to write data directly to a file on the server. + . Simple Traversal: A single hop is used to link the Party to its assigned community. + + Parameters: + output_file_path: The full path where the output file (e.g., CSV) should be created on the TigerGraph server. 
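+
+    Example invocation (the path is hypothetical; the directory must exist and
+    be writable by the TigerGraph process):
+      RUN QUERY output_party_cc_to_file("/home/tigergraph/gsql_output/party_cc_output.csv")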
+  */
+
 FILE f (output_file_path);

 f.println("Party", "Connected_Component");
@@ -17,4 +36,4 @@ CREATE OR REPLACE QUERY output_party_cc_to_file(STRING output_file_path = "/home

 UPDATE DESCRIPTION OF QUERY output_party_cc_to_file "This query outputs a file containing a mapping of all Party vertices to their respective Connected_Component community vertex."

-UPDATE DESCRIPTION OF QUERY_PARAM output_party_cc_to_file.output_file_path "File path location containing the output Party vertex to Connected_Component vertex in CSV format. Defaults to /home/tigergraph/gsql_output/party_cc_output.csv"
\ No newline at end of file
+UPDATE DESCRIPTION OF QUERY_PARAM output_party_cc_to_file.output_file_path "File path location containing the output Party vertex to Connected_Component vertex in CSV format. Defaults to /home/tigergraph/gsql_output/party_cc_output.csv"
diff --git a/financial_crime/entity_resolution_kyc/queries/top_connected_component.gsql b/financial_crime/entity_resolution_kyc/queries/top_connected_component.gsql
new file mode 100644
index 00000000..820e41bc
--- /dev/null
+++ b/financial_crime/entity_resolution_kyc/queries/top_connected_component.gsql
@@ -0,0 +1,34 @@
+CREATE OR REPLACE DISTRIBUTED QUERY Top_Connected_Component(/* Parameters here */) FOR GRAPH Entity_Resolution_KYC {
+
+  /*
+  Query Name: Top_Connected_Component
+  Top Connected Component Summary Analysis
+
+  Purpose: Community Profiling and Risk Prioritization
+  This query traverses the graph to aggregate key statistics for each `Connected_Component` (CC)
+  vertex. It focuses on gathering the total count of associated `Party` vertices, the number of
+  unique PII entities (specifically `Address` and `Phone`), and a cumulative risk score. The
+  results are then sorted to show the largest/most active communities first.
+
+  Key Metrics Calculated per Connected_Component:
+  . Parties: Total number of Party vertices belonging to the CC.
+  . Address: Total number of unique Address PII vertices linked to the CC.
+  . Phone: Total number of unique Phone PII vertices linked to the CC.
+  . Risk: The sum of the `is_fraud` attribute from all associated Party vertices (cumulative risk score).
+  */
+
+  SumAccum<INT> @parties, @address, @phone, @risk;
+  res =
+    SELECT s FROM
+      Connected_Component:s - () - Party:p - () - (Address|Phone):t
+    ACCUM
+      // tally the shared PII on the component vertex itself
+      CASE WHEN t.type == "Address" THEN s.@address += 1 ELSE s.@phone += 1 END
+      //CASE WHEN p.is_fraud > 0 THEN s.@risk += 1 END
+    POST-ACCUM
+      s.@parties += s.outdegree("Entity_In_Ring"),
+      s.@risk += sum(s.neighborAttribute("Entity_In_Ring", "Party", "is_fraud"))
+    ORDER BY s.@parties DESC;
+  PRINT res[res.id as ComponentId, res.@parties as Parties, res.@address as Address, res.@phone as Phone, res.@risk as Risk];
+}
diff --git a/financial_crime/entity_resolution_kyc/queries/unify_party_entities.gsql b/financial_crime/entity_resolution_kyc/queries/unify_party_entities.gsql
index c469f5b9..6deb61e8 100644
--- a/financial_crime/entity_resolution_kyc/queries/unify_party_entities.gsql
+++ b/financial_crime/entity_resolution_kyc/queries/unify_party_entities.gsql
@@ -1,5 +1,26 @@
 CREATE OR REPLACE DISTRIBUTED QUERY unify_party_entities() {
+  /*
+  Query Name: unify_party_entities
+  Finalization of Entity Resolution: Connected Component Assignment
+
+  Purpose: Clustering and Community Creation
+  This query is the final, essential step in a batch Entity Resolution pipeline.
It performs + the Connected Components (CC) algorithm, using the transitive links established by the + `Same_As` similarity edges (created by `match_party_entities`), to **group all connected + `Party` vertices** into single, distinct communities. + + Core Logic: Label Propagation (MinAccum) + 1. **Initialization:** Every Party vertex is initially labeled with its own unique ID. + 2. **Propagation:** Smaller IDs are iteratively passed across `Same_As` edges until every + Party in a connected cluster has adopted the smallest ID among them. This smallest ID + becomes the unique identifier for the entire community (the CC ID). + 3. **Creation:** New `Connected_Component` vertices are created using these final CC IDs, + and linking edges (`Entity_In_Ring`) are established. + + Key Outcome: Graph structure is finalized with explicit community assignments. + */ + MinAccum @cc_id; // Each vertex's tentative component id DATETIME start_time = now(); @@ -39,4 +60,4 @@ CREATE OR REPLACE DISTRIBUTED QUERY unify_party_entities() { } -UPDATE DESCRIPTION OF QUERY unify_party_entities "This query associates all Party vertices in the graph to a Connected_Component vertex using Same_As edge previously inserted by match_party_entities query." \ No newline at end of file +UPDATE DESCRIPTION OF QUERY unify_party_entities "This query associates all Party vertices in the graph to a Connected_Component vertex using Same_As edge previously inserted by match_party_entities query." diff --git a/financial_crime/mule_account_detection/README.md b/financial_crime/mule_account_detection/README.md new file mode 100644 index 00000000..548dd816 --- /dev/null +++ b/financial_crime/mule_account_detection/README.md @@ -0,0 +1,359 @@ +# Mule Account Detection Solution +**A graph-based analytics solution for identifying mule accounts, suspicious transactions, and hidden fraud patterns in real-time.** + +--- + +## Contents + +- [Overview](#overview) +- [ML Features](#ml-features) +- [Queries Included](#queries-included) +- [Components](#components) +- [Prerequisites](#prerequisites) +- [Quickstart Guide](#quickstart-guide) +- [Query Execution Order and Explanations](#query-execution-order-and-explanations) +- [Load Sample Data](#load-sample-data) +- [Graph Schema](#graph-schema) +- [Sample Dataset](#sample-dataset) +- [Using Your Own Data](#using-your-own-data) +- [Cleanup](#cleanup-optional) +- [Insights Applications](#insights-applications) + +--- + +## Overview + +Mule account detection in financial crime is a critical endeavor aimed at +identifying bank accounts that are used to receive and disperse money from +illicit activities. These mule accounts, which may be involved either knowingly +or unknowingly in these operations, present a significant challenge for financial +institutions. The rapid movement of funds through an extensive and seemingly +unconnected network of accounts, spread across numerous financial institutions, +complicates the tracking and halting of such illicit transactions. Financial +institutions are therefore tasked with detecting this activity promptly to +prevent further fund transfer and ensure the return of assets to their rightful +owners. TigerGraph's solutions address this challenge through a suite of +sophisticated graph algorithms that enable real-time monitoring, feature engineering +for machine learning, and anomaly detection to trace illegal funds effectively. 
+Community Detection is employed to unveil clusters within the transaction network, +highlighting groups of accounts that work in concert to move illicit funds. +The Centrality (PageRank) algorithm identifies key accounts that act as +central nodes in the distribution network, crucial for disrupting the flow +of illicit money. Closeness (Shortest Path) analysis reveals the most direct +routes for money laundering, aiding in the trace back to the source. Lastly, +Deep Link Analysis uncovers hidden connections between accounts, providing a +comprehensive understanding of the network's structure and operation. Together, +these algorithms form the backbone of TigerGraph's approach to dismantling the +complex networks of mule accounts, safeguarding the financial system against +the movement of illicit funds. + +--- +# ML Features + +### Here is a list of graph features we provide in the solution kit: + +1. **Community Detection**: + We use the weighted Weakly Connected Components (WCC) algorithm to detect + communities within the graph. This includes updating the size of each community + and assigning community IDs, which help enhance the machine learning models + for identifying mule accounts. + +2. **PageRank Score**: + The PageRank score for each account vertex in the account network is calculated. + This score represents the importance or influence of each account, aiding the + machine learning models in detecting mule accounts. + +3. **Shortest Path Length**: + We calculate the shortest path length from each account to identified mule + accounts. This metric helps determine the proximity of each account to known + mule accounts, providing valuable features for the models. + +4. **IP Sharing**: + The number of mule accounts that share the same IP address for each account + is calculated. This feature helps identify suspicious accounts connected + through the same IP address. + +5. **Multi-Hop IP Analysis**: + We calculate the number of IP addresses linked to any mule accounts within a + specified number of hops. This feature identifies potentially fraudulent IP + connections, enhancing the machine learning models. + +6. **Device Sharing**: + The number of mule accounts that share the same device for each account is + calculated. This aids in identifying suspicious accounts connected through + the same device. + +7. **Multi-Hop Device Analysis**: + We calculate the number of devices linked to any mule accounts within a + specified number of hops, helping to identify potentially fraudulent device + connections. + +8. **Transfer Ratio**: + The ratio of the total amount of money transferred to and from mule accounts + relative to the total transfer amount for each account is calculated. This + includes the ratio of incoming and outgoing transfers, identifying suspicious + transfer patterns. + +9. **Multi-Hop Mule Account Analysis**: + We calculate the total number of mule accounts within a specified number of + hops from each account. This helps assess the proximity and density of mule + accounts in the network. + +## Queries Included + +Below is a list of queries included in this solution. + +### **1. Identify Potential Mule Accounts** +Scores accounts based on suspicious behavior patterns such as high-velocity flows, +layering, quick pass-through transactions, etc. + +### **2. Multi-hop Transaction Tracing** +Finds the complete chain of fund movement starting from a given account up to N hops. + +### **3. 
Circular Transaction Finder**
+Detects loops where money moves between accounts and eventually returns to the origin.
+
+### **4. Suspicious Cluster Detection**
+Uses graph connectivity to identify tightly linked groups behaving in coordinated ways.
+
+### **5. Behavioral Anomaly Detection**
+Compares current account transaction behavior against its own historical patterns.
+
+### **6. Account Profile Explorer**
+Retrieves everything related to a specific account: customer, devices, IPs,
+merchants, inflow/outflow, partner accounts, and more.
+
+### **7. Counterparty Risk Aggregation**
+Measures the risk level of counterparties transacting with a given account.
+
+### **8. Velocity-based Red Flags**
+Detects spikes in incoming or outgoing amounts over short windows.
+
+
+# Components
+This repository includes multiple components:
+
+- `Schema` - Definition of database schema.
+- `Queries` - Collection of GSQL queries.
+- `Mock Data` - Sample data.
+- `Loading Jobs` - Two loading jobs: one for cloud integration and the other for on-prem local deployment.
+- `Insights Applications` - ROI Dashboard and Analytical Insights applications.
+- `README.md` - This usage guide.
+- `setup.sh` - Automated setup script.
+
+
+## Prerequisites
+
+On TG Savanna, you can install this kit with the click of a button; the
+following prerequisites apply to an on-prem setup.
+
+You will need:
+
+- **TigerGraph 3.x or 4.x**
+- GSQL client access
+- RESTPP endpoint enabled
+- Ability to install schema + queries
+- Linux shell or Windows WSL for running scripts
+
+---
+
+## Quickstart Guide
+
+### 1. Create the schema
+
+This schema is a subset of the Super Schema in `financial_crime/library/schema`.
+First create the global vertex and edge types, then create the graph for the
+mule-account detection solution:
+
+```bash
+gsql /home/tigergraph/solution_kits/financial_crime/library/schema/general_global_financial_crime_super_schema.gsql
+
+gsql /home/tigergraph/solution_kits/financial_crime/mule_account_detection/schema/create_schema_bottomup.gsql
+```
+
+### 2. Load data
+
+Load data into the schema by running the local loading job script:
+
+```bash
+gsql /home/tigergraph/solution_kits/financial_crime/mule_account_detection/load/local_loading_job.gsql
+```
+
+### 3. Install queries
+
+```bash
+./install_queries.sh
+```
+
+## Query Execution Order and Explanations
+
+### Step 1: Insert Edges for the Account Network
+
+To initiate the `wcc` and `pagerank` algorithms on the account network, first
+execute the following query:
+
+- `account_account_with_weights`
+
+### Step 2: Form Communities
+
+The following query runs the weighted WCC algorithm using the weights on the
+`Account_Account` edges:
+
+- `tg_wcc_account_with_weights`
+
+Subsequent queries can be run once the steps above have completed.
+
+### Step 3: Feature Engineering Queries
+
+The feature engineering queries generate feature values to feed the downstream ML model.
+These queries aggregate the values and propagate the features as attributes of the `Account` vertex.
+To use the ML model documented in the `model` folder, the following feature engineering queries need
+to be executed. The table below lists the attribute name and type corresponding to each query.
+
+| Query | Attribute Name (Feature) | Type |
+|-------------------------------------|----------------------------|--------|
+| `tg_wcc_account_with_weights` | `com_size` & `com_id` | INT |
+| `tg_pagerank_wt_account` | `pagerank` | FLOAT |
+| `tg_shortest_path_length_account` | `shortest_path_length` | INT |
+| `number_of_mule_accounts_on_same_IP`| `ip_collision` | INT |
+| `n_hop_fraud_count_ip` | `fraud_ip` | INT |
+| `number_of_mule_accounts_on_same_device` | `device_collision` | INT |
+| `n_hop_fraud_count_device` | `fraud_device` | INT |
+| `ratio_of_mule_account_transfer` | `trans_in_mule_ratio (out)`| FLOAT |
+| `n_hop_number_of_total_mule_account`| `mule_cnt` | INT |
+
+### Step 4: Investigation Queries and Information Retrieval Queries
+
+These queries can be executed at any time and have no dependencies on the previous queries.
+The investigation queries are designed to help investigate certain accounts, facilitating deep analysis. These queries include:
+
+- `single_party_PII`: Provides personally identifiable information for a party involved in transactions.
+- `attributes_to_party_traversal`: Traverses from transaction attributes to the parties involved.
+- `party_full_address`: Retrieves the full address details for a party involved in transactions.
+
+## Load Sample Data
+
+You can load the provided CSV files via:
+
+```bash
+gsql -g MuleDetection run loading_job load_sample
+```
+
+This loads:
+
+- Customers
+- Accounts
+- Transactions
+- Devices
+- IPs
+- Merchants
+- Geolocation data
+
+---
+
+## Graph Schema
+
+The sample schema includes the following **vertex types**:
+
+- **Customer** - account owner
+- **Account** - bank accounts under the customer
+- **Transaction** - money movement events
+- **Device** - login devices
+- **IP_Address** - IP used for login
+- **Merchant** - merchant receiving funds
+- **Location** - geographical tag
+
+### **Edges**
+
+- `OWNS` - Customer -> Account
+- `HAS_TRANSACTION` - Account -> Transaction
+- `SENT_TO` - Transaction -> Account (target)
+- `USED_DEVICE` - Customer -> Device
+- `USED_IP` - Device -> IP_Address
+- `VISITED` - Customer -> Location
+- `RECEIVED_BY` - Transaction -> Merchant
+
+This structure enables deep multi-hop investigative analytics.
+
+---
+
+## Sample Dataset
+
+The `data` folder is populated with sample data files. These files are crafted
+to closely mimic real-world scenarios, providing a realistic context for testing and demonstration purposes.
+
+This dataset includes:
+
+- Customers with IDs and profiles
+- Accounts linked to customers
+- Synthetic but realistic transactions
+- Complex money movement chains
+- Suspicious clusters intentionally injected
+- Devices, IPs, and merchants for contextual enrichment
+
+This dataset allows you to test every query in the solution end-to-end.
+
+---
+
+## Using Your Own Data
+
+Update your mapping in:
+
+```
+/data/your_sources/
+```
+
+Then modify:
+
+```
+load.gsql
+```
+
+to match your column names.
+
+You may also adjust:
+
+- scoring weights
+- anomaly thresholds
+- cycle detection sensitivity
+
+---
+
+## Cleanup (Optional)
+
+Remove all vertices, edges, and data:
+
+```bash
+gsql -g MuleDetection drop all
+```
+
+Or drop the graph entirely:
+
+```bash
+gsql drop graph MuleDetection
+```
+
+## Insights Applications
+
+There are two insights applications:
+
+- **ROI Dashboard**
+
+  - Shows the transaction fraud losses.
+  - Live total fraud losses and amount saved by TigerGraph.
+  - Live fraud distribution.
+  - ML performance.
+ +- **Mule Account Detection** + - The application has 5 pages: + - Account network pagerank. + - WCC Community. + - Shortest Path. + - IP Sharing Analysis. + - Device Sharing Analysis. + diff --git a/financial_crime/mule_account_detection/ReadMe.md b/financial_crime/mule_account_detection/ReadMe.md deleted file mode 100644 index 6adde3bc..00000000 --- a/financial_crime/mule_account_detection/ReadMe.md +++ /dev/null @@ -1,114 +0,0 @@ -# Narratives -Mule account detection in financial crime is a critical endeavor aimed at identifying bank accounts that are used to receive and disperse money from illicit activities. These mule accounts, which may be involved either knowingly or unknowingly in these operations, present a significant challenge for financial institutions. The rapid movement of funds through an extensive and seemingly unconnected network of accounts, spread across numerous financial institutions, complicates the tracking and halting of such illicit transactions. Financial institutions are therefore tasked with detecting this activity promptly to prevent further fund transfer and ensure the return of assets to their rightful owners. TigerGraph's solutions address this challenge through a suite of sophisticated graph algorithms that enable real-time monitoring, feature engineering for machine learning, and anomaly detection to trace illegal funds effectively. Community Detection is employed to unveil clusters within the transaction network, highlighting groups of accounts that work in concert to move illicit funds. The Centrality (PageRank) algorithm identifies key accounts that act as central nodes in the distribution network, crucial for disrupting the flow of illicit money. Closeness (Shortest Path) analysis reveals the most direct routes for money laundering, aiding in the trace back to the source. Lastly, Deep Link Analysis uncovers hidden connections between accounts, providing a comprehensive understanding of the network's structure and operation. Together, these algorithms form the backbone of TigerGraph's approach to dismantling the complex networks of mule accounts, safeguarding the financial system against the movement of illicit funds. - -# Components -This repository includes multiple components: - -- `Schema` - Definition of database schema. -- `Queries` - Collection of GSQL queries. -- `Mock Data` - Sample data. -- `Loading Jobs` - two loading jobs: one for cloud integration and the other for on-prem local deployment. -- `Insights Applications` - ROI Dashboard and Analytical Insights Applicaitions. -- `README.md` - This usage guide. -- `setup.sh` - Automated setup script. - -# ML Features -### Here is a list of graph features we provide in the solution kit: - -1. **Community Detection**: - We use the weighted Weakly Connected Components (WCC) algorithm to detect communities within the graph. This includes updating the size of each community and assigning community IDs, which help enhance the machine learning models for identifying mule accounts. - -2. **PageRank Score**: - The PageRank score for each account vertex in the account network is calculated. This score represents the importance or influence of each account, aiding the machine learning models in detecting mule accounts. - -3. **Shortest Path Length**: - We calculate the shortest path length from each account to identified mule accounts. This metric helps determine the proximity of each account to known mule accounts, providing valuable features for the models. - -4. 
**IP Sharing**: - The number of mule accounts that share the same IP address for each account is calculated. This feature helps identify suspicious accounts connected through the same IP address. - -5. **Multi-Hop IP Analysis**: - We calculate the number of IP addresses linked to any mule accounts within a specified number of hops. This feature identifies potentially fraudulent IP connections, enhancing the machine learning models. - -6. **Device Sharing**: - The number of mule accounts that share the same device for each account is calculated. This aids in identifying suspicious accounts connected through the same device. - -7. **Multi-Hop Device Analysis**: - We calculate the number of devices linked to any mule accounts within a specified number of hops, helping to identify potentially fraudulent device connections. - -8. **Transfer Ratio**: - The ratio of the total amount of money transferred to and from mule accounts relative to the total transfer amount for each account is calculated. This includes the ratio of incoming and outgoing transfers, identifying suspicious transfer patterns. - -9. **Multi-Hop Mule Account Analysis**: - We calculate the total number of mule accounts within a specified number of hops from each account. This helps assess the proximity and density of mule accounts in the network. - -# Instructions - -1. **Schema Creation**: This schema is a subset of the Super Schema in financial_crime/library/schema. To create the schema, first run `gsql /home/tigergraph/solution_kits/financial_crime/library/schema/general_global_financial_crime_super_schema.gsql` to generate global vertex and edge types. Then, user can run `/home/tigergraph/solution_kits/financial_crime/mule_account_detection/schema/create_schema_bottomup.gsql` to create the graph. -2. **Data Loading**: Load data into the schema by running the data loading job with the `local_loading_job.gsql` script. -3. **Query Installation**: Completes the setup by installing necessary queries through the `install_queries.sh` script. - -## Query Execution Order and Explanations - -### Step 1: Insert Edges for Merchant and Card Networks - -To initiate the `wcc` and `pagerank` algorithms on the Merchant and Card network, it's essential to first execute the following two queries: - -- `account_account_with_weights` - -### Step 2: Form Communities - -The following query runs weighted wcc algorithm using the weights on Account_Account edge. - -- `tg_wcc_account_with_weights` - - -Subsequent queries can be run following the completion of the aforementioned two. - -### Step 3: Feature Engineering Queries - -The feature engineering queries generate feature values to feed the downstream ML model. These queries aggregate the values and propagate the features as attributes of the `Account` vertex. To use the ML model documented in the `model` folder, the following feature engineering queries need to be executed. The right column lists the attribute name corresponding to the query. 
-| Query | Attribute Name(Feature) | Type | -|-------------------------------------|----------------------------|--------| -| `tg_wcc_account_with_weights` | `com_size` & `com_id` | INT | -| `tg_pagerank_wt_account` | `pagerank` | FLOAT | -| `tg_shortest_path_length_account` | `shortest_path_length` | INT | -| `number_of_mule_accounts_on_same_IP`| `ip_collision` | INT | -| `n_hop_fraud_count_ip` | `fraud_ip` | INT | -| `number_of_mule_accounts_on_same_device` | `device_collision` | INT | -| `n_hop_fraud_count_device` | `fraud_device` | INT | -| `ratio_of_mule_account_transfer` | `trans_in_mule_ratio (out)`| FLOAT | -| `n_hop_number_of_total_mule_account`| `mule_cnt` | INT | - -### Step 4: Investigation Queries and Information Retrieval Queries - -These queries can be executed at any time and have no dependencies on the previous queries. The investigation queries are designed to help investigate certain accounts, facilitating deep analysis. These queries include: - -- `single_party_PII`: Provides personally identifiable information for a party involved in transactions. -- `attributes_to_party_traversal`: Traverses from transaction attributes to the parties involved. -- `party_full_address`: Retrieves the full address details for a party involved in transactions. - - -## Mock Data - -The `data` folder is populated with sample data files. These files are crafted to closely mimic real-world scenarios, providing a realistic context for testing and demonstration purposes. - - - -## Insights Applications - -There are two insights applications: -- **ROI Dashboard** - - Shows the transaction fraud losses. - - Live total fraud losses and amount saved by TigerGraph. - - Live fraud distribution. - - ML performance. - -- **Mule Account Detection** - - The application has 5 pages: - - Account network pagerank. - - WCC Community. - - Shortest Path. - - IP Sharing Analysis. - - Device Sharing Analysis. 
- \ No newline at end of file diff --git a/financial_crime/mule_account_detection/meta/Insights_mule_analytics.json b/financial_crime/mule_account_detection/meta/Insights_mule_analytics.json index 94a7dfc5..eada1dd7 100644 --- a/financial_crime/mule_account_detection/meta/Insights_mule_analytics.json +++ b/financial_crime/mule_account_detection/meta/Insights_mule_analytics.json @@ -1,12 +1,12 @@ { "defaultGraph": "Mule_Account_Detection", "iconURL": "/studio/assets/gvis/icons/builtin/64/285-asset.png", - "id": "fHvaU5uuocyNTvLK1hvnhZ", + "id": "nm7u6Gfy5RyCjFn5a4XXvX", "owner": "tigergraph", "pageConfigSeparated": true, "title": "Mule Account Detection", "userRoleForApp": "owner", - "version": "1718182992759391308", + "version": "1772468490523977939", "pages": [ { "globalParameters": { @@ -56,7 +56,7 @@ ] }, "title": "Device Conflicts", - "version": "1718182992772794626", + "version": "1772468490536218944", "weight": 65, "chartMap": { "bRkhvjVhY1TWQcogdN5qWX": { @@ -156,7 +156,7 @@ "staticData": "{}", "title": "Device Sharing Table", "type": "table", - "version": "1718182992806038828" + "version": "1772468490570044181" }, "bsGHYaUbye71mmcqUw73hu": { "chartSettings": { @@ -193,7 +193,7 @@ "staticData": "{}", "title": "Device Sharing Graph", "type": "internal-graph", - "version": "1718182992813412007" + "version": "1772468490577999522" }, "eQZKGCXHQALSLsewAjs1ea": { "chartSettings": { @@ -220,7 +220,7 @@ "staticData": "{}", "title": "Input Account", "type": "Inputs", - "version": "1718182992819392426" + "version": "1772468490584087918" } } }, @@ -298,7 +298,7 @@ ] }, "title": "Account_Network_Pagerank", - "version": "1718182992778659189", + "version": "1772468490542439646", "weight": 20, "chartMap": { "4QYhdJVbfoeUnGjAzhi5sG": { @@ -359,7 +359,7 @@ "staticData": "{}", "title": "Top Pagerank Score Account", "type": "table", - "version": "1718182992825778629" + "version": "1772468490589993580" }, "d3UvsqzXpUWp8BroNYLgqM": { "chartSettings": { @@ -389,7 +389,7 @@ "staticData": "{}", "title": "Input", "type": "Inputs", - "version": "1718182992833516058" + "version": "1772468490595807693" }, "nLdSm9KfPcJrmCcdi8Y1gW": { "chartSettings": {}, @@ -403,7 +403,7 @@ "staticData": "{}", "title": "Account Network Pagerank", "type": "internal-graph", - "version": "1718182992840084316" + "version": "1772468490603726226" } } }, @@ -452,7 +452,7 @@ ] }, "title": "WCC Community", - "version": "1718182992784492285", + "version": "1772468490551008026", "weight": 35, "chartMap": { "jkio2L3Um2ufCJ7gkybBWw": { @@ -529,7 +529,7 @@ "staticData": "{}", "title": "WCC Table", "type": "table", - "version": "1718182992846644161" + "version": "1772468490609449424" }, "rVesAEnUrQfHxr86YV2rfy": { "chartSettings": {}, @@ -543,7 +543,7 @@ "staticData": "{}", "title": "Communities", "type": "internal-graph", - "version": "1718182992853827816" + "version": "1772468490615273082" }, "xsBfPK3EJ6n4J9gZMoxdhs": { "chartSettings": { @@ -573,11 +573,60 @@ "staticData": "{}", "title": "Input", "type": "Inputs", - "version": "1718182992859656801" + "version": "1772468490624413433" } } }, { + "globalParameters": { + "account": { + "id": "input_4bVYZSdcBZcMkXxvqLiAfC", + "name": "account", + "type": "VERTEX", + "value": { + "vertexID": "0448743965", + "vertexType": "Account" + } + } + }, + "iconURL": "/studio/assets/gvis/icons/builtin/64/052-cells.png", + "id": "sXcEF1LsFkipMZXidn8WoX", + "isDetail": true, + "isNew": false, + "layouts": { + "md": [ + { + "h": 45, + "i": "eGUYb6o4Lf5YaTokas5xi8", + "moved": false, + "static": false, + "w": 
6, + "x": 0, + "y": 0 + }, + { + "h": 35, + "i": "m4XfGCnug9rZ6zMXUX8eth", + "moved": false, + "static": false, + "w": 6, + "x": 6, + "y": 10 + }, + { + "h": 10, + "i": "umcMFXKpivgT6ftHb3sf2R", + "moved": false, + "static": false, + "w": 6, + "x": 6, + "y": 0 + } + ] + }, + "title": "Shortest Path", + "version": "1772468490556807422", + "weight": 45, "chartMap": { "eGUYb6o4Lf5YaTokas5xi8": { "chartSettings": { @@ -618,28 +667,19 @@ }, { "data": "account", - "id": "87a48d71-cdd3-4317-91e9-512550c26af6", + "id": "ac6681b5-e84c-4694-8960-c1da25d2f256", "paramGlobalInput": "account", "paramName": "ver", "paramType": "VERTEX", "paramTypeReadonly": true, "type": "PARAM", "vertexType": "Account" - }, - { - "data": "number_paths", - "id": "f4ee8d07-3306-4f4c-8e03-ce502c62e0e4", - "paramGlobalInput": "number_paths", - "paramName": "print_number", - "paramType": "INT", - "paramTypeReadonly": true, - "type": "PARAM" } ], "staticData": "{}", "title": "shortest path", "type": "internal-graph", - "version": "1718226734254095335" + "version": "1772600035200321987" }, "m4XfGCnug9rZ6zMXUX8eth": { "chartSettings": { @@ -791,28 +831,19 @@ }, { "data": "account", - "id": "a8d8e336-d857-41b0-ab62-f87252703050", + "id": "f1427c4b-90db-4498-a62d-b37bb466db93", "paramGlobalInput": "account", "paramName": "ver", "paramType": "VERTEX", "paramTypeReadonly": true, "type": "PARAM", "vertexType": "Account" - }, - { - "data": "number_paths", - "id": "18491db9-f7a0-4546-a30f-45dbcc86e868", - "paramGlobalInput": "number_paths", - "paramName": "print_number", - "paramType": "INT", - "paramTypeReadonly": true, - "type": "PARAM" } ], "staticData": "{}", "title": "Shortest Path Table", "type": "table", - "version": "1718227114441036065" + "version": "1772600048009176957" }, "umcMFXKpivgT6ftHb3sf2R": { "chartSettings": { @@ -838,58 +869,9 @@ "staticData": "{}", "title": "Input", "type": "Inputs", - "version": "1718227489486288049" - } - }, - "globalParameters": { - "account": { - "id": "input_4bVYZSdcBZcMkXxvqLiAfC", - "name": "account", - "type": "VERTEX", - "value": { - "vertexID": "0448743965", - "vertexType": "Account" - } + "version": "1772468490644947116" } - }, - "iconURL": "/studio/assets/gvis/icons/builtin/64/052-cells.png", - "id": "sXcEF1LsFkipMZXidn8WoX", - "isDetail": true, - "isNew": false, - "layouts": { - "md": [ - { - "h": 45, - "i": "eGUYb6o4Lf5YaTokas5xi8", - "moved": false, - "static": false, - "w": 6, - "x": 0, - "y": 0 - }, - { - "h": 35, - "i": "m4XfGCnug9rZ6zMXUX8eth", - "moved": false, - "static": false, - "w": 6, - "x": 6, - "y": 10 - }, - { - "h": 10, - "i": "umcMFXKpivgT6ftHb3sf2R", - "moved": false, - "static": false, - "w": 6, - "x": 6, - "y": 0 - } - ] - }, - "title": "Shortest Path", - "version": "1718227489476782113", - "weight": 45 + } }, { "globalParameters": { @@ -939,7 +921,7 @@ ] }, "title": "IP Conflicts", - "version": "1718182992797652966", + "version": "1772468490562837106", "weight": 55, "chartMap": { "1dQRNR2aD7aFypgUHjQHRn": { @@ -967,7 +949,7 @@ "staticData": "{}", "title": "Input Account", "type": "Inputs", - "version": "1718182992883991368" + "version": "1772468490652793186" }, "2JscpDG7RsknwWoxrytfGT": { "chartSettings": { @@ -1004,7 +986,7 @@ "staticData": "{}", "title": "IP Sharing Graph", "type": "internal-graph", - "version": "1718182992890948957" + "version": "1772468490658972004" }, "gFu8x13AQTuCwQdnTTn18k": { "chartSettings": { @@ -1103,7 +1085,7 @@ "staticData": "{}", "title": "IP Sharing Table", "type": "table", - "version": "1718182992897871800" + "version": 
"1772468490665221019" } } } diff --git a/financial_crime/mule_account_detection/queries/account_account_with_weights.gsql b/financial_crime/mule_account_detection/queries/account_account_with_weights.gsql index 6b97f395..d33dcb43 100644 --- a/financial_crime/mule_account_detection/queries/account_account_with_weights.gsql +++ b/financial_crime/mule_account_detection/queries/account_account_with_weights.gsql @@ -4,14 +4,15 @@ CREATE DISTRIBUTED QUERY account_account_with_weights( INT min_edge_weight=1) FOR GRAPH Mule_Account_Detection { /* - This query creates a Account_Account edges for every pair of accounts that share the same transfer_transaction. - It then assigns weights to these connections based on the frequency of transactions involving the common merchant - pair across all cards. Weighted Personalized PageRank algorithm will be applied to this account_account network. + This query creates Account_Account edges for every pair of accounts that + share the same transfer_transaction. It then assigns weights to these + connections based on the frequency of transactions involving the common merchant + pair across all cards. Weighted Personalized PageRank algorithm will be + applied to this account_account network. */ SetAccum> @accounts; MapAccum, INT> @edge_weight; - //clear previous Account_Account edges tmp = SELECT s FROM Account:s -(Account_Account:e)- Account:t @@ -31,4 +32,4 @@ CREATE DISTRIBUTED QUERY account_account_with_weights( END END; PRINT "Account_Account Edges Inserted " AS Status; - } \ No newline at end of file + } diff --git a/financial_crime/transaction_fraud/README.md b/financial_crime/transaction_fraud/README.md new file mode 100644 index 00000000..d1a31d5b --- /dev/null +++ b/financial_crime/transaction_fraud/README.md @@ -0,0 +1,224 @@ +# Credit Card Transaction Fraud Detection + +A real-time, graph-powered solution for detecting credit card fraud rings, +busting coordinated attacks, and preventing financial losses using TigerGraph. + +## Contents + +- [Overview](#overview) +- [Components](#components) +- [Prerequisites](#prerequisites) +- [Quickstart Guide](#quickstart-guide) +- [Query Execution Order and Explanations](#query-execution-order-and-explanations) +- [ML Features](#ml-features) +- [Graph Schema](#graph-schema) +- [Sample Dataset](#sample-dataset) +- [Using Your Own Data](#using-your-own-data) +- [Cleanup (Optional)](#cleanup-optional) +- [Insights Applications](#insights-applications) +- [ML Model and Insights Application](#ml-model-and-insights-application) + +--- + + +## Overview + +Credit card transaction fraud detection identifies and prevents unauthorized +or deceptive transactions in real-time. It analyzes transaction data, +including cardholder information, transaction details, and historical patterns, +to detect anomalies and suspicious activity indicative of fraud. TigerGraph +models complex relationships and patterns among entities such as cardholders, +merchants, transactions, and geographic locations. This enable detection of +fraudulent networks and patterns that may be difficult to uncover using traditional +relational databases. Using graph algorithms enable organizations to detect and +respond to fraudulent transactions quickly and efficiently, ultimately reducing +financial losses and protecting consumers from fraudulent activity. 
+
+## Components
+- Schema - Graph schema definition
+- Queries - Complete GSQL query library
+- Query Installation Script - `3_install_queries.sh` (handles dependencies automatically)
+- Mock Data - Realistic synthetic dataset
+- Loading Job - `loading_job/load_data.gsql`
+- Insights Applications - Pre-built JSON dashboards
+- Machine Learning Model - Training code & docs in `./model`
+- Unit Tests - Validation suite
+
+## Prerequisites
+- TigerGraph 3.x or 4.x (Cloud or Enterprise)
+- GSQL client access
+- RESTPP endpoint enabled
+- Linux shell or Windows WSL
+
+## Quickstart Guide
+One-command full setup (recommended):
+```bash
+./setup.sh
+```
+This runs the following commands:
+
+```bash
+# 1. Create global vertex/edge types (run once per instance)
+gsql /home/tigergraph/solution_kits/financial_crime/library/schema/general_global_financial_crime_super_schema.gsql
+# 2. Create the TransactionFraud graph and local schema
+gsql /home/tigergraph/solution_kits/financial_crime/transaction_fraud/schema/create_schema_bottomup.gsql
+# 3. Load sample data
+gsql -g TransactionFraud run loading_job load_data
+# 4. Install all queries (automatically respects the mer_shortest_path_length dependency)
+./3_install_queries.sh
+```
+You are now ready to explore fraud patterns and launch the Insights apps!
+
+## Query Execution Order and Explanations
+
+### Step 1: Insert Edges for Merchant and Card Networks
+
+To initiate the `wcc` and `pagerank` algorithms on the Merchant and Card networks, first execute the following two queries:
+
+- `merchant_merchant_with_weights`
+- `card_card_with_weights`
+
+### Step 2: Form Communities
+
+The provided solution kit includes two variations of the `wcc` community algorithm. The first employs the default community algorithm found in the built-in algorithm library (i.e., `tg_wcc_card` and `tg_wcc_merchant`), while the second leverages edge weight-based `wcc` to enhance community detection according to the dataset. If you are using your own dataset, the first option is recommended. You have the flexibility to select either option for clustering purposes. For this solution kit, we use the weighted `wcc` for community detection, so the following two queries need to be executed in Step 2:
+
+- `tg_wcc_card_weight_based`
+- `tg_wcc_merchant_weight_based`
+
+Subsequent queries can be run after these two have completed.
+
+### Step 3: Feature Engineering Queries
+
+The feature engineering queries generate feature values to feed the downstream ML model. These queries aggregate the values and propagate the features as attributes of the `Payment_Transaction` vertex. To use the ML model documented in the `model` folder, the following feature engineering queries need to be executed. The table below lists the attribute name of `Payment_Transaction` and its type for each query.
+ +| Query | Attribute | Type | +| ---------------------------------------------- | --------------------------------------------- | ------ | +| `community_size` | `mer_com_size / cd_com_size` | INT | +| `tg_pagerank_wt_merchant` | `mer_pagerank` | FLOAT | +| `tg_pagerank_wt_card` | `cd_pagerank` | FLOAT | +| `mer_shortest_path_length` | `shortest_path_length` | INT | +| `all_shortest_path_length` | `shortest_path_length` | INT | +| `card_merchant_max_amount_within_interval` | `max_txn_amt_interval` | FLOAT | +| `gender` | `gender` | STRING | +| `age` | `age` | INT | +| `city_pop` | `city_pop` | INT | +| `occupation` | `occupation` (job is a reserved keyword) | STRING | +| `NA` (original attribute value) | `unix_time` | INT | +| `card_merchant_max_txn_count_in_interval` | `max_txn_cnt_interval` | INT | +| `number_of_repeated_card` | `cnt_repeated_card` | INT | +| `community_transaction_count` | `com_mer_txn_cnt / com_cd_txn_cnt` | INT | +| `community_transaction_total_amount` | `com_mer_txn_total_amt/ com_cd_txn_total_amt` | FLOAT | +| `community_average_amount` | `com_mer_txn_avg_amt/ com_cd_txn_avg_amt` | FLOAT | +| `community_maximum_amount` | `com_mer_txn_max_amt/com_cd_txn_max_amt` | FLOAT | +| `community_minimum_amount` | `com_mer_txn_min_amt/com_cd_txn_min_amt` | FLOAT | +| `merchant_category` | `mer_cat` | STRING | +| `merchant_category_transaction_count` | `mer_cat_cnt` | INT | +| `merchant_category_transaction_total_amount` | `mer_cat_total_amt` | FLOAT | +| `merchant_category_transaction_avg_amount` | `mer_cat_avg_amt` | FLOAT | +| `merchant_category_transaction_maximum_amount` | `mer_cat_max_amt` | FLOAT | +| `merchant_category_transaction_minimum_amount` | `mer_cat_min_amt` | FLOAT | +| `degrees` | `indegree` / `outdegree` | INT | + +### Step 4: Investigation Queries and Information Retrieval Queries + +These queries can be executed at any time and have no dependencies on the previous queries. The investigation queries are designed to help investigate certain merchants, cards, and transactions, facilitating deep analysis. These queries include: + +- `merchant_with_single_large_transaction`: Identifies merchants with a single, notably large transaction. +- `merchant_has_frequent_transactions`: Finds merchants with a high frequency of transactions. +- `merchant_has_large_total_amount`: Locates merchants with large total transaction amounts. +- `card_with_single_large_transaction`: Identifies cards involved in a single, notably large transaction. +- `card_has_frequent_transactions`: Finds cards with a high frequency of transactions. +- `card_has_large_total_amount`: Locates cards with large total transaction amounts. +- `single_card_lookup`: Allows for the lookup of a single card's transaction history. +- `single_transaction_lookup`: Enables the lookup of a single transaction's details. +- `single_merchant_lookup`: Allows for the lookup of a single merchant's transaction history. +- `single_party_PII`: Provides personally identifiable information for a party involved in transactions. +- `attributes_to_party_traversal`: Traverses from transaction attributes to the parties involved. +- `party_full_address`: Retrieves the full address details for a party involved in transactions. +- `merchant_transactions_stats`: Provides statistical analysis of transactions per merchant. +- `card_transactions_stats`: Offers statistical analysis of transactions per card. +- `merchant_category_transaction_stats`: Delivers transaction statistics categorized by merchant types. 
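+
+As a rough illustration of the shape of these investigation queries, a
+hypothetical frequency check might look like the sketch below. The vertex names
+follow the kit's schema, but the anonymous-edge pattern, the default threshold,
+and the result limit are illustrative assumptions, not the kit's actual
+implementation:
+
+```gsql
+CREATE QUERY card_frequency_sketch(INT min_txn = 100) FOR GRAPH TransactionFraud {
+  SumAccum<INT> @txn_cnt;                  // transactions seen per card
+  Cards = {Credit_Card.*};
+  Busy =
+    SELECT c
+    FROM Cards:c - () - Payment_Transaction:t
+    ACCUM c.@txn_cnt += 1                  // count adjacent transactions
+    HAVING c.@txn_cnt >= min_txn           // keep high-frequency cards
+    ORDER BY c.@txn_cnt DESC
+    LIMIT 50;
+  PRINT Busy[Busy.@txn_cnt];
+}
+```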
+
+## ML Features
+
+Graph-derived features automatically attached to each `Payment_Transaction` vertex for the included machine learning model:
+
+| Feature | Query | Attribute on Payment_Transaction | Type |
+| :--- | :--- | :--- | :--- |
+| **Community Size** (Merchant & Card) | `tg_wcc_*_weight_based` | `mer_com_size`, `cd_com_size` | INT |
+| **PageRank** (Merchant & Card) | `tg_pagerank_wt_*` | `mer_pagerank`, `cd_pagerank` | FLOAT |
+| **Shortest Path to Known Fraud** | `all_shortest_path_length` | `shortest_path_length` | INT |
+| **Max Amount in Recent Interval** | `card_merchant_max_amount_within_interval` | `max_txn_amt_interval` | FLOAT |
+| **Max Tx Count in Interval** | `card_merchant_max_txn_count_in_interval` | `max_txn_cnt_interval` | INT |
+| **Repeated Card Count** | `number_of_repeated_card` | `cnt_repeated_card` | INT |
+| **Community Tx Stats** | `community_transaction_*` | `com_mer_*` / `com_cd_*` | INT/FLOAT |
+| **Merchant Category Stats** | `merchant_category_*` | `mer_cat_*` | INT/FLOAT |
+| **In/Out Degree** | `degrees` | `indegree`, `outdegree` | INT |
+| **Demographics & Context** | direct attributes | `gender`, `age`, `city_pop`, `occupation`, `unix_time`, `mer_cat` | STRING/INT |
+
+## Graph Schema
+Main vertices: Cardholder, Credit_Card, Payment_Transaction, Merchant, Merchant_Category, Location
+Full schema available in ./schema/
+
+## Sample Dataset
+Realistic synthetic data with intentionally injected fraud patterns:
+- Multi-card / multi-merchant fraud rings
+- Single large transactions
+- High-frequency attacks
+- Geographic and demographic enrichment
+Ideal for testing all queries, ML model, and visualizations end-to-end.
+
+## Using Your Own Data
+1. Place your files in **/data/your_sources/**
+2. Modify column mappings in **loading_job/load_data.gsql**
+3. Adjust thresholds or feature logic as needed
+
+## Cleanup (Optional)
+```bash
+gsql drop graph TransactionFraud
+# or
+gsql -g TransactionFraud drop all
+```
+
+## ML Model and Insights Application
+
+- You can find the instructions for training the ML model and its performance metrics within the documentation located in the model folder.
+- The Insights Applications are available as JSON files in the meta folder.
+
+### Insights Applications
+
+There are two insights applications:
+
+- **ROI Dashboard**
+
+  - Shows the transaction fraud losses.
+  - Live total fraud losses and amount saved by TigerGraph.
+  - Live fraud distribution.
+  - ML performance.
+
+- **Transaction Fraud**
+  - The application has 5 pages:
+    - Merchant network pagerank.
+    - Shortest path to frauds with user-specified limit.
+    - Community.
+    - Card With Large Total Transaction Amount.
+ - Card with high transaction frequency. + - **Note:** For "Card With Large Total Transaction Amount" and "Card with high transaction frequency," the graph only shows fraudulent transactions to avoid overcrowded visualizations. diff --git a/financial_crime/transaction_fraud/ReadMe.md b/financial_crime/transaction_fraud/ReadMe.md deleted file mode 100644 index bdf9e963..00000000 --- a/financial_crime/transaction_fraud/ReadMe.md +++ /dev/null @@ -1,133 +0,0 @@ -# Narratives -Credit card transaction fraud detection identifies and prevents unauthorized or deceptive transactions in real-time. It analyzes transaction data, including cardholder information, transaction details, and historical patterns, to detect anomalies and suspicious activity indicative of fraud. TigerGraph models complex relationships and patterns among entities such as cardholders, merchants, transactions, and geographic locations. This enable detection of fraudulent networks and patterns that may be difficult to uncover using traditional relational databases. Using graph algorithms enable organizations to detect and respond to fraudulent transactions quickly and efficiently, ultimately reducing financial losses and protecting consumers from fraudulent activity. - -# Components -This repository includes multiple components: - -- `Schema` - Definition of database schema. -- `Queries` - Collection of GSQL queries. -- `Query Installation Automation Script` - Scripts for automating the installation of queries. -- `Mock Data` - Sample data. -- `Loading Jobs` - Scripts for data loading tasks. -- `Insights Applications (JSON)` - Applications for data analysis and visualization, configured in JSON. -- `Machine Learning Model` - Source code for the machine learning model. -- `Unit Test` - Test cases for ensuring code reliability and correctness. - - - -# Instructions - -The `setup.sh` script is designed to streamline the initial setup process by sequentially executing the following steps: - -1. **Schema Creation**: This schema is a subset of the Super Schema in financial_crime/library/schema. To create the schema, first run `gsql /home/tigergraph/solution_kits/financial_crime/library/schema/general_global_financial_crime_super_schema.gsql` to generate global vertex and edge types. Then, user can run `/home/tigergraph/solution_kits/financial_crime/transaction_fraud/schema/create_schema_bottomup.gsql` to create the graph. -2. **Data Loading**: Load data into the schema by running the data loading job with the `2_load_data.gsql` script. -3. **Query Installation**: Completes the setup by installing necessary queries through the `3_install_queries.sh` script. - -## Installation Note for Queries - -When installing the queries, there is **one specific query** that requires special attention: - -- `mer_shortest_path_length` is a subquery of `all_shortest_path_length`. - -To successfully install `all_shortest_path_length`, the `mer_shortest_path_length` query **must be installed first**. - -The script `3_install_queries.sh` has already been configured to handle this dependency. This note is intended for users who wish to **manually install** these queries. 
- - - -## Query Execution Order and Explanations - -### Step 1: Insert Edges for Merchant and Card Networks - -To initiate the `wcc` and `pagerank` algorithms on the Merchant and Card network, it's essential to first execute the following two queries: - -- `merchant_merchant_with_weights` -- `card_card_with_weights` - -### Step 2: Form Communities - -The provided solution kit includes two variations of the `wcc` community algorithm. The first employs the default community algorithm found in the built-in algorithm library (i.e. `tg_wcc_card` and `tg_wcc_merchant`), while the second leverages edge weight-based `wcc` to enhance community detection according to the dataset. If user has own dataset, the first option is recommended. Users have the flexibility to select either option for clustering purposes. For this solution kit, we use `weighted wcc` for community detection, so the following two queries need to be executed in step 2: - -- `tg_wcc_card_weight_based` -- `tg_wcc_merchant_weight_based` - -Subsequent queries can be run following the completion of the aforementioned two. - -### Step 3: Feature Engineering Queries - -The feature engineering queries generate feature values to feed the downstream ML model. These queries aggregate the values and propagate the features as attributes of the `Payment_Transaction` vertex. To use the ML model documented in the `model` folder, the following feature engineering queries need to be executed. The right column lists the attribute name of `Payment_Transaction` corresponding to the query. - -| Query | Attribute | Type | -|------------------------------------------------|--------------------------------------|--------| -| `community_size` | `mer_com_size / cd_com_size` | INT | -| `tg_pagerank_wt_merchant` | `mer_pagerank` | FLOAT | -| `tg_pagerank_wt_card` | `cd_pagerank` | FLOAT | -| `mer_shortest_path_length` | `shortest_path_length` | INT | -| `all_shortest_path_length` | `shortest_path_length` | INT | -| `card_merchant_max_amount_within_interval` | `max_txn_amt_interval` | FLOAT | -| `gender` | `gender` | STRING | -| `age` | `age` | INT | -| `city_pop` | `city_pop` | INT | -| `occupation` | `occupation` (job is a reserved keyword) | STRING | -| `NA` (original attribute value) | `unix_time` | INT | -| `card_merchant_max_txn_count_in_interval` | `max_txn_cnt_interval` | INT | -| `number_of_repeated_card` | `cnt_repeated_card` | INT | -| `community_transaction_count` | `com_mer_txn_cnt / com_cd_txn_cnt` | INT | -| `community_transaction_total_amount` | `com_mer_txn_total_amt/ com_cd_txn_total_amt` | FLOAT | -| `community_average_amount` | `com_mer_txn_avg_amt/ com_cd_txn_avg_amt` | FLOAT | -| `community_maximum_amount` | `com_mer_txn_max_amt/com_cd_txn_max_amt` | FLOAT | -| `community_minimum_amount` | `com_mer_txn_min_amt/com_cd_txn_min_amt` | FLOAT | -| `merchant_category` | `mer_cat` | STRING | -| `merchant_category_transaction_count` | `mer_cat_cnt` | INT | -| `merchant_category_transaction_total_amount` | `mer_cat_total_amt` | FLOAT | -| `merchant_category_transaction_avg_amount` | `mer_cat_avg_amt` | FLOAT | -| `merchant_category_transaction_maximum_amount` | `mer_cat_max_amt` | FLOAT | -| `merchant_category_transaction_minimum_amount` | `mer_cat_min_amt` | FLOAT | -| `degrees` | `indegree` / `outdegree` | INT | - -### Step 4: Investigation Queries and Information Retrieval Queries - -These queries can be executed at any time and have no dependencies on the previous queries. 
The investigation queries are designed to help investigate certain merchants, cards, and transactions, facilitating deep analysis. These queries include: - -- `merchant_with_single_large_transaction`: Identifies merchants with a single, notably large transaction. -- `merchant_has_frequent_transactions`: Finds merchants with a high frequency of transactions. -- `merchant_has_large_total_amount`: Locates merchants with large total transaction amounts. -- `card_with_single_large_transaction`: Identifies cards involved in a single, notably large transaction. -- `card_has_frequent_transactions`: Finds cards with a high frequency of transactions. -- `card_has_large_total_amount`: Locates cards with large total transaction amounts. -- `single_card_lookup`: Allows for the lookup of a single card's transaction history. -- `single_transaction_lookup`: Enables the lookup of a single transaction's details. -- `single_merchant_lookup`: Allows for the lookup of a single merchant's transaction history. -- `single_party_PII`: Provides personally identifiable information for a party involved in transactions. -- `attributes_to_party_traversal`: Traverses from transaction attributes to the parties involved. -- `party_full_address`: Retrieves the full address details for a party involved in transactions. -- `merchant_transactions_stats`: Provides statistical analysis of transactions per merchant. -- `card_transactions_stats`: Offers statistical analysis of transactions per card. -- `merchant_category_transaction_stats`: Delivers transaction statistics categorized by merchant types. - -## Mock Data - -The `data` folder is populated with sample data files. These files are crafted to closely mimic real-world scenarios, providing a realistic context for testing and demonstration purposes. - -## ML Model and Insights Application - -- You can find the instructions for training the ML model and its performance metrics within the documentation located in the model folder. -- The Insights Applications are available as JSON files in the meta folder. - -### Insights Applications - -There are two insights applications: -- **ROI Dashboard** - - Shows the transaction fraud losses. - - Live total fraud losses and amount saved by TigerGraph. - - Live fraud distribution. - - ML performance. - -- **Transaction Fraud** - - The application has 5 pages: - - Merchant network pagerank. - - Shortest path to frauds with user-specified limit. - - Community. - - Card With Large Total Transaction Amount. - - Card with high transaction frequency. - - **Note:** For "Card With Large Total Transaction Amount" and "Card with high transaction frequency," the graph only shows fraudulent transactions to avoid overcrowded visualizations. diff --git a/financial_crime/transaction_fraud/model/model_training.ipynb b/financial_crime/transaction_fraud/model/model_training.ipynb index 2f48e85e..1f094801 100644 --- a/financial_crime/transaction_fraud/model/model_training.ipynb +++ b/financial_crime/transaction_fraud/model/model_training.ipynb @@ -22,7 +22,7 @@ "\n", "Our journey begins with an exploration of a sample graph included in the Solution Kit, embodying transactions between credit cards and merchants, alongside cardholders ('Party') and their attributes such as addresses. For a visual guide and more details on the graph schema, please refer to the **Solution Kit Documentation**.\n", "\n", - "Utilizing an XGBoost model, we will demonstrate the stark contrast in performance between models trained with and without our graph features. 
Here’s a glimpse of the empirical evidence highlighting the effectiveness of incorporating graph features:\n", + "Utilizing an XGBoost model, we will demonstrate the stark contrast in performance between models trained with and without our graph features. Here's a glimpse of the empirical evidence highlighting the effectiveness of incorporating graph features:\n", "\n", "\n", "| Model | Precision | Recall | \n", diff --git a/financial_crime/transaction_fraud/model/training/model_training.md b/financial_crime/transaction_fraud/model/training/model_training.md index 8d3c2b90..0e9e103f 100644 --- a/financial_crime/transaction_fraud/model/training/model_training.md +++ b/financial_crime/transaction_fraud/model/training/model_training.md @@ -7,7 +7,7 @@ Prerequisites for this tutorial are working knowledge of TigerGraph database and Our journey begins with an exploration of a sample graph included in the Solution Kit, embodying transactions between credit cards and merchants, alongside cardholders ('Party') and their attributes such as addresses. For a visual guide and more details on the graph schema, please refer to the **Solution Kit Documentation**. -Utilizing an XGBoost model, we will demonstrate the stark contrast in performance between models trained with and without our graph features. Here’s a glimpse of the empirical evidence highlighting the effectiveness of incorporating graph features: +Utilizing an XGBoost model, we will demonstrate the stark contrast in performance between models trained with and without our graph features. Here's a glimpse of the empirical evidence highlighting the effectiveness of incorporating graph features: | Model | Precision | Recall | diff --git a/scripts/markdown.js b/scripts/markdown.js index fd6c24c5..5c106ad1 100644 --- a/scripts/markdown.js +++ b/scripts/markdown.js @@ -1,49 +1,120 @@ -import fs from 'fs' -import { basename, extname } from 'path'; -import { marked } from 'marked'; +import fs from "fs"; +import { basename, extname } from "path"; +import { marked } from "marked"; -function capitalize(s) -{ - return s[0].toUpperCase() + s.slice(1); +function capitalize(s) { + return s[0].toUpperCase() + s.slice(1); } function normalizeName(name) { - return name.split('_').map(name => capitalize(name)).join(' '); + return name + .split("_") + .map((name) => capitalize(name)) + .join(" "); } export function renderMarkdown(solution, file) { - const content = fs.readFileSync(file, { encoding: 'utf8' }) + const content = fs.readFileSync(file, { encoding: "utf8" }); - // Convert Markdown to HTML - const htmlContent = marked(content); + // Convert Markdown to HTML + const htmlContent = marked(content); - // Wrap the HTML content with the necessary HTML and CSS - const htmlTemplate = ` + // Wrap the HTML content with the necessary HTML and CSS + const htmlTemplate = ` - TigerGraph - ${normalizeName(basename(solution, extname(solution)))} - ${normalizeName(basename(file, extname(file)))} - + TigerGraph - ${normalizeName( + basename(solution, extname(solution)) + )} - ${normalizeName(basename(file, extname(file)))} + + - +
${htmlContent}
@@ -51,5 +122,5 @@ export function renderMarkdown(solution, file) {
 `;
 
-    return htmlTemplate;
-}
\ No newline at end of file
+  return htmlTemplate;
+}
diff --git a/scripts/readme.md b/scripts/readme.md
index 86501b7d..fa17396f 100644
--- a/scripts/readme.md
+++ b/scripts/readme.md
@@ -16,6 +16,7 @@ solution name/
     loading_job/
     udfs/
     reset/
+    Readme.md
 ```
 
 1. The top-level folder names, such as `meta` or `schema`, within the solution folder are based on convention and cannot be changed. Only folders containing `meta/meta.yml` are recognized as solutions.
@@ -65,7 +66,7 @@ queries:
 
 3. The algorithms list all the graph algorithms used in the solution.
 
-4. The `first` and `last` sections are used to control the order of the queries. The queries listed under `first` will be executed first, and the queries listed under `last` will be executed last.
+4. The `first` and `last` sections control the order in which queries run: queries listed under `first` execute first, and queries listed under `last` execute last. Queries under `last` are assumed to post-process loaded data, so they run only after all data loading has completed.
 
 5. `production` is default to true, if you want to hide the solution from production environment(`tgcloud.io`), set this to `false`
diff --git a/scripts/script.js b/scripts/script.js
index d36b49fa..e3e6b3bd 100644
--- a/scripts/script.js
+++ b/scripts/script.js
@@ -119,6 +119,17 @@ function getAllSolutions() {
   return metaFiles.map((file) => file.slice(0, -"/meta/meta.yml".length));
 }
 
+// Find a case-insensitive Readme.md file in the solution root
+function findReadmeFile(solutionDir) {
+  try {
+    const files = fs.readdirSync(solutionDir);
+    const readme = files.find((f) => f.toLowerCase() === "readme.md");
+    return readme ? 
path.join(solutionDir, readme) : null; + } catch (_) { + return null; + } +} + async function getSolution(dir) { console.log("\n\nProcessing", dir, "..."); const file = fs.readFileSync(`${dir}/meta/meta.yml`, "utf8"); @@ -137,7 +148,10 @@ async function getSolution(dir) { content.metadata.icon = await syncFile(iconPath); } - content.metadata.images = await syncFolder(`${dir}/meta/images`, imageCacheControl); + content.metadata.images = await syncFolder( + `${dir}/meta/images`, + imageCacheControl + ); content.metadata.images.sort(); content.metadata.provider = content.metadata.provider || "TigerGraph"; @@ -152,11 +166,17 @@ async function getSolution(dir) { const insightsFiles = await globSync(`${dir}/meta/Insights*.json`); content.metadata.hasInsights = insightsFiles.length > 0; - const markdownFiles = globSync([`${dir}/model/**/*.md`, `${dir}/doc/**/*.md`]); + const markdownFiles = globSync([ + `${dir}/model/**/*.md`, + `${dir}/doc/**/*.md`, + ]); const htmlFiles = []; for (let markdownFile of markdownFiles) { const html = renderMarkdown(dir, markdownFile); - const htmlFile = markdownFile.replace(".md", ".html"); + const htmlFile = path.join( + path.dirname(markdownFile), + path.basename(markdownFile, path.extname(markdownFile)) + ".html" + ); const params = { ...commonBucketConfig, @@ -176,6 +196,34 @@ async function getSolution(dir) { } } + // Handle root-level README (case-insensitive) + const readmeFile = findReadmeFile(dir); + if (readmeFile) { + const html = renderMarkdown(dir, readmeFile); + const htmlFile = path.join( + path.dirname(readmeFile), + path.basename(readmeFile, path.extname(readmeFile)) + ".html" + ); + + const params = { + ...commonBucketConfig, + Key: htmlFile, + Body: html, + ContentType: "text/html", + CacheControl: disableCacheControl, + }; + + try { + const data = await s3.upload(params).promise(); + console.log(`${htmlFile} => ${data.Location}`); + // Expose README link separately + content.metadata.readme = data.Location; + } catch (error) { + console.error("Error uploading html file:", error); + process.exit(1); + } + } + content.metadata.docLinks = htmlFiles; return content; } @@ -208,7 +256,9 @@ function concatFilesForQuery(files, lastFiles, graphName, is_library) { content += `INSTALL QUERY ALL\n`; } - const lastFileContents = lastFiles.map((file) => fs.readFileSync(file, "utf8")); + const lastFileContents = lastFiles.map((file) => + fs.readFileSync(file, "utf8") + ); for (let i = 0; i < lastFiles.length; i++) { content += "#File: " + lastFiles[i] + "\n"; content += lastFileContents[i]; @@ -228,7 +278,9 @@ function concatFilesForLastFile(lastFiles, graphName, is_library) { } let content = `USE GRAPH ${graphName}\n`; - const lastFileContents = lastFiles.map((file) => fs.readFileSync(file, "utf8")); + const lastFileContents = lastFiles.map((file) => + fs.readFileSync(file, "utf8") + ); for (let i = 0; i < lastFiles.length; i++) { content += lastFileContents[i]; content += "\n"; @@ -242,7 +294,9 @@ function concatFilesForLastFile(lastFiles, graphName, is_library) { function concatLoadingFiles(files) { const fileContents = files .map((file) => fs.readFileSync(file, "utf8")) - .map((content) => content.replaceAll("tigergraph-solution-kits", getBucketName())); + .map((content) => + content.replaceAll("tigergraph-solution-kits", getBucketName()) + ); let content = ""; for (let i = 0; i < files.length; i++) { @@ -267,7 +321,10 @@ async function getSolutionDetail(dir, first, last, graphName, is_library) { styleJSON = fs.readFileSync(`${dir}/meta/style.json`, 
"utf8"); } catch (error) {} - const queryFiles = globSync([`${dir}/queries/*.gsql`, `${dir}/queries/*/*.gsql`]); + const queryFiles = globSync([ + `${dir}/queries/*.gsql`, + `${dir}/queries/*/*.gsql`, + ]); const firstFiles = []; for (let file of first) { @@ -326,7 +383,9 @@ async function main() { const solutions = getAllSolutions(); const metadataList = []; for (let solution of solutions) { - const { metadata, queries: { first, last } = {} } = await getSolution(solution); + const { metadata, queries: { first, last } = {} } = await getSolution( + solution + ); if (metadata.production === false) { console.log("hide solution", metadata.name); @@ -411,3 +470,99 @@ main(); // console.log(sampleLoadingJob); // console.log(reset); // } + +async function generateFileForImportSolution() { + const dir = "solution_metadata"; + const list_data = []; + fs.mkdirSync(dir); + + const solutions = getAllSolutions(); + const metadataList = []; + for (let solution of solutions) { + const { metadata, queries: { first, last } = {} } = await getSolution( + solution + ); + + if (metadata.production === false) { + console.log("hide solution", metadata.name); + continue; + } + + const solutionDetails = await getSolutionDetail( + solution, + first || [], + last || [], + metadata.graph, + metadata.is_library + ); + + // strip aws s3 url prefix + if (metadata.icon) { + metadata.icon = + dir + "/" + metadata.icon.slice(metadata.icon.indexOf(solution)); + } + if (metadata.images) { + metadata.images = metadata.images.map((image) => { + return dir + "/" + image.slice(image.indexOf(solution)); + }); + } + + metadataList.push({ + ...metadata, + path: solution, + initQuery: solutionDetails.lastQuery, + }); + + if (!solutionDetails.is_library) { + // https://tigergraph-solution-kits-prod.s3.us-west-1.amazonaws.com/financial_crime/application_fraud/4.x/solution_with_data.tar.gz + const solution_with_data = `https://tigergraph-solution-kits-prod.s3.us-west-1.amazonaws.com/${solution}/4.x/solution_with_data.tar.gz`; + const solution_without_data = `https://tigergraph-solution-kits-prod.s3.us-west-1.amazonaws.com/${solution}/4.x/solution_without_data.tar.gz`; + list_data.push({ + name: metadata.name, + path: solution, + graph: metadata.graph, + solution_with_data: solution_with_data, + solution_without_data: solution_without_data, + }); + } + + // copy images and icon + fs.mkdirSync(`${dir}/${solution}/meta`, { recursive: true }); + if (fs.existsSync(`${solution}/meta/icon.png`)) { + fs.copyFileSync( + `${solution}/meta/icon.png`, + `${dir}/${solution}/meta/icon.png` + ); + } else if (fs.existsSync(`${solution}/meta/icon.jpg`)) { + fs.copyFileSync( + `${solution}/meta/icon.jpg`, + `${dir}/${solution}/meta/icon.jpg` + ); + } else if (fs.existsSync(`${solution}/meta/icon.svg`)) { + fs.copyFileSync( + `${solution}/meta/icon.svg`, + `${dir}/${solution}/meta/icon.svg` + ); + } + if (fs.existsSync(`${solution}/meta/images`)) { + fs.mkdirSync(`${dir}/${solution}/meta/images`); + const images = globSync([`${solution}/meta/images/*`]); + for (let image of images) { + const fileName = path.basename(image); + fs.copyFileSync(image, `${dir}/${solution}/meta/images/${fileName}`); + } + } + } + + fs.writeFileSync( + `${dir}/solution_list.json`, + JSON.stringify(metadataList, null, 2) + ); + fs.writeFileSync( + `${dir}/solution_data.json`, + JSON.stringify(list_data, null, 2) + ); +} + +// generate meta data for import solution, used by https://dl.tigergraph.com/?tab=solution +// generateFileForImportSolution();