From 47592d875cd47bad41914e0b066ad13a2e121da0 Mon Sep 17 00:00:00 2001 From: Adam Schill Collberg Date: Fri, 27 Jun 2025 09:57:28 +0200 Subject: [PATCH 1/4] Make sure to drop temp internal node props in `from_gds` --- changelog.md | 2 ++ python-wrapper/src/neo4j_viz/gds.py | 4 ++++ python-wrapper/tests/test_gds.py | 3 +++ 3 files changed, 9 insertions(+) diff --git a/changelog.md b/changelog.md index 54e3a0b9..c83f505b 100644 --- a/changelog.md +++ b/changelog.md @@ -13,6 +13,8 @@ ## Bug fixes +* Make sure that temporary internal node properties are not included in the visualization output. + ## Improvements diff --git a/python-wrapper/src/neo4j_viz/gds.py b/python-wrapper/src/neo4j_viz/gds.py index 338de75c..aff8ad78 100644 --- a/python-wrapper/src/neo4j_viz/gds.py +++ b/python-wrapper/src/neo4j_viz/gds.py @@ -119,6 +119,10 @@ def from_gds( node_properties = [property_name] node_dfs = _fetch_node_dfs(gds, G_fetched, node_properties, G_fetched.node_labels()) + if property_name is not None: + for df in node_dfs.values(): + df.drop(columns=[property_name], inplace=True) + rel_df = _fetch_rel_df(gds, G_fetched) finally: if G_fetched.name() != G.name(): diff --git a/python-wrapper/tests/test_gds.py b/python-wrapper/tests/test_gds.py index fda1caf3..773a21d9 100644 --- a/python-wrapper/tests/test_gds.py +++ b/python-wrapper/tests/test_gds.py @@ -276,6 +276,9 @@ def test_from_gds_sample(gds: Any) -> None: ): VG = from_gds(gds, G) + # Make sure internal temporary properties are not present + assert set(VG.nodes[0].properties.keys()) == {"labels"} + assert len(VG.nodes) >= 9_500 assert len(VG.nodes) <= 10_500 assert len(VG.relationships) >= 9_500 From 4677de6c5777a9744bd090c3a386d37076c0f269 Mon Sep 17 00:00:00 2001 From: Adam Schill Collberg Date: Fri, 27 Jun 2025 14:35:45 +0200 Subject: [PATCH 2/4] Allow entities with different property sets in `from_gds` loader --- changelog.md | 4 +- python-wrapper/src/neo4j_viz/gds.py | 58 +++++++++++++----------- python-wrapper/src/neo4j_viz/pandas.py | 23 +++++++--- python-wrapper/tests/test_gds.py | 61 ++++++++++++++++++++++++++ 4 files changed, 115 insertions(+), 31 deletions(-) diff --git a/changelog.md b/changelog.md index c83f505b..907a45f0 100644 --- a/changelog.md +++ b/changelog.md @@ -9,11 +9,13 @@ ## New features * Allow passing a `neo4j.Driver` instance as input to `from_neo4j`, in which case the driver will be used internally to fetch the graph data using a simple query +* Added optional argument `dropna` to `from_dfs` loader allowing for not including NaN properties in the created visualization graph ## Bug fixes -* Make sure that temporary internal node properties are not included in the visualization output. +* Make sure that temporary internal node properties are not included in the visualization output +* Fixed bug where loading a graph with `from_gds` where all node or relationship properties are not present on every entity would result in an error ## Improvements diff --git a/python-wrapper/src/neo4j_viz/gds.py b/python-wrapper/src/neo4j_viz/gds.py index aff8ad78..4aba8ec4 100644 --- a/python-wrapper/src/neo4j_viz/gds.py +++ b/python-wrapper/src/neo4j_viz/gds.py @@ -14,11 +14,11 @@ def _fetch_node_dfs( - gds: GraphDataScience, G: Graph, node_properties: list[str], node_labels: list[str] + gds: GraphDataScience, G: Graph, node_properties_by_label: dict[str, list[str]], node_labels: list[str] ) -> dict[str, pd.DataFrame]: return { lbl: gds.graph.nodeProperties.stream( - G, node_properties=node_properties, node_labels=[lbl], separate_property_columns=True + G, node_properties=node_properties_by_label[lbl], node_labels=[lbl], separate_property_columns=True ) for lbl in node_labels } @@ -79,24 +79,31 @@ def from_gds( """ node_properties_from_gds = G.node_properties() assert isinstance(node_properties_from_gds, pd.Series) - actual_node_properties = list(chain.from_iterable(node_properties_from_gds.to_dict().values())) + actual_node_properties = node_properties_from_gds.to_dict() + all_actual_node_properties = list(chain.from_iterable(actual_node_properties.values())) - if size_property is not None and size_property not in actual_node_properties: - raise ValueError(f"There is no node property '{size_property}' in graph '{G.name()}'") + if size_property is not None: + if size_property not in all_actual_node_properties: + raise ValueError(f"There is no node property '{size_property}' in graph '{G.name()}'") if additional_node_properties is None: - additional_node_properties = actual_node_properties + node_properties_by_label = {k: set(v) for k, v in actual_node_properties.items()} else: for prop in additional_node_properties: - if prop not in actual_node_properties: + if prop not in all_actual_node_properties: raise ValueError(f"There is no node property '{prop}' in graph '{G.name()}'") - node_properties = set() - if additional_node_properties is not None: - node_properties.update(additional_node_properties) + node_properties_by_label = {} + for label, props in actual_node_properties.items(): + node_properties_by_label[label] = { + prop for prop in actual_node_properties[label] if prop in additional_node_properties + } + if size_property is not None: - node_properties.add(size_property) - node_properties = list(node_properties) + for label, props in node_properties_by_label.items(): + props.add(size_property) + + node_properties_by_label = {k: list(v) for k, v in node_properties_by_label.items()} node_count = G.node_count() if node_count > max_node_count: @@ -112,13 +119,14 @@ def from_gds( property_name = None try: # Since GDS does not allow us to only fetch node IDs, we add the degree property - # as a temporary property to ensure that we have at least one property to fetch - if len(actual_node_properties) == 0: + # as a temporary property to ensure that we have at least one property for each label to fetch + if sum([len(props) == 0 for props in node_properties_by_label.values()]) > 0: property_name = f"neo4j-viz_property_{uuid4()}" gds.degree.mutate(G_fetched, mutateProperty=property_name) - node_properties = [property_name] + for props in node_properties_by_label.values(): + props.append(property_name) - node_dfs = _fetch_node_dfs(gds, G_fetched, node_properties, G_fetched.node_labels()) + node_dfs = _fetch_node_dfs(gds, G_fetched, node_properties_by_label, G_fetched.node_labels()) if property_name is not None: for df in node_dfs.values(): df.drop(columns=[property_name], inplace=True) @@ -131,35 +139,35 @@ def from_gds( gds.graph.nodeProperties.drop(G_fetched, node_properties=[property_name]) for df in node_dfs.values(): - df.rename(columns={"nodeId": "id"}, inplace=True) if property_name is not None and property_name in df.columns: df.drop(columns=[property_name], inplace=True) - rel_df.rename(columns={"sourceNodeId": "source", "targetNodeId": "target"}, inplace=True) node_props_df = pd.concat(node_dfs.values(), ignore_index=True, axis=0).drop_duplicates() if size_property is not None: - if "size" in actual_node_properties and size_property != "size": + if "size" in all_actual_node_properties and size_property != "size": node_props_df.rename(columns={"size": "__size"}, inplace=True) node_props_df.rename(columns={size_property: "size"}, inplace=True) for lbl, df in node_dfs.items(): - if "labels" in actual_node_properties: + if "labels" in all_actual_node_properties: df.rename(columns={"labels": "__labels"}, inplace=True) df["labels"] = lbl - node_labels_df = pd.concat([df[["id", "labels"]] for df in node_dfs.values()], ignore_index=True, axis=0) - node_labels_df = node_labels_df.groupby("id").agg({"labels": list}) + node_labels_df = pd.concat([df[["nodeId", "labels"]] for df in node_dfs.values()], ignore_index=True, axis=0) + node_labels_df = node_labels_df.groupby("nodeId").agg({"labels": list}) - node_df = node_props_df.merge(node_labels_df, on="id") + node_df = node_props_df.merge(node_labels_df, on="nodeId") - if "caption" not in actual_node_properties: + if "caption" not in all_actual_node_properties: node_df["caption"] = node_df["labels"].astype(str) if "caption" not in rel_df.columns: rel_df["caption"] = rel_df["relationshipType"] try: - return _from_dfs(node_df, rel_df, node_radius_min_max=node_radius_min_max, rename_properties={"__size": "size"}) + return _from_dfs( + node_df, rel_df, node_radius_min_max=node_radius_min_max, rename_properties={"__size": "size"}, dropna=True + ) except ValueError as e: err_msg = str(e) if "column" in err_msg: diff --git a/python-wrapper/src/neo4j_viz/pandas.py b/python-wrapper/src/neo4j_viz/pandas.py index 15e29c0e..761a2c63 100644 --- a/python-wrapper/src/neo4j_viz/pandas.py +++ b/python-wrapper/src/neo4j_viz/pandas.py @@ -31,8 +31,9 @@ def _from_dfs( rel_dfs: DFS_TYPE, node_radius_min_max: Optional[tuple[float, float]] = (3, 60), rename_properties: Optional[dict[str, str]] = None, + dropna: bool = False, ) -> VisualizationGraph: - relationships = _parse_relationships(rel_dfs, rename_properties=rename_properties) + relationships = _parse_relationships(rel_dfs, rename_properties=rename_properties, dropna=dropna) if node_dfs is None: has_size = False @@ -42,7 +43,7 @@ def _from_dfs( node_ids.add(rel.target) nodes = [Node(id=id) for id in node_ids] else: - nodes, has_size = _parse_nodes(node_dfs, rename_properties=rename_properties) + nodes, has_size = _parse_nodes(node_dfs, rename_properties=rename_properties, dropna=dropna) VG = VisualizationGraph(nodes=nodes, relationships=relationships) @@ -52,7 +53,9 @@ def _from_dfs( return VG -def _parse_nodes(node_dfs: DFS_TYPE, rename_properties: Optional[dict[str, str]]) -> tuple[list[Node], bool]: +def _parse_nodes( + node_dfs: DFS_TYPE, rename_properties: Optional[dict[str, str]], dropna: bool = False +) -> tuple[list[Node], bool]: if isinstance(node_dfs, DataFrame): node_dfs_iter: Iterable[DataFrame] = [node_dfs] elif node_dfs is None: @@ -67,6 +70,8 @@ def _parse_nodes(node_dfs: DFS_TYPE, rename_properties: Optional[dict[str, str]] for node_df in node_dfs_iter: has_size &= "size" in node_df.columns for _, row in node_df.iterrows(): + if dropna: + row.dropna(inplace=True) top_level = {} properties = {} for key, value in row.to_dict().items(): @@ -85,7 +90,9 @@ def _parse_nodes(node_dfs: DFS_TYPE, rename_properties: Optional[dict[str, str]] return nodes, has_size -def _parse_relationships(rel_dfs: DFS_TYPE, rename_properties: Optional[dict[str, str]]) -> list[Relationship]: +def _parse_relationships( + rel_dfs: DFS_TYPE, rename_properties: Optional[dict[str, str]], dropna: bool = False +) -> list[Relationship]: all_rel_field_aliases = Relationship.all_validation_aliases() if isinstance(rel_dfs, DataFrame): @@ -96,6 +103,8 @@ def _parse_relationships(rel_dfs: DFS_TYPE, rename_properties: Optional[dict[str for rel_df in rel_dfs_iter: for _, row in rel_df.iterrows(): + if dropna: + row.dropna(inplace=True) top_level = {} properties = {} for key, value in row.to_dict().items(): @@ -118,6 +127,7 @@ def from_dfs( node_dfs: Optional[DFS_TYPE], rel_dfs: DFS_TYPE, node_radius_min_max: Optional[tuple[float, float]] = (3, 60), + dropna: bool = False, ) -> VisualizationGraph: """ Create a VisualizationGraph from pandas DataFrames representing a graph. @@ -136,6 +146,9 @@ def from_dfs( node_radius_min_max : tuple[float, float], optional Minimum and maximum node radius. To avoid tiny or huge nodes in the visualization, the node sizes are scaled to fit in the given range. + dropna : bool, optional + If True, NaN values will be dropped from the DataFrames before processing. + Defaults to False. """ - return _from_dfs(node_dfs, rel_dfs, node_radius_min_max) + return _from_dfs(node_dfs, rel_dfs, node_radius_min_max, dropna=dropna) diff --git a/python-wrapper/tests/test_gds.py b/python-wrapper/tests/test_gds.py index 773a21d9..75f87471 100644 --- a/python-wrapper/tests/test_gds.py +++ b/python-wrapper/tests/test_gds.py @@ -283,3 +283,64 @@ def test_from_gds_sample(gds: Any) -> None: assert len(VG.nodes) <= 10_500 assert len(VG.relationships) >= 9_500 assert len(VG.relationships) <= 10_500 + + +@pytest.mark.requires_neo4j_and_gds +def test_from_gds_hetero(gds: Any) -> None: + from neo4j_viz.gds import from_gds + + A_nodes = pd.DataFrame( + { + "nodeId": [0, 1], + "labels": ["A", "A"], + "component": [1, 2], + } + ) + B_nodes = pd.DataFrame( + { + "nodeId": [2, 3], + "labels": ["B", "B"], + # No 'component' property + } + ) + rels = pd.DataFrame( + { + "sourceNodeId": [0, 1], + "targetNodeId": [2, 3], + "weight": [0.5, 1.5], + "relationshipType": ["REL", "REL2"], + } + ) + + with gds.graph.construct("flo", [A_nodes, B_nodes], rels) as G: + VG = from_gds( + gds, + G, + ) + + assert len(VG.nodes) == 4 + assert sorted(VG.nodes, key=lambda x: x.id) == [ + Node(id=0, caption="['A']", properties=dict(labels=["A"], component=float(1))), + Node(id=1, caption="['A']", properties=dict(labels=["A"], component=float(2))), + Node(id=2, caption="['B']", properties=dict(labels=["B"])), + Node(id=3, caption="['B']", properties=dict(labels=["B"])), + ] + + assert len(VG.relationships) == 2 + vg_rels = sorted( + [ + ( + e.source, + e.target, + e.caption, + e.properties["relationshipType"], + e.properties["weight"], + ) + for e in VG.relationships + ], + key=lambda x: x[0], + ) + assert vg_rels == [ + (0, 2, "REL", "REL", 0.5), + (1, 3, "REL2", "REL2", 1.5), + ] From 297cf7d0d28965ec8a183701fcdeb562cf56a3d7 Mon Sep 17 00:00:00 2001 From: Adam Schill Collberg Date: Tue, 1 Jul 2025 09:56:49 +0200 Subject: [PATCH 3/4] Address review comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Florentin Dörre --- changelog.md | 1 - python-wrapper/src/neo4j_viz/pandas.py | 10 +++------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/changelog.md b/changelog.md index 907a45f0..762168aa 100644 --- a/changelog.md +++ b/changelog.md @@ -9,7 +9,6 @@ ## New features * Allow passing a `neo4j.Driver` instance as input to `from_neo4j`, in which case the driver will be used internally to fetch the graph data using a simple query -* Added optional argument `dropna` to `from_dfs` loader allowing for not including NaN properties in the created visualization graph ## Bug fixes diff --git a/python-wrapper/src/neo4j_viz/pandas.py b/python-wrapper/src/neo4j_viz/pandas.py index 761a2c63..b07d9c39 100644 --- a/python-wrapper/src/neo4j_viz/pandas.py +++ b/python-wrapper/src/neo4j_viz/pandas.py @@ -71,7 +71,7 @@ def _parse_nodes( has_size &= "size" in node_df.columns for _, row in node_df.iterrows(): if dropna: - row.dropna(inplace=True) + row = row.dropna(inplace=False) top_level = {} properties = {} for key, value in row.to_dict().items(): @@ -104,7 +104,7 @@ def _parse_relationships( for rel_df in rel_dfs_iter: for _, row in rel_df.iterrows(): if dropna: - row.dropna(inplace=True) + row = row.dropna(inplace=False) top_level = {} properties = {} for key, value in row.to_dict().items(): @@ -127,7 +127,6 @@ def from_dfs( node_dfs: Optional[DFS_TYPE], rel_dfs: DFS_TYPE, node_radius_min_max: Optional[tuple[float, float]] = (3, 60), - dropna: bool = False, ) -> VisualizationGraph: """ Create a VisualizationGraph from pandas DataFrames representing a graph. @@ -146,9 +145,6 @@ def from_dfs( node_radius_min_max : tuple[float, float], optional Minimum and maximum node radius. To avoid tiny or huge nodes in the visualization, the node sizes are scaled to fit in the given range. - dropna : bool, optional - If True, NaN values will be dropped from the DataFrames before processing. - Defaults to False. """ - return _from_dfs(node_dfs, rel_dfs, node_radius_min_max, dropna=dropna) + return _from_dfs(node_dfs, rel_dfs, node_radius_min_max, dropna=False) From b696f77cf5164b91b572ff337c5a6ddbc8092f2c Mon Sep 17 00:00:00 2001 From: Adam Schill Collberg Date: Tue, 1 Jul 2025 10:07:38 +0200 Subject: [PATCH 4/4] Fix mypy complaints --- python-wrapper/tests/conftest.py | 2 ++ python-wrapper/tests/gds_helper.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python-wrapper/tests/conftest.py b/python-wrapper/tests/conftest.py index 081a7007..3f7aa6c7 100644 --- a/python-wrapper/tests/conftest.py +++ b/python-wrapper/tests/conftest.py @@ -43,7 +43,9 @@ def aura_ds_instance() -> Generator[Any, None, None]: # setting as environment variables to run notebooks with this connection os.environ["NEO4J_URI"] = dbms_connection_info.uri + assert isinstance(dbms_connection_info.username, str) os.environ["NEO4J_USER"] = dbms_connection_info.username + assert isinstance(dbms_connection_info.password, str) os.environ["NEO4J_PASSWORD"] = dbms_connection_info.password yield dbms_connection_info diff --git a/python-wrapper/tests/gds_helper.py b/python-wrapper/tests/gds_helper.py index e5fa270d..e5a0d3dc 100644 --- a/python-wrapper/tests/gds_helper.py +++ b/python-wrapper/tests/gds_helper.py @@ -62,8 +62,6 @@ def create_aurads_instance(api: AuraApi) -> tuple[str, DbmsConnectionInfo]: if wait_result.error: raise Exception(f"Error while waiting for instance to be running: {wait_result.error}") - wait_result.connection_url - return instance_details.id, DbmsConnectionInfo( uri=wait_result.connection_url, username="neo4j",