From 1570d8d39d8ef6fc31125e7ee14c8ae6a159c9ec Mon Sep 17 00:00:00 2001 From: Thomas Barbazuk Date: Sun, 3 Aug 2025 01:54:46 +0200 Subject: [PATCH 1/4] cleaned up before running on more pathways --- src/logic_network_generator.py | 72 ++++++++++++++-------------- src/pathway_generator.py | 2 +- src/reaction_generator.py | 86 +++++++++++++++++++++++++++++----- 3 files changed, 114 insertions(+), 46 deletions(-) diff --git a/src/logic_network_generator.py b/src/logic_network_generator.py index 3ae850c..8440046 100755 --- a/src/logic_network_generator.py +++ b/src/logic_network_generator.py @@ -10,7 +10,7 @@ graph: Graph = Graph(uri, auth=("neo4j", "test")) -def _get_reactome_id_from_hash(decomposed_uid_mapping: pd.DataFrame, hash_value: str) -> int: +def get_reactome_id_from_hash(decomposed_uid_mapping: pd.DataFrame, hash_value: str) -> int: """Extract reactome_id for a given hash from decomposed_uid_mapping.""" return decomposed_uid_mapping.loc[ decomposed_uid_mapping["uid"] == hash_value, "reactome_id" @@ -35,14 +35,14 @@ def create_reaction_id_map( rows = [] for _, match in best_matches.iterrows(): - incomming_hash = match["incomming"] + incoming_hash = match["incoming"] outgoing_hash = match["outgoing"] - reactome_id = _get_reactome_id_from_hash(decomposed_uid_mapping, incomming_hash) + reactome_id = get_reactome_id_from_hash(decomposed_uid_mapping, incoming_hash) row = { "uid": str(uuid.uuid4()), "reactome_id": int(reactome_id), - "input_hash": incomming_hash, + "input_hash": incoming_hash, "output_hash": outgoing_hash, } print("row") @@ -55,39 +55,43 @@ def create_reaction_id_map( def create_uid_reaction_connections( - reaction_id_map: pd.DataFrame, - best_matches: pd.DataFrame, + reaction_id_map: pd.DataFrame, + best_matches: pd.DataFrame, decomposed_uid_mapping: pd.DataFrame ) -> pd.DataFrame: """Create connections between reaction UIDs based on best matches.""" - + reactome_id_to_uid_mapping = dict( zip(reaction_id_map["reactome_id"], reaction_id_map["uid"]) 
) - + uid_reaction_connections_data = [] - + for _, match in best_matches.iterrows(): - incomming_hash = match["incomming"] + incoming_hash = match["incoming"] outgoing_hash = match["outgoing"] - + # Get reactome IDs for both hashes - preceding_reaction_id = _get_reactome_id_from_hash(decomposed_uid_mapping, incomming_hash) - following_reaction_id = _get_reactome_id_from_hash(decomposed_uid_mapping, outgoing_hash) - + preceding_reaction_id = get_reactome_id_from_hash(decomposed_uid_mapping, incoming_hash) + following_reaction_id = get_reactome_id_from_hash(decomposed_uid_mapping, outgoing_hash) + # Get corresponding UIDs preceding_uid = reactome_id_to_uid_mapping.get(preceding_reaction_id) following_uid = reactome_id_to_uid_mapping.get(following_reaction_id) - + # Only add connection if both UIDs exist if preceding_uid is not None and following_uid is not None: uid_reaction_connections_data.append({ - "preceding_uid": preceding_uid, + "preceding_uid": preceding_uid, "following_uid": following_uid }) - - return pd.DataFrame(uid_reaction_connections_data) + # Fix: Always create DataFrame with proper columns, even if empty + if uid_reaction_connections_data: + return pd.DataFrame(uid_reaction_connections_data) + else: + # Return empty DataFrame with correct column structure + return pd.DataFrame(columns=["preceding_uid", "following_uid"]) def _execute_regulator_query( graph: Graph, @@ -213,29 +217,29 @@ def get_negative_regulators_for_reaction( ) -def _get_non_null_values(df: pd.DataFrame, column: str) -> List[Any]: +def get_non_null_values(df: pd.DataFrame, column: str) -> List[Any]: """Extract non-null values from a DataFrame column.""" return [value for value in df[column].tolist() if pd.notna(value)] -def _get_hash_for_reaction(reaction_id_map: pd.DataFrame, uid: str, hash_type: str) -> str: +def get_hash_for_reaction(reaction_id_map: pd.DataFrame, uid: str, hash_type: str) -> str: """Get input_hash or output_hash for a given reaction UID.""" return 
reaction_id_map.loc[ reaction_id_map["uid"] == uid, hash_type ].iloc[0] -def _extract_uid_and_reactome_values(decomposed_uid_mapping: pd.DataFrame, hash_value: str) -> tuple: +def extract_uid_and_reactome_values(decomposed_uid_mapping: pd.DataFrame, hash_value: str) -> tuple: """Extract UID and Reactome ID values for a given hash.""" filtered_rows = decomposed_uid_mapping[decomposed_uid_mapping["uid"] == hash_value] - uid_values = _get_non_null_values(filtered_rows, "input_or_output_uid") - reactome_id_values = _get_non_null_values(filtered_rows, "input_or_output_reactome_id") + uid_values = get_non_null_values(filtered_rows, "input_or_output_uid") + reactome_id_values = get_non_null_values(filtered_rows, "input_or_output_reactome_id") return uid_values, reactome_id_values -def _assign_uuids(reactome_ids: List[str], reactome_id_to_uuid: Dict[str, str]) -> List[str]: +def assign_uuids(reactome_ids: List[str], reactome_id_to_uuid: Dict[str, str]) -> List[str]: """Assign UUIDs to Reactome IDs, creating new ones if they don't exist.""" return [ reactome_id_to_uuid.setdefault(reactome_id, str(uuid.uuid4())) @@ -243,7 +247,7 @@ def _assign_uuids(reactome_ids: List[str], reactome_id_to_uuid: Dict[str, str]) ] -def _determine_edge_properties(input_uid_values: List[Any]) -> tuple: +def determine_edge_properties(input_uid_values: List[Any]) -> tuple: """Determine and_or and edge_type based on input UID values.""" if input_uid_values: return "and", "input" @@ -251,7 +255,7 @@ def _determine_edge_properties(input_uid_values: List[Any]) -> tuple: return "or", "output" -def _add_pathway_connections( +def add_pathway_connections( input_uuids: List[str], output_uuids: List[str], and_or: str, @@ -283,8 +287,8 @@ def extract_inputs_and_outputs( for reaction_uid in reaction_uids: # Extract input information - input_hash = _get_hash_for_reaction(reaction_id_map, reaction_uid, "input_hash") - input_uid_values, input_reactome_id_values = _extract_uid_and_reactome_values( + input_hash = 
get_hash_for_reaction(reaction_id_map, reaction_uid, "input_hash") + input_uid_values, input_reactome_id_values = extract_uid_and_reactome_values( decomposed_uid_mapping, input_hash ) @@ -295,20 +299,20 @@ def extract_inputs_and_outputs( for preceding_uid in preceding_uids: # Extract output information - output_hash = _get_hash_for_reaction(reaction_id_map, preceding_uid, "output_hash") - output_uid_values, output_reactome_id_values = _extract_uid_and_reactome_values( + output_hash = get_hash_for_reaction(reaction_id_map, preceding_uid, "output_hash") + output_uid_values, output_reactome_id_values = extract_uid_and_reactome_values( decomposed_uid_mapping, output_hash ) # Assign UUIDs - input_uuids = _assign_uuids(input_reactome_id_values, reactome_id_to_uuid) - output_uuids = _assign_uuids(output_reactome_id_values, reactome_id_to_uuid) + input_uuids = assign_uuids(input_reactome_id_values, reactome_id_to_uuid) + output_uuids = assign_uuids(output_reactome_id_values, reactome_id_to_uuid) # Determine edge properties - and_or, edge_type = _determine_edge_properties(input_uid_values) + and_or, edge_type = determine_edge_properties(input_uid_values) # Add connections to pathway network - _add_pathway_connections( + add_pathway_connections( input_uuids, output_uuids, and_or, edge_type, pathway_logic_network_data ) diff --git a/src/pathway_generator.py b/src/pathway_generator.py index 53440e0..57322b4 100755 --- a/src/pathway_generator.py +++ b/src/pathway_generator.py @@ -43,7 +43,7 @@ def generate_pathway_file( pathway_id, reaction_connections ) best_matches = pd.DataFrame( - best_matches_list, columns=["incomming", "outgoing"] + best_matches_list, columns=["incoming", "outgoing"] ) decomposed_uid_mapping.to_csv(decomposed_uid_mapping_file, index=False) best_matches.to_csv(best_matches_file, index=False) diff --git a/src/reaction_generator.py b/src/reaction_generator.py index ba5fc79..b2e8f2d 100755 --- a/src/reaction_generator.py +++ b/src/reaction_generator.py @@ 
-2,7 +2,7 @@ import itertools import uuid import warnings -from typing import Any, Dict, List, Set, Tuple +from typing import Any, Dict, List, Set, Tuple, Optional import pandas as pd @@ -31,6 +31,28 @@ ReactomeID = str DataFrameRow = Dict[str, Any] +# Enhanced functional equivalence groups configuration +FUNCTIONAL_EQUIVALENT_GROUPS = { + # Ubiquitin - classic case of functionally equivalent isoforms + "68524": { + "group_name": "ubiquitin_isoforms", + "strategy": "use_canonical", + "canonical_representative": "68524", + "description": "Ubiquitin isoforms - all serve identical protein degradation signaling function", + "biological_rationale": "Multiple ubiquitin genes produce identical/near-identical proteins" + }, + + # Template for other functional groups + # "entity_id": { + # "group_name": "group_name", + # "strategy": "use_canonical", # or "representative_sampling", "context_dependent" + # "canonical_representative": "representative_id", + # "description": "Brief description", + # "biological_rationale": "Why these are functionally equivalent" + # } +} + + decomposed_uid_mapping = pd.DataFrame( columns=decomposed_uid_mapping_column_types.keys() ).astype( # type: ignore @@ -41,8 +63,6 @@ def get_component_id_or_reference_entity_id(reactome_id): - global reference_entity_dict - if reactome_id in reference_entity_dict: component_id = reference_entity_dict[reactome_id] return component_id @@ -61,6 +81,26 @@ def is_valid_uuid(identifier: Any) -> bool: return True if len(identifier) == 64 else False +def is_functionally_equivalent_entity(entity_id: str) -> Optional[Dict[str, Any]]: + """Check if entity is part of a functional equivalence group.""" + return FUNCTIONAL_EQUIVALENT_GROUPS.get(entity_id) + + +def apply_functional_equivalence_strategy( + entity_id: str, + functional_group: Dict[str, Any], + original_result: Set[str] +) -> Set[str]: + """Apply functional equivalence strategy to entity decomposition.""" + strategy = functional_group.get("strategy", 
"use_canonical") + + if strategy == "use_canonical": + canonical_rep = functional_group.get("canonical_representative", entity_id) + return {canonical_rep} + else: + return original_result + + def get_broken_apart_ids( broken_apart_members: list[set[str]], reactome_id: ReactomeID ) -> Set[UID]: @@ -77,16 +117,20 @@ def get_broken_apart_ids( new_broken_apart_members.append({member}) iterproduct_components = list(itertools.product(*new_broken_apart_members)) + total_combinations = len(iterproduct_components) + + if total_combinations > 1000: + logger.warning(f"Large combination set for reaction {reactome_id}: {total_combinations} combinations") + iterproduct_components_as_sets = [ set(map(str, item)) for item in iterproduct_components ] - uids = get_uids_for_iterproduct_components( - iterproduct_components_as_sets, reactome_id - ) + uids = get_uids_for_iterproduct_components(iterproduct_components_as_sets, reactome_id) else: uid = str(uuid.uuid4()) rows: List[DataFrameRow] = [] row: DataFrameRow + for member in broken_apart_members: if is_valid_uuid(member): component_ids = decomposed_uid_mapping.loc[ @@ -110,13 +154,14 @@ def get_broken_apart_ids( "component_id": member, "reactome_id": reactome_id, "component_id_or_reference_entity_id": get_component_id_or_reference_entity_id( - component_id + member # Fixed: was using undefined 'component_id' ), "input_or_output_uid": None, "input_or_output_reactome_id": member, } rows.append(row) uids = {uid} + decomposed_uid_mapping = pd.concat([decomposed_uid_mapping, pd.DataFrame(rows)]) return uids @@ -131,6 +176,7 @@ def get_uids_for_iterproduct_components( uids: Set[UID] = set() for component in iterproduct_components: component_to_input_or_output: Dict[ComponentID, InputOutputID] = {} + for item in component: if is_valid_uuid(item): selected_rows = decomposed_uid_mapping.loc[ @@ -173,7 +219,23 @@ def get_uids_for_iterproduct_components( def break_apart_entity(entity_id: int) -> Set[str]: - """Break apart entity.""" + 
"""Break apart entity with functional equivalence support.""" + global decomposed_uid_mapping + + entity_str = str(entity_id) + + # Check for functional equivalence rules first + functional_group = is_functionally_equivalent_entity(entity_str) + if functional_group: + original_result = break_apart_entity_standard(entity_id) + return apply_functional_equivalence_strategy(entity_str, functional_group, original_result) + + # Use original decomposition logic + return break_apart_entity_standard(entity_id) + + +def break_apart_entity_standard(entity_id: int) -> Set[str]: + """Standard break_apart_entity logic.""" global decomposed_uid_mapping labels = get_labels(entity_id) @@ -197,12 +259,12 @@ def break_apart_entity(entity_id: int) -> Set[str]: contains_thing = contains_reference_gene_product_molecule_or_isoform(entity_id) if contains_thing: return set([str(entity_id)]) + member_ids = get_set_members(entity_id) member_list: List[str] = [] for member_id in member_ids: members = break_apart_entity(member_id) - if isinstance(members, set): member_list.extend(members) else: @@ -232,7 +294,6 @@ def break_apart_entity(entity_id: int) -> Set[str]: "SimpleEntity", ] ): - return {str(entity_id)} else: @@ -277,7 +338,10 @@ def get_decomposed_uid_mapping( """Get decomposed UID mapping.""" global decomposed_uid_mapping - decomposed_uid_mapping.drop(decomposed_uid_mapping.index, inplace=True) + # Fix: Properly reinitialize instead of dropping rows + decomposed_uid_mapping = pd.DataFrame( + columns=decomposed_uid_mapping_column_types.keys() + ).astype(decomposed_uid_mapping_column_types) reaction_ids = pd.unique( reaction_connections[ From cba91a812b9e07cbe90bf6000830880ec30dea3e Mon Sep 17 00:00:00 2001 From: Thomas Barbazuk Date: Mon, 4 Aug 2025 18:44:27 +0200 Subject: [PATCH 2/4] added in stoichiomtery. 
Runs well, more changes fortcoming --- src/decomposed_uid_mapping.py | 3 + src/logic_network_generator.py | 105 ++++++++++++++++++++---- src/neo4j_connector.py | 46 ++++++++--- src/reaction_generator.py | 145 ++++++++++++++++++++++++--------- 4 files changed, 233 insertions(+), 66 deletions(-) diff --git a/src/decomposed_uid_mapping.py b/src/decomposed_uid_mapping.py index 384f0e5..85fd5b0 100644 --- a/src/decomposed_uid_mapping.py +++ b/src/decomposed_uid_mapping.py @@ -7,4 +7,7 @@ "component_id_or_reference_entity_id": pd.Int64Dtype(), "input_or_output_uid": str, "input_or_output_reactome_id": pd.Int64Dtype(), + "stoichiometry": pd.Int64Dtype(), + "entity_stoichiometry": pd.Int64Dtype(), # For original entity stoichiometry + "component_stoichiometry": pd.Int64Dtype(), # For complex component stoichiometry } diff --git a/src/logic_network_generator.py b/src/logic_network_generator.py index 8440046..01f9edc 100755 --- a/src/logic_network_generator.py +++ b/src/logic_network_generator.py @@ -122,7 +122,7 @@ def _execute_regulator_query( def get_catalysts_for_reaction(reaction_id_map: DataFrame, graph: Graph) -> DataFrame: - """Get catalysts for reactions using Neo4j graph queries.""" + """Get catalysts for reactions with stoichiometry using Neo4j graph queries.""" catalyst_list = [] for _, row in reaction_id_map.iterrows(): @@ -130,8 +130,9 @@ def get_catalysts_for_reaction(reaction_id_map: DataFrame, graph: Graph) -> Data reaction_uuid = row["uid"] query = ( - f"MATCH (reaction:ReactionLikeEvent{{dbId: {reaction_id}}})-[:catalystActivity]->(catalystActivity:CatalystActivity)-[:physicalEntity]->(catalyst:PhysicalEntity) " - f"RETURN reaction.dbId AS reaction_id, catalyst.dbId AS catalyst_id, 'catalyst' AS edge_type" + f"MATCH (reaction:ReactionLikeEvent{{dbId: {reaction_id}}})-[r:catalystActivity]->(catalystActivity:CatalystActivity)-[:physicalEntity]->(catalyst:PhysicalEntity) " + f"RETURN reaction.dbId AS reaction_id, catalyst.dbId AS catalyst_id, " + 
f"COALESCE(r.stoichiometry, 1) AS stoichiometry, 'catalyst' AS edge_type" ) try: @@ -147,7 +148,7 @@ def get_catalysts_for_reaction(reaction_id_map: DataFrame, graph: Graph) -> Data return pd.DataFrame( catalyst_list, - columns=["reaction_id", "catalyst_id", "edge_type", "uuid", "reaction_uuid"], + columns=["reaction_id", "catalyst_id", "stoichiometry", "edge_type", "uuid", "reaction_uuid"], ) @@ -283,27 +284,37 @@ def extract_inputs_and_outputs( reactome_id_to_uuid: Dict[str, str], pathway_logic_network_data: List[Dict[str, Any]], ) -> None: - """Extract inputs and outputs for reactions and add them to the pathway network.""" + """Extract inputs and outputs with stoichiometry support.""" for reaction_uid in reaction_uids: - # Extract input information + # Extract input information WITH stoichiometry input_hash = get_hash_for_reaction(reaction_id_map, reaction_uid, "input_hash") input_uid_values, input_reactome_id_values = extract_uid_and_reactome_values( decomposed_uid_mapping, input_hash ) + # NEW: Extract stoichiometry for inputs + input_stoichiometry = extract_stoichiometry_for_entity( + decomposed_uid_mapping, input_hash, input_reactome_id_values + ) + # Process preceding reactions (outputs) preceding_uids = uid_reaction_connections[ uid_reaction_connections["following_uid"] == reaction_uid ]["preceding_uid"].tolist() for preceding_uid in preceding_uids: - # Extract output information + # Extract output information WITH stoichiometry output_hash = get_hash_for_reaction(reaction_id_map, preceding_uid, "output_hash") output_uid_values, output_reactome_id_values = extract_uid_and_reactome_values( decomposed_uid_mapping, output_hash ) + # NEW: Extract stoichiometry for outputs + output_stoichiometry = extract_stoichiometry_for_entity( + decomposed_uid_mapping, output_hash, output_reactome_id_values + ) + # Assign UUIDs input_uuids = assign_uuids(input_reactome_id_values, reactome_id_to_uuid) output_uuids = assign_uuids(output_reactome_id_values, 
reactome_id_to_uuid) @@ -311,9 +322,11 @@ def extract_inputs_and_outputs( # Determine edge properties and_or, edge_type = determine_edge_properties(input_uid_values) - # Add connections to pathway network - add_pathway_connections( - input_uuids, output_uuids, and_or, edge_type, pathway_logic_network_data + # Add connections with stoichiometry + add_pathway_connections_with_stoichiometry( + input_uuids, output_uuids, + input_stoichiometry, output_stoichiometry, + and_or, edge_type, pathway_logic_network_data ) @@ -326,25 +339,29 @@ def append_regulators( and_or: str, edge_type: str, ) -> None: - """Append regulatory relationships to the pathway network.""" + """Append regulatory relationships with stoichiometry.""" regulator_configs = [ (catalyst_map, "pos", "catalyst"), - (negative_regulator_map, "neg", "regulator"), + (negative_regulator_map, "neg", "regulator"), (positive_regulator_map, "pos", "regulator"), ] for map_df, pos_neg, edge_type_override in regulator_configs: for _, row in map_df.iterrows(): + stoichiometry = row.get("stoichiometry", 1) # Get stoichiometry if available + pathway_logic_network_data.append({ "source_id": row["uuid"], "target_id": row["reaction_uuid"], "pos_neg": pos_neg, "and_or": and_or, "edge_type": edge_type_override, + "source_stoichiometry": stoichiometry, + "target_stoichiometry": 1, # Reactions are typically 1:1 with regulators + "stoichiometry_ratio": 1.0, }) - def _calculate_reaction_statistics(reaction_connections: pd.DataFrame) -> None: """Calculate and print statistics about reactions without preceding events.""" reactions_without_preceding_events = reaction_connections[ @@ -375,21 +392,75 @@ def _print_regulator_statistics( ) +def extract_stoichiometry_for_entity( + decomposed_uid_mapping: pd.DataFrame, + hash_value: str, + entity_reactome_ids: List[str] +) -> Dict[str, int]: + """Extract stoichiometry information for entities.""" + stoichiometry_map = {} + + for reactome_id in entity_reactome_ids: + # Get stoichiometry 
from decomposed mapping + filtered_rows = decomposed_uid_mapping[ + (decomposed_uid_mapping["uid"] == hash_value) & + (decomposed_uid_mapping["input_or_output_reactome_id"] == reactome_id) + ] + + if not filtered_rows.empty: + stoichiometry = filtered_rows["stoichiometry"].iloc[0] + stoichiometry_map[reactome_id] = stoichiometry if pd.notna(stoichiometry) else 1 + else: + stoichiometry_map[reactome_id] = 1 + + return stoichiometry_map + +def add_pathway_connections_with_stoichiometry( + input_uuids: List[str], + output_uuids: List[str], + input_stoichiometry: Dict[str, int], + output_stoichiometry: Dict[str, int], + and_or: str, + edge_type: str, + pathway_logic_network_data: List[Dict[str, Any]] +) -> None: + """Add connections with stoichiometry information.""" + + for input_uuid in input_uuids: + for output_uuid in output_uuids: + # Calculate stoichiometry ratio + input_stoich = input_stoichiometry.get(input_uuid, 1) + output_stoich = output_stoichiometry.get(output_uuid, 1) + stoich_ratio = output_stoich / input_stoich if input_stoich > 0 else 1.0 + + pathway_logic_network_data.append({ + "source_id": input_uuid, + "target_id": output_uuid, + "pos_neg": "pos", + "and_or": and_or, + "edge_type": edge_type, + "source_stoichiometry": input_stoich, + "target_stoichiometry": output_stoich, + "stoichiometry_ratio": stoich_ratio, + }) + def create_pathway_logic_network( decomposed_uid_mapping: pd.DataFrame, reaction_connections: pd.DataFrame, best_matches: Any, ) -> pd.DataFrame: - """Create a pathway logic network from decomposed UID mappings and reaction connections.""" - logger.debug("Adding reaction pairs to pathway_logic_network") - - # Initialize data structures + """Create a pathway logic network with stoichiometry support.""" + + # Enhanced column structure columns = { "source_id": pd.Series(dtype="Int64"), "target_id": pd.Series(dtype="Int64"), "pos_neg": pd.Series(dtype="str"), "and_or": pd.Series(dtype="str"), "edge_type": pd.Series(dtype="str"), + 
"source_stoichiometry": pd.Series(dtype="Int64"), # NEW + "target_stoichiometry": pd.Series(dtype="Int64"), # NEW + "stoichiometry_ratio": pd.Series(dtype="float"), # NEW } pathway_logic_network_data = [] diff --git a/src/neo4j_connector.py b/src/neo4j_connector.py index 66bf4fb..502afc4 100755 --- a/src/neo4j_connector.py +++ b/src/neo4j_connector.py @@ -65,21 +65,23 @@ def get_labels(entity_id: int) -> List[str]: raise -def get_complex_components(entity_id: int) -> Set[int]: +def get_complex_components_with_stoichiometry(entity_id: int) -> List[Dict[str, Any]]: query_get_components_template: str = """ - MATCH (entity)-[:hasComponent]->(component) + MATCH (entity)-[r:hasComponent]->(component) WHERE entity.dbId = %s - RETURN collect(component.dbId) AS component_ids + RETURN component.dbId AS component_id, + COALESCE(r.stoichiometry, 1) AS stoichiometry """ query: str = query_get_components_template % entity_id try: - return set(graph.run(query).data()[0]["component_ids"]) + result = graph.run(query).data() + print(f"Complex {entity_id} components with stoichiometry: {result}") + return result except Exception: - logger.error("Error in get_complex_components", exc_info=True) + logger.error("Error in get_complex_components_with_stoichiometry", exc_info=True) raise - def get_set_members(entity_id: int) -> Set[int]: query_get_members_template: str = """ MATCH (entity)-[:hasCandidate|hasMember]->(member) @@ -111,22 +113,24 @@ def get_reactions(pathway_id: int, taxon_id: str) -> List[int]: raise -def get_reaction_input_output_ids(reaction_id: int, input_or_output: str) -> Set[int]: +def get_reaction_input_output_ids_with_stoichiometry(reaction_id: int, input_or_output: str) -> List[Dict[str, Any]]: query_template: str = """ - MATCH (reaction)-[:%s]-(io) + MATCH (reaction)-[r:%s]->(io) WHERE (reaction:Reaction OR reaction:ReactionLikeEvent) AND reaction.dbId=%s - RETURN COLLECT(io.dbId) AS io_ids + RETURN io.dbId AS entity_id, + COALESCE(r.stoichiometry, 1) AS 
stoichiometry """ relation_type: str = "input" if input_or_output == "input" else "output" query: str = query_template % (relation_type, reaction_id) try: - return set(graph.run(query).data()[0]["io_ids"]) + result = graph.run(query).data() + print(f"Reaction {reaction_id} {input_or_output}s with stoichiometry: {result}") + return result except Exception: - logger.error("Error in get_reaction_input_output_ids", exc_info=True) + logger.error("Error in get_reaction_input_output_ids_with_stoichiometry", exc_info=True) raise - def get_reference_entity_id(entity_id: int) -> Union[str, None]: query_template: str = """ MATCH (reference_database:ReferenceDatabase)<-[:referenceDatabase]-(reference_entity_gene:ReferenceEntity)<-[:referenceGene]-(reference_entity:ReferenceEntity)<-[:referenceEntity]-(pe:PhysicalEntity) @@ -164,3 +168,21 @@ def contains_reference_gene_product_molecule_or_isoform(entity_id: int) -> bool: exc_info=True, ) raise e + + +def get_catalyst_stoichiometry(reaction_id: int) -> List[Dict[str, Any]]: + query_template: str = """ + MATCH (reaction:ReactionLikeEvent)-[r:catalystActivity]->(ca:CatalystActivity)-[:physicalEntity]->(catalyst:PhysicalEntity) + WHERE reaction.dbId = %s + RETURN catalyst.dbId AS catalyst_id, + COALESCE(r.stoichiometry, 1) AS stoichiometry + """ + query: str = query_template % reaction_id + + try: + result = graph.run(query).data() + print(f"Reaction {reaction_id} catalysts with stoichiometry: {result}") + return result + except Exception: + logger.error("Error in get_catalyst_stoichiometry", exc_info=True) + raise diff --git a/src/reaction_generator.py b/src/reaction_generator.py index b2e8f2d..5bfd287 100755 --- a/src/reaction_generator.py +++ b/src/reaction_generator.py @@ -11,9 +11,9 @@ from src.decomposed_uid_mapping import decomposed_uid_mapping_column_types from src.neo4j_connector import ( contains_reference_gene_product_molecule_or_isoform, - get_complex_components, + get_complex_components_with_stoichiometry, # MODIFIED 
IMPORT get_labels, - get_reaction_input_output_ids, + get_reaction_input_output_ids_with_stoichiometry, # MODIFIED IMPORT get_reference_entity_id, get_set_members, ) @@ -41,23 +41,19 @@ "description": "Ubiquitin isoforms - all serve identical protein degradation signaling function", "biological_rationale": "Multiple ubiquitin genes produce identical/near-identical proteins" }, - - # Template for other functional groups - # "entity_id": { - # "group_name": "group_name", - # "strategy": "use_canonical", # or "representative_sampling", "context_dependent" - # "canonical_representative": "representative_id", - # "description": "Brief description", - # "biological_rationale": "Why these are functionally equivalent" - # } } +# MODIFIED: Add stoichiometry columns to the schema +enhanced_decomposed_uid_mapping_column_types = { + **decomposed_uid_mapping_column_types, + "stoichiometry": pd.Int64Dtype(), + "entity_stoichiometry": pd.Int64Dtype(), + "component_stoichiometry": pd.Int64Dtype(), +} decomposed_uid_mapping = pd.DataFrame( - columns=decomposed_uid_mapping_column_types.keys() -).astype( # type: ignore - decomposed_uid_mapping_column_types -) + columns=enhanced_decomposed_uid_mapping_column_types.keys() +).astype(enhanced_decomposed_uid_mapping_column_types) reference_entity_dict: Dict[str, str] = {} @@ -102,11 +98,16 @@ def apply_functional_equivalence_strategy( def get_broken_apart_ids( - broken_apart_members: list[set[str]], reactome_id: ReactomeID + broken_apart_members: list[set[str]], + reactome_id: ReactomeID, + stoichiometry_map: Dict[str, int] = None # NEW PARAMETER ) -> Set[UID]: - """Get broken apart IDs.""" + """Get broken apart IDs with stoichiometry support.""" global decomposed_uid_mapping + if stoichiometry_map is None: + stoichiometry_map = {} + uids: Set[UID] if any(isinstance(member, set) for member in broken_apart_members): new_broken_apart_members = [] @@ -125,7 +126,9 @@ def get_broken_apart_ids( iterproduct_components_as_sets = [ set(map(str, 
item)) for item in iterproduct_components ] - uids = get_uids_for_iterproduct_components(iterproduct_components_as_sets, reactome_id) + uids = get_uids_for_iterproduct_components( + iterproduct_components_as_sets, reactome_id, stoichiometry_map + ) else: uid = str(uuid.uuid4()) rows: List[DataFrameRow] = [] @@ -137,6 +140,9 @@ def get_broken_apart_ids( decomposed_uid_mapping["uid"] == member, "component_id" ].tolist() for component_id in component_ids: + # NEW: Add stoichiometry information + stoichiometry = stoichiometry_map.get(component_id, 1) + row = { "uid": uid, "component_id": component_id, @@ -146,18 +152,27 @@ def get_broken_apart_ids( ), "input_or_output_uid": member, "input_or_output_reactome_id": None, + "stoichiometry": stoichiometry, # NEW + "entity_stoichiometry": stoichiometry, # NEW + "component_stoichiometry": 1, # NEW } rows.append(row) else: + # NEW: Add stoichiometry information + stoichiometry = stoichiometry_map.get(member, 1) + row = { "uid": uid, "component_id": member, "reactome_id": reactome_id, "component_id_or_reference_entity_id": get_component_id_or_reference_entity_id( - member # Fixed: was using undefined 'component_id' + member ), "input_or_output_uid": None, "input_or_output_reactome_id": member, + "stoichiometry": stoichiometry, # NEW + "entity_stoichiometry": stoichiometry, # NEW + "component_stoichiometry": 1, # NEW } rows.append(row) uids = {uid} @@ -168,14 +183,20 @@ def get_broken_apart_ids( def get_uids_for_iterproduct_components( - iterproduct_components: List[Set[ComponentID]], reactome_id: ReactomeID + iterproduct_components: List[Set[ComponentID]], + reactome_id: ReactomeID, + stoichiometry_map: Dict[str, int] = None # NEW PARAMETER ) -> Set[UID]: - """Get UID for iterproduct components.""" + """Get UID for iterproduct components with stoichiometry support.""" global decomposed_uid_mapping + if stoichiometry_map is None: + stoichiometry_map = {} + uids: Set[UID] = set() for component in iterproduct_components: 
component_to_input_or_output: Dict[ComponentID, InputOutputID] = {} + component_stoichiometries: Dict[ComponentID, int] = {} # NEW for item in component: if is_valid_uuid(item): @@ -185,12 +206,17 @@ def get_uids_for_iterproduct_components( for index, selected_row in selected_rows.iterrows(): component_id = selected_row["component_id"] component_to_input_or_output[component_id] = item + # NEW: Get stoichiometry from existing data + component_stoichiometries[component_id] = selected_row.get("stoichiometry", 1) else: component_to_input_or_output[item] = item + # NEW: Get stoichiometry from the map + component_stoichiometries[item] = stoichiometry_map.get(item, 1) - uid = hashlib.sha256( - str(sorted(component_to_input_or_output.values())).encode() - ).hexdigest() + # MODIFIED: Include stoichiometry in UID calculation + uid_data = [(comp, io, component_stoichiometries.get(comp, 1)) + for comp, io in sorted(component_to_input_or_output.items())] + uid = hashlib.sha256(str(uid_data).encode()).hexdigest() rows: List[DataFrameRow] = [] for component_id, input_or_output_id in component_to_input_or_output.items(): @@ -200,6 +226,10 @@ def get_uids_for_iterproduct_components( input_or_output_reactome_id = ( input_or_output_id if not is_valid_uuid(input_or_output_id) else None ) + + # NEW: Get stoichiometry information + stoichiometry = component_stoichiometries.get(component_id, 1) + row: DataFrameRow = { "uid": uid, "component_id": component_id, @@ -209,6 +239,9 @@ def get_uids_for_iterproduct_components( ), "input_or_output_uid": input_or_output_uid, "input_or_output_reactome_id": input_or_output_reactome_id, + "stoichiometry": stoichiometry, # NEW + "entity_stoichiometry": stoichiometry, # NEW + "component_stoichiometry": stoichiometry, # NEW } rows.append(row) @@ -235,7 +268,7 @@ def break_apart_entity(entity_id: int) -> Set[str]: def break_apart_entity_standard(entity_id: int) -> Set[str]: - """Standard break_apart_entity logic.""" + """Standard break_apart_entity logic 
with stoichiometry support.""" global decomposed_uid_mapping labels = get_labels(entity_id) @@ -274,13 +307,27 @@ def break_apart_entity_standard(entity_id: int) -> Set[str]: elif "Complex" in labels: broken_apart_members: List[Set[str]] = [] - member_ids = get_complex_components(entity_id) + + # MODIFIED: Get components with stoichiometry + component_data = get_complex_components_with_stoichiometry(entity_id) + print(f"Complex {entity_id} component data: {component_data}") + + component_stoichiometry = {} + member_ids = [] + + for component_info in component_data: + component_id = component_info['component_id'] + stoichiometry = component_info['stoichiometry'] + component_stoichiometry[str(component_id)] = stoichiometry + member_ids.append(component_id) for member_id in member_ids: members = break_apart_entity(member_id) broken_apart_members.append(members) - return get_broken_apart_ids(broken_apart_members, str(entity_id)) + return get_broken_apart_ids( + broken_apart_members, str(entity_id), component_stoichiometry + ) elif any( entity_label in labels @@ -302,25 +349,41 @@ def break_apart_entity_standard(entity_id: int) -> Set[str]: def decompose_by_reactions(reaction_ids: List[int]) -> List[Any]: - """Decompose by reactions.""" + """Decompose by reactions with stoichiometry support.""" global decomposed_uid_mapping - logger.debug("Decomposing reactions") + logger.debug("Decomposing reactions with stoichiometry") all_best_matches = [] for reaction_id in reaction_ids: - input_ids = get_reaction_input_output_ids(reaction_id, "input") + print(f"Processing reaction {reaction_id}") + + # MODIFIED: Get inputs with stoichiometry + input_data = get_reaction_input_output_ids_with_stoichiometry(reaction_id, "input") + input_ids = [item['entity_id'] for item in input_data] + input_stoichiometry = {str(item['entity_id']): item['stoichiometry'] for item in input_data} + + print(f"Reaction {reaction_id} inputs: {input_ids}") + print(f"Input stoichiometry: 
{input_stoichiometry}") + broken_apart_input_id = [break_apart_entity(input_id) for input_id in input_ids] input_combinations = get_broken_apart_ids( - broken_apart_input_id, str(reaction_id) + broken_apart_input_id, str(reaction_id), input_stoichiometry ) - output_ids = get_reaction_input_output_ids(reaction_id, "output") + # MODIFIED: Get outputs with stoichiometry + output_data = get_reaction_input_output_ids_with_stoichiometry(reaction_id, "output") + output_ids = [item['entity_id'] for item in output_data] + output_stoichiometry = {str(item['entity_id']): item['stoichiometry'] for item in output_data} + + print(f"Reaction {reaction_id} outputs: {output_ids}") + print(f"Output stoichiometry: {output_stoichiometry}") + broken_apart_output_id = [ break_apart_entity(output_id) for output_id in output_ids ] output_combinations = get_broken_apart_ids( - broken_apart_output_id, str(reaction_id) + broken_apart_output_id, str(reaction_id), output_stoichiometry ) [best_matches, _] = find_best_reaction_match( @@ -335,13 +398,13 @@ def decompose_by_reactions(reaction_ids: List[int]) -> List[Any]: def get_decomposed_uid_mapping( pathway_id: str, reaction_connections: pd.DataFrame ) -> Tuple[pd.DataFrame, List[Any]]: - """Get decomposed UID mapping.""" + """Get decomposed UID mapping with stoichiometry support.""" global decomposed_uid_mapping - # Fix: Properly reinitialize instead of dropping rows + # MODIFIED: Use enhanced column types decomposed_uid_mapping = pd.DataFrame( - columns=decomposed_uid_mapping_column_types.keys() - ).astype(decomposed_uid_mapping_column_types) + columns=enhanced_decomposed_uid_mapping_column_types.keys() + ).astype(enhanced_decomposed_uid_mapping_column_types) reaction_ids = pd.unique( reaction_connections[ @@ -351,6 +414,14 @@ def get_decomposed_uid_mapping( reaction_ids = reaction_ids[~pd.isna(reaction_ids)] # removing NA value from list reaction_ids = reaction_ids.astype(int).tolist() # converting to integer + + print(f"Processing 
{len(reaction_ids)} reactions for pathway {pathway_id}") + best_matches = decompose_by_reactions(list(reaction_ids)) + + print(f"Final decomposed_uid_mapping shape: {decomposed_uid_mapping.shape}") + print(f"Stoichiometry column summary:") + if 'stoichiometry' in decomposed_uid_mapping.columns: + print(decomposed_uid_mapping['stoichiometry'].value_counts()) return (decomposed_uid_mapping, best_matches) From 666e67d3f82a05ee4beeeaa08208e0d91c10cb80 Mon Sep 17 00:00:00 2001 From: Thomas Barbazuk Date: Mon, 4 Aug 2025 18:48:28 +0200 Subject: [PATCH 3/4] added in stoichiomtery. Runs well, more changes fortcoming --- src/logic_network_generator.py | 348 +++++++++++++++++++-------------- src/neo4j_connector.py | 14 +- src/pathway_generator.py | 4 +- src/reaction_generator.py | 100 ++++++---- 4 files changed, 267 insertions(+), 199 deletions(-) diff --git a/src/logic_network_generator.py b/src/logic_network_generator.py index 01f9edc..b767ef6 100755 --- a/src/logic_network_generator.py +++ b/src/logic_network_generator.py @@ -10,7 +10,9 @@ graph: Graph = Graph(uri, auth=("neo4j", "test")) -def get_reactome_id_from_hash(decomposed_uid_mapping: pd.DataFrame, hash_value: str) -> int: +def get_reactome_id_from_hash( + decomposed_uid_mapping: pd.DataFrame, hash_value: str +) -> int: """Extract reactome_id for a given hash from decomposed_uid_mapping.""" return decomposed_uid_mapping.loc[ decomposed_uid_mapping["uid"] == hash_value, "reactome_id" @@ -18,27 +20,27 @@ def get_reactome_id_from_hash(decomposed_uid_mapping: pd.DataFrame, hash_value: def create_reaction_id_map( - decomposed_uid_mapping: pd.DataFrame, - reaction_ids: List[int], - best_matches: pd.DataFrame + decomposed_uid_mapping: pd.DataFrame, + reaction_ids: List[int], + best_matches: pd.DataFrame, ) -> pd.DataFrame: """Create a mapping between reaction UIDs, reactome IDs, and input/output hashes.""" - + reaction_id_map_column_types = { "uid": str, "reactome_id": pd.Int64Dtype(), "input_hash": str, "output_hash": 
str, } - + print("Checking best_matches contents:") - + rows = [] for _, match in best_matches.iterrows(): incoming_hash = match["incoming"] outgoing_hash = match["outgoing"] reactome_id = get_reactome_id_from_hash(decomposed_uid_mapping, incoming_hash) - + row = { "uid": str(uuid.uuid4()), "reactome_id": int(reactome_id), @@ -48,23 +50,23 @@ def create_reaction_id_map( print("row") print(row) rows.append(row) - + reaction_id_map = pd.DataFrame(rows).astype(reaction_id_map_column_types) - + return reaction_id_map def create_uid_reaction_connections( reaction_id_map: pd.DataFrame, best_matches: pd.DataFrame, - decomposed_uid_mapping: pd.DataFrame + decomposed_uid_mapping: pd.DataFrame, ) -> pd.DataFrame: """Create connections between reaction UIDs based on best matches.""" reactome_id_to_uid_mapping = dict( zip(reaction_id_map["reactome_id"], reaction_id_map["uid"]) ) - + uid_reaction_connections_data = [] for _, match in best_matches.iterrows(): @@ -72,8 +74,12 @@ def create_uid_reaction_connections( outgoing_hash = match["outgoing"] # Get reactome IDs for both hashes - preceding_reaction_id = get_reactome_id_from_hash(decomposed_uid_mapping, incoming_hash) - following_reaction_id = get_reactome_id_from_hash(decomposed_uid_mapping, outgoing_hash) + preceding_reaction_id = get_reactome_id_from_hash( + decomposed_uid_mapping, incoming_hash + ) + following_reaction_id = get_reactome_id_from_hash( + decomposed_uid_mapping, outgoing_hash + ) # Get corresponding UIDs preceding_uid = reactome_id_to_uid_mapping.get(preceding_reaction_id) @@ -81,10 +87,9 @@ def create_uid_reaction_connections( # Only add connection if both UIDs exist if preceding_uid is not None and following_uid is not None: - uid_reaction_connections_data.append({ - "preceding_uid": preceding_uid, - "following_uid": following_uid - }) + uid_reaction_connections_data.append( + {"preceding_uid": preceding_uid, "following_uid": following_uid} + ) # Fix: Always create DataFrame with proper columns, even if 
empty if uid_reaction_connections_data: @@ -93,29 +98,29 @@ def create_uid_reaction_connections( # Return empty DataFrame with correct column structure return pd.DataFrame(columns=["preceding_uid", "following_uid"]) + def _execute_regulator_query( - graph: Graph, - query: str, - reaction_uuid: str, - function_name: str + graph: Graph, query: str, reaction_uuid: str, function_name: str ) -> List[Dict[str, Any]]: """Execute a regulator query and return processed results.""" try: result = graph.run(query) regulators = [] - + for record in result: regulator_uuid = str(uuid.uuid4()) - regulators.append({ - "reaction": reaction_uuid, - "PhysicalEntity": regulator_uuid, - "edge_type": "regulator", - "uuid": regulator_uuid, - "reaction_uuid": reaction_uuid, - }) - + regulators.append( + { + "reaction": reaction_uuid, + "PhysicalEntity": regulator_uuid, + "edge_type": "regulator", + "uuid": regulator_uuid, + "reaction_uuid": reaction_uuid, + } + ) + return regulators - + except Exception as e: logger.error(f"Error in {function_name}", exc_info=True) raise e @@ -124,60 +129,66 @@ def _execute_regulator_query( def get_catalysts_for_reaction(reaction_id_map: DataFrame, graph: Graph) -> DataFrame: """Get catalysts for reactions with stoichiometry using Neo4j graph queries.""" catalyst_list = [] - + for _, row in reaction_id_map.iterrows(): reaction_id = row["reactome_id"] reaction_uuid = row["uid"] - + query = ( f"MATCH (reaction:ReactionLikeEvent{{dbId: {reaction_id}}})-[r:catalystActivity]->(catalystActivity:CatalystActivity)-[:physicalEntity]->(catalyst:PhysicalEntity) " f"RETURN reaction.dbId AS reaction_id, catalyst.dbId AS catalyst_id, " f"COALESCE(r.stoichiometry, 1) AS stoichiometry, 'catalyst' AS edge_type" ) - + try: data = graph.run(query).data() for item in data: item["uuid"] = str(uuid.uuid4()) item["reaction_uuid"] = reaction_uuid catalyst_list.extend(data) - + except Exception as e: logger.error("Error in get_catalysts_for_reaction", exc_info=True) raise e - + 
return pd.DataFrame( catalyst_list, - columns=["reaction_id", "catalyst_id", "stoichiometry", "edge_type", "uuid", "reaction_uuid"], + columns=[ + "reaction_id", + "catalyst_id", + "stoichiometry", + "edge_type", + "uuid", + "reaction_uuid", + ], ) def get_positive_regulators_for_reaction( - reaction_id_mapping: DataFrame, - graph: Graph + reaction_id_mapping: DataFrame, graph: Graph ) -> DataFrame: """Get positive regulators for reactions using Neo4j graph queries.""" regulators_list = [] - + for _, row in reaction_id_mapping.iterrows(): reaction_id = row["reactome_id"] reaction_uuid = row["uid"] - + if pd.isna(reaction_uuid): logger.error(f"No UUID found for reaction ID {reaction_id}") continue - + query = ( f"MATCH (reaction)-[:regulatedBy]->(regulator:PositiveRegulation)-[:regulator]->(pe:PhysicalEntity) " f"WHERE reaction.dbId = {reaction_id} " "RETURN reaction.dbId as reaction, pe.dbId as PhysicalEntity" ) - + regulators = _execute_regulator_query( graph, query, reaction_uuid, "get_positive_regulators_for_reaction" ) regulators_list.extend(regulators) - + return pd.DataFrame( regulators_list, columns=["reaction", "PhysicalEntity", "edge_type", "uuid", "reaction_uuid"], @@ -186,31 +197,30 @@ def get_positive_regulators_for_reaction( def get_negative_regulators_for_reaction( - reaction_id_mapping: DataFrame, - graph: Graph + reaction_id_mapping: DataFrame, graph: Graph ) -> DataFrame: """Get negative regulators for reactions using Neo4j graph queries.""" regulators_list = [] - + for _, row in reaction_id_mapping.iterrows(): reaction_id = row["reactome_id"] reaction_uuid = row["uid"] - + if pd.isna(reaction_uuid): logger.error(f"No UUID found for reaction ID {reaction_id}") continue - + query = ( f"MATCH (reaction)-[:regulatedBy]->(regulator:NegativeRegulation)-[:regulator]->(pe:PhysicalEntity) " f"WHERE reaction.dbId = {reaction_id} " "RETURN reaction.dbId as reaction, pe.dbId as PhysicalEntity" ) - + regulators = _execute_regulator_query( graph, query, 
reaction_uuid, "get_negative_regulators_for_reaction" ) regulators_list.extend(regulators) - + return pd.DataFrame( regulators_list, columns=["reaction", "PhysicalEntity", "edge_type", "uuid", "reaction_uuid"], @@ -223,24 +233,30 @@ def get_non_null_values(df: pd.DataFrame, column: str) -> List[Any]: return [value for value in df[column].tolist() if pd.notna(value)] -def get_hash_for_reaction(reaction_id_map: pd.DataFrame, uid: str, hash_type: str) -> str: +def get_hash_for_reaction( + reaction_id_map: pd.DataFrame, uid: str, hash_type: str +) -> str: """Get input_hash or output_hash for a given reaction UID.""" - return reaction_id_map.loc[ - reaction_id_map["uid"] == uid, hash_type - ].iloc[0] + return reaction_id_map.loc[reaction_id_map["uid"] == uid, hash_type].iloc[0] -def extract_uid_and_reactome_values(decomposed_uid_mapping: pd.DataFrame, hash_value: str) -> tuple: +def extract_uid_and_reactome_values( + decomposed_uid_mapping: pd.DataFrame, hash_value: str +) -> tuple: """Extract UID and Reactome ID values for a given hash.""" filtered_rows = decomposed_uid_mapping[decomposed_uid_mapping["uid"] == hash_value] - + uid_values = get_non_null_values(filtered_rows, "input_or_output_uid") - reactome_id_values = get_non_null_values(filtered_rows, "input_or_output_reactome_id") - + reactome_id_values = get_non_null_values( + filtered_rows, "input_or_output_reactome_id" + ) + return uid_values, reactome_id_values -def assign_uuids(reactome_ids: List[str], reactome_id_to_uuid: Dict[str, str]) -> List[str]: +def assign_uuids( + reactome_ids: List[str], reactome_id_to_uuid: Dict[str, str] +) -> List[str]: """Assign UUIDs to Reactome IDs, creating new ones if they don't exist.""" return [ reactome_id_to_uuid.setdefault(reactome_id, str(uuid.uuid4())) @@ -257,22 +273,24 @@ def determine_edge_properties(input_uid_values: List[Any]) -> tuple: def add_pathway_connections( - input_uuids: List[str], - output_uuids: List[str], - and_or: str, + input_uuids: List[str], + 
output_uuids: List[str], + and_or: str, edge_type: str, - pathway_logic_network_data: List[Dict[str, Any]] + pathway_logic_network_data: List[Dict[str, Any]], ) -> None: """Add all input-output connections to the pathway network data.""" for input_uuid in input_uuids: for output_uuid in output_uuids: - pathway_logic_network_data.append({ - "source_id": input_uuid, - "target_id": output_uuid, - "pos_neg": "pos", - "and_or": and_or, - "edge_type": edge_type, - }) + pathway_logic_network_data.append( + { + "source_id": input_uuid, + "target_id": output_uuid, + "pos_neg": "pos", + "and_or": and_or, + "edge_type": edge_type, + } + ) def extract_inputs_and_outputs( @@ -285,48 +303,54 @@ def extract_inputs_and_outputs( pathway_logic_network_data: List[Dict[str, Any]], ) -> None: """Extract inputs and outputs with stoichiometry support.""" - + for reaction_uid in reaction_uids: # Extract input information WITH stoichiometry input_hash = get_hash_for_reaction(reaction_id_map, reaction_uid, "input_hash") input_uid_values, input_reactome_id_values = extract_uid_and_reactome_values( decomposed_uid_mapping, input_hash ) - + # NEW: Extract stoichiometry for inputs input_stoichiometry = extract_stoichiometry_for_entity( decomposed_uid_mapping, input_hash, input_reactome_id_values ) - + # Process preceding reactions (outputs) preceding_uids = uid_reaction_connections[ uid_reaction_connections["following_uid"] == reaction_uid ]["preceding_uid"].tolist() - + for preceding_uid in preceding_uids: # Extract output information WITH stoichiometry - output_hash = get_hash_for_reaction(reaction_id_map, preceding_uid, "output_hash") - output_uid_values, output_reactome_id_values = extract_uid_and_reactome_values( - decomposed_uid_mapping, output_hash + output_hash = get_hash_for_reaction( + reaction_id_map, preceding_uid, "output_hash" + ) + output_uid_values, output_reactome_id_values = ( + extract_uid_and_reactome_values(decomposed_uid_mapping, output_hash) ) - + # NEW: Extract 
stoichiometry for outputs output_stoichiometry = extract_stoichiometry_for_entity( decomposed_uid_mapping, output_hash, output_reactome_id_values ) - + # Assign UUIDs input_uuids = assign_uuids(input_reactome_id_values, reactome_id_to_uuid) output_uuids = assign_uuids(output_reactome_id_values, reactome_id_to_uuid) - + # Determine edge properties and_or, edge_type = determine_edge_properties(input_uid_values) - + # Add connections with stoichiometry add_pathway_connections_with_stoichiometry( - input_uuids, output_uuids, - input_stoichiometry, output_stoichiometry, - and_or, edge_type, pathway_logic_network_data + input_uuids, + output_uuids, + input_stoichiometry, + output_stoichiometry, + and_or, + edge_type, + pathway_logic_network_data, ) @@ -340,27 +364,32 @@ def append_regulators( edge_type: str, ) -> None: """Append regulatory relationships with stoichiometry.""" - + regulator_configs = [ (catalyst_map, "pos", "catalyst"), - (negative_regulator_map, "neg", "regulator"), + (negative_regulator_map, "neg", "regulator"), (positive_regulator_map, "pos", "regulator"), ] - + for map_df, pos_neg, edge_type_override in regulator_configs: for _, row in map_df.iterrows(): - stoichiometry = row.get("stoichiometry", 1) # Get stoichiometry if available - - pathway_logic_network_data.append({ - "source_id": row["uuid"], - "target_id": row["reaction_uuid"], - "pos_neg": pos_neg, - "and_or": and_or, - "edge_type": edge_type_override, - "source_stoichiometry": stoichiometry, - "target_stoichiometry": 1, # Reactions are typically 1:1 with regulators - "stoichiometry_ratio": 1.0, - }) + stoichiometry = row.get( + "stoichiometry", 1 + ) # Get stoichiometry if available + + pathway_logic_network_data.append( + { + "source_id": row["uuid"], + "target_id": row["reaction_uuid"], + "pos_neg": pos_neg, + "and_or": and_or, + "edge_type": edge_type_override, + "source_stoichiometry": stoichiometry, + "target_stoichiometry": 1, # Reactions are typically 1:1 with regulators + 
"stoichiometry_ratio": 1.0, + } + ) + def _calculate_reaction_statistics(reaction_connections: pd.DataFrame) -> None: """Calculate and print statistics about reactions without preceding events.""" @@ -369,12 +398,14 @@ def _calculate_reaction_statistics(reaction_connections: pd.DataFrame) -> None: reaction_connections["preceding_reaction_id"] ) ] - + num_reactions_without_preceding = len(reactions_without_preceding_events) num_total_reactions = len(reaction_connections) - + if num_total_reactions > 0: - percentage_without_preceding = (num_reactions_without_preceding / num_total_reactions) * 100 + percentage_without_preceding = ( + num_reactions_without_preceding / num_total_reactions + ) * 100 print("Percentage of reactions without preceding events") print(percentage_without_preceding) @@ -382,7 +413,7 @@ def _calculate_reaction_statistics(reaction_connections: pd.DataFrame) -> None: def _print_regulator_statistics( positive_regulator_map: pd.DataFrame, negative_regulator_map: pd.DataFrame, - catalyst_map: pd.DataFrame + catalyst_map: pd.DataFrame, ) -> None: """Print statistics about regulators and catalysts.""" print( @@ -393,56 +424,62 @@ def _print_regulator_statistics( def extract_stoichiometry_for_entity( - decomposed_uid_mapping: pd.DataFrame, - hash_value: str, - entity_reactome_ids: List[str] + decomposed_uid_mapping: pd.DataFrame, + hash_value: str, + entity_reactome_ids: List[str], ) -> Dict[str, int]: """Extract stoichiometry information for entities.""" stoichiometry_map = {} - + for reactome_id in entity_reactome_ids: # Get stoichiometry from decomposed mapping filtered_rows = decomposed_uid_mapping[ - (decomposed_uid_mapping["uid"] == hash_value) & - (decomposed_uid_mapping["input_or_output_reactome_id"] == reactome_id) + (decomposed_uid_mapping["uid"] == hash_value) + & (decomposed_uid_mapping["input_or_output_reactome_id"] == reactome_id) ] - + if not filtered_rows.empty: stoichiometry = filtered_rows["stoichiometry"].iloc[0] - 
stoichiometry_map[reactome_id] = stoichiometry if pd.notna(stoichiometry) else 1 + stoichiometry_map[reactome_id] = ( + stoichiometry if pd.notna(stoichiometry) else 1 + ) else: stoichiometry_map[reactome_id] = 1 - + return stoichiometry_map + def add_pathway_connections_with_stoichiometry( - input_uuids: List[str], - output_uuids: List[str], + input_uuids: List[str], + output_uuids: List[str], input_stoichiometry: Dict[str, int], output_stoichiometry: Dict[str, int], - and_or: str, + and_or: str, edge_type: str, - pathway_logic_network_data: List[Dict[str, Any]] + pathway_logic_network_data: List[Dict[str, Any]], ) -> None: """Add connections with stoichiometry information.""" - + for input_uuid in input_uuids: for output_uuid in output_uuids: # Calculate stoichiometry ratio input_stoich = input_stoichiometry.get(input_uuid, 1) output_stoich = output_stoichiometry.get(output_uuid, 1) stoich_ratio = output_stoich / input_stoich if input_stoich > 0 else 1.0 - - pathway_logic_network_data.append({ - "source_id": input_uuid, - "target_id": output_uuid, - "pos_neg": "pos", - "and_or": and_or, - "edge_type": edge_type, - "source_stoichiometry": input_stoich, - "target_stoichiometry": output_stoich, - "stoichiometry_ratio": stoich_ratio, - }) + + pathway_logic_network_data.append( + { + "source_id": input_uuid, + "target_id": output_uuid, + "pos_neg": "pos", + "and_or": and_or, + "edge_type": edge_type, + "source_stoichiometry": input_stoich, + "target_stoichiometry": output_stoich, + "stoichiometry_ratio": stoich_ratio, + } + ) + def create_pathway_logic_network( decomposed_uid_mapping: pd.DataFrame, @@ -450,7 +487,7 @@ def create_pathway_logic_network( best_matches: Any, ) -> pd.DataFrame: """Create a pathway logic network with stoichiometry support.""" - + # Enhanced column structure columns = { "source_id": pd.Series(dtype="Int64"), @@ -460,40 +497,48 @@ def create_pathway_logic_network( "edge_type": pd.Series(dtype="str"), "source_stoichiometry": 
pd.Series(dtype="Int64"), # NEW "target_stoichiometry": pd.Series(dtype="Int64"), # NEW - "stoichiometry_ratio": pd.Series(dtype="float"), # NEW + "stoichiometry_ratio": pd.Series(dtype="float"), # NEW } pathway_logic_network_data = [] - + # Extract unique reaction IDs reaction_ids = pd.unique( reaction_connections[["preceding_reaction_id", "following_reaction_id"]] .stack() .dropna() ) - + # Calculate and print statistics _calculate_reaction_statistics(reaction_connections) - + # Create mappings and connections - reaction_id_map = create_reaction_id_map(decomposed_uid_mapping, reaction_ids, best_matches) + reaction_id_map = create_reaction_id_map( + decomposed_uid_mapping, reaction_ids, best_matches + ) catalyst_map = get_catalysts_for_reaction(reaction_id_map, graph) - negative_regulator_map = get_negative_regulators_for_reaction(reaction_id_map, graph) - positive_regulator_map = get_positive_regulators_for_reaction(reaction_id_map, graph) - + negative_regulator_map = get_negative_regulators_for_reaction( + reaction_id_map, graph + ) + positive_regulator_map = get_positive_regulators_for_reaction( + reaction_id_map, graph + ) + uid_reaction_connections = create_uid_reaction_connections( reaction_id_map, best_matches, decomposed_uid_mapping ) - + reaction_uids = pd.unique( uid_reaction_connections[["preceding_uid", "following_uid"]].stack().dropna() ) - + # Print regulator statistics - _print_regulator_statistics(positive_regulator_map, negative_regulator_map, catalyst_map) - + _print_regulator_statistics( + positive_regulator_map, negative_regulator_map, catalyst_map + ) + # Process reactions and regulators reactome_id_to_uuid = {} - + for reaction_uid in reaction_uids: extract_inputs_and_outputs( reaction_uid, @@ -504,7 +549,7 @@ def create_pathway_logic_network( reactome_id_to_uuid, pathway_logic_network_data, ) - + and_or = "" edge_type = "" append_regulators( @@ -516,22 +561,25 @@ def create_pathway_logic_network( and_or, edge_type, ) - + # Create final 
DataFrame - pathway_logic_network = pd.DataFrame(pathway_logic_network_data, columns=columns.keys()) - + pathway_logic_network = pd.DataFrame( + pathway_logic_network_data, columns=columns.keys() + ) + # Find root inputs and terminal outputs root_inputs = find_root_inputs(pathway_logic_network) terminal_outputs = find_terminal_outputs(pathway_logic_network) - + print( f"root_inputs: {root_inputs}\n" f"terminal_outputs: {terminal_outputs}\n" f"pathway_logic_network: {pathway_logic_network}" ) - + return pathway_logic_network + def find_root_inputs(pathway_logic_network): root_inputs = pathway_logic_network[ (pathway_logic_network["source_id"].notnull()) diff --git a/src/neo4j_connector.py b/src/neo4j_connector.py index 502afc4..6218645 100755 --- a/src/neo4j_connector.py +++ b/src/neo4j_connector.py @@ -79,9 +79,12 @@ def get_complex_components_with_stoichiometry(entity_id: int) -> List[Dict[str, print(f"Complex {entity_id} components with stoichiometry: {result}") return result except Exception: - logger.error("Error in get_complex_components_with_stoichiometry", exc_info=True) + logger.error( + "Error in get_complex_components_with_stoichiometry", exc_info=True + ) raise + def get_set_members(entity_id: int) -> Set[int]: query_get_members_template: str = """ MATCH (entity)-[:hasCandidate|hasMember]->(member) @@ -113,7 +116,9 @@ def get_reactions(pathway_id: int, taxon_id: str) -> List[int]: raise -def get_reaction_input_output_ids_with_stoichiometry(reaction_id: int, input_or_output: str) -> List[Dict[str, Any]]: +def get_reaction_input_output_ids_with_stoichiometry( + reaction_id: int, input_or_output: str +) -> List[Dict[str, Any]]: query_template: str = """ MATCH (reaction)-[r:%s]->(io) WHERE (reaction:Reaction OR reaction:ReactionLikeEvent) AND reaction.dbId=%s @@ -128,9 +133,12 @@ def get_reaction_input_output_ids_with_stoichiometry(reaction_id: int, input_or_ print(f"Reaction {reaction_id} {input_or_output}s with stoichiometry: {result}") return result 
except Exception: - logger.error("Error in get_reaction_input_output_ids_with_stoichiometry", exc_info=True) + logger.error( + "Error in get_reaction_input_output_ids_with_stoichiometry", exc_info=True + ) raise + def get_reference_entity_id(entity_id: int) -> Union[str, None]: query_template: str = """ MATCH (reference_database:ReferenceDatabase)<-[:referenceDatabase]-(reference_entity_gene:ReferenceEntity)<-[:referenceGene]-(reference_entity:ReferenceEntity)<-[:referenceEntity]-(pe:PhysicalEntity) diff --git a/src/pathway_generator.py b/src/pathway_generator.py index 57322b4..8677679 100755 --- a/src/pathway_generator.py +++ b/src/pathway_generator.py @@ -42,9 +42,7 @@ def generate_pathway_file( [decomposed_uid_mapping, best_matches_list] = get_decomposed_uid_mapping( pathway_id, reaction_connections ) - best_matches = pd.DataFrame( - best_matches_list, columns=["incoming", "outgoing"] - ) + best_matches = pd.DataFrame(best_matches_list, columns=["incoming", "outgoing"]) decomposed_uid_mapping.to_csv(decomposed_uid_mapping_file, index=False) best_matches.to_csv(best_matches_file, index=False) diff --git a/src/reaction_generator.py b/src/reaction_generator.py index 5bfd287..cd75f92 100755 --- a/src/reaction_generator.py +++ b/src/reaction_generator.py @@ -39,7 +39,7 @@ "strategy": "use_canonical", "canonical_representative": "68524", "description": "Ubiquitin isoforms - all serve identical protein degradation signaling function", - "biological_rationale": "Multiple ubiquitin genes produce identical/near-identical proteins" + "biological_rationale": "Multiple ubiquitin genes produce identical/near-identical proteins", }, } @@ -83,13 +83,11 @@ def is_functionally_equivalent_entity(entity_id: str) -> Optional[Dict[str, Any] def apply_functional_equivalence_strategy( - entity_id: str, - functional_group: Dict[str, Any], - original_result: Set[str] + entity_id: str, functional_group: Dict[str, Any], original_result: Set[str] ) -> Set[str]: """Apply functional 
equivalence strategy to entity decomposition.""" strategy = functional_group.get("strategy", "use_canonical") - + if strategy == "use_canonical": canonical_rep = functional_group.get("canonical_representative", entity_id) return {canonical_rep} @@ -98,9 +96,9 @@ def apply_functional_equivalence_strategy( def get_broken_apart_ids( - broken_apart_members: list[set[str]], + broken_apart_members: list[set[str]], reactome_id: ReactomeID, - stoichiometry_map: Dict[str, int] = None # NEW PARAMETER + stoichiometry_map: Dict[str, int] = None, # NEW PARAMETER ) -> Set[UID]: """Get broken apart IDs with stoichiometry support.""" global decomposed_uid_mapping @@ -119,9 +117,11 @@ def get_broken_apart_ids( iterproduct_components = list(itertools.product(*new_broken_apart_members)) total_combinations = len(iterproduct_components) - + if total_combinations > 1000: - logger.warning(f"Large combination set for reaction {reactome_id}: {total_combinations} combinations") + logger.warning( + f"Large combination set for reaction {reactome_id}: {total_combinations} combinations" + ) iterproduct_components_as_sets = [ set(map(str, item)) for item in iterproduct_components @@ -133,7 +133,7 @@ def get_broken_apart_ids( uid = str(uuid.uuid4()) rows: List[DataFrameRow] = [] row: DataFrameRow - + for member in broken_apart_members: if is_valid_uuid(member): component_ids = decomposed_uid_mapping.loc[ @@ -142,7 +142,7 @@ def get_broken_apart_ids( for component_id in component_ids: # NEW: Add stoichiometry information stoichiometry = stoichiometry_map.get(component_id, 1) - + row = { "uid": uid, "component_id": component_id, @@ -160,7 +160,7 @@ def get_broken_apart_ids( else: # NEW: Add stoichiometry information stoichiometry = stoichiometry_map.get(member, 1) - + row = { "uid": uid, "component_id": member, @@ -183,9 +183,9 @@ def get_broken_apart_ids( def get_uids_for_iterproduct_components( - iterproduct_components: List[Set[ComponentID]], + iterproduct_components: List[Set[ComponentID]], 
reactome_id: ReactomeID, - stoichiometry_map: Dict[str, int] = None # NEW PARAMETER + stoichiometry_map: Dict[str, int] = None, # NEW PARAMETER ) -> Set[UID]: """Get UID for iterproduct components with stoichiometry support.""" global decomposed_uid_mapping @@ -197,7 +197,7 @@ def get_uids_for_iterproduct_components( for component in iterproduct_components: component_to_input_or_output: Dict[ComponentID, InputOutputID] = {} component_stoichiometries: Dict[ComponentID, int] = {} # NEW - + for item in component: if is_valid_uuid(item): selected_rows = decomposed_uid_mapping.loc[ @@ -207,15 +207,19 @@ def get_uids_for_iterproduct_components( component_id = selected_row["component_id"] component_to_input_or_output[component_id] = item # NEW: Get stoichiometry from existing data - component_stoichiometries[component_id] = selected_row.get("stoichiometry", 1) + component_stoichiometries[component_id] = selected_row.get( + "stoichiometry", 1 + ) else: component_to_input_or_output[item] = item # NEW: Get stoichiometry from the map component_stoichiometries[item] = stoichiometry_map.get(item, 1) # MODIFIED: Include stoichiometry in UID calculation - uid_data = [(comp, io, component_stoichiometries.get(comp, 1)) - for comp, io in sorted(component_to_input_or_output.items())] + uid_data = [ + (comp, io, component_stoichiometries.get(comp, 1)) + for comp, io in sorted(component_to_input_or_output.items()) + ] uid = hashlib.sha256(str(uid_data).encode()).hexdigest() rows: List[DataFrameRow] = [] @@ -226,10 +230,10 @@ def get_uids_for_iterproduct_components( input_or_output_reactome_id = ( input_or_output_id if not is_valid_uuid(input_or_output_id) else None ) - + # NEW: Get stoichiometry information stoichiometry = component_stoichiometries.get(component_id, 1) - + row: DataFrameRow = { "uid": uid, "component_id": component_id, @@ -261,7 +265,9 @@ def break_apart_entity(entity_id: int) -> Set[str]: functional_group = is_functionally_equivalent_entity(entity_str) if 
functional_group: original_result = break_apart_entity_standard(entity_id) - return apply_functional_equivalence_strategy(entity_str, functional_group, original_result) + return apply_functional_equivalence_strategy( + entity_str, functional_group, original_result + ) # Use original decomposition logic return break_apart_entity_standard(entity_id) @@ -292,7 +298,7 @@ def break_apart_entity_standard(entity_id: int) -> Set[str]: contains_thing = contains_reference_gene_product_molecule_or_isoform(entity_id) if contains_thing: return set([str(entity_id)]) - + member_ids = get_set_members(entity_id) member_list: List[str] = [] @@ -307,17 +313,17 @@ def break_apart_entity_standard(entity_id: int) -> Set[str]: elif "Complex" in labels: broken_apart_members: List[Set[str]] = [] - + # MODIFIED: Get components with stoichiometry component_data = get_complex_components_with_stoichiometry(entity_id) print(f"Complex {entity_id} component data: {component_data}") - + component_stoichiometry = {} member_ids = [] - + for component_info in component_data: - component_id = component_info['component_id'] - stoichiometry = component_info['stoichiometry'] + component_id = component_info["component_id"] + stoichiometry = component_info["stoichiometry"] component_stoichiometry[str(component_id)] = stoichiometry member_ids.append(component_id) @@ -357,28 +363,36 @@ def decompose_by_reactions(reaction_ids: List[int]) -> List[Any]: all_best_matches = [] for reaction_id in reaction_ids: print(f"Processing reaction {reaction_id}") - + # MODIFIED: Get inputs with stoichiometry - input_data = get_reaction_input_output_ids_with_stoichiometry(reaction_id, "input") - input_ids = [item['entity_id'] for item in input_data] - input_stoichiometry = {str(item['entity_id']): item['stoichiometry'] for item in input_data} - + input_data = get_reaction_input_output_ids_with_stoichiometry( + reaction_id, "input" + ) + input_ids = [item["entity_id"] for item in input_data] + input_stoichiometry = { + 
str(item["entity_id"]): item["stoichiometry"] for item in input_data + } + print(f"Reaction {reaction_id} inputs: {input_ids}") print(f"Input stoichiometry: {input_stoichiometry}") - + broken_apart_input_id = [break_apart_entity(input_id) for input_id in input_ids] input_combinations = get_broken_apart_ids( broken_apart_input_id, str(reaction_id), input_stoichiometry ) # MODIFIED: Get outputs with stoichiometry - output_data = get_reaction_input_output_ids_with_stoichiometry(reaction_id, "output") - output_ids = [item['entity_id'] for item in output_data] - output_stoichiometry = {str(item['entity_id']): item['stoichiometry'] for item in output_data} - + output_data = get_reaction_input_output_ids_with_stoichiometry( + reaction_id, "output" + ) + output_ids = [item["entity_id"] for item in output_data] + output_stoichiometry = { + str(item["entity_id"]): item["stoichiometry"] for item in output_data + } + print(f"Reaction {reaction_id} outputs: {output_ids}") print(f"Output stoichiometry: {output_stoichiometry}") - + broken_apart_output_id = [ break_apart_entity(output_id) for output_id in output_ids ] @@ -414,14 +428,14 @@ def get_decomposed_uid_mapping( reaction_ids = reaction_ids[~pd.isna(reaction_ids)] # removing NA value from list reaction_ids = reaction_ids.astype(int).tolist() # converting to integer - + print(f"Processing {len(reaction_ids)} reactions for pathway {pathway_id}") - + best_matches = decompose_by_reactions(list(reaction_ids)) - + print(f"Final decomposed_uid_mapping shape: {decomposed_uid_mapping.shape}") print(f"Stoichiometry column summary:") - if 'stoichiometry' in decomposed_uid_mapping.columns: - print(decomposed_uid_mapping['stoichiometry'].value_counts()) + if "stoichiometry" in decomposed_uid_mapping.columns: + print(decomposed_uid_mapping["stoichiometry"].value_counts()) return (decomposed_uid_mapping, best_matches) From 53632b253c4341ccb573ec2b37c024232ae5c7f6 Mon Sep 17 00:00:00 2001 From: Thomas Barbazuk Date: Mon, 4 Aug 2025 
18:52:35 +0200
Subject: [PATCH 4/4] added in stoichiometry. Runs well, more changes
 forthcoming

---
 src/reaction_generator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/reaction_generator.py b/src/reaction_generator.py
index cd75f92..b3c428e 100755
--- a/src/reaction_generator.py
+++ b/src/reaction_generator.py
@@ -187,7 +187,7 @@ def get_uids_for_iterproduct_components(
     reaction_id: ReactomeID,
     stoichiometry_map: Dict[str, int] = None,  # NEW PARAMETER
 ) -> Set[UID]:
-    """Get UID for iterproduct components with stoichiometry support."""
+    """Get UID for iterproduct components."""
     global decomposed_uid_mapping
 
     if stoichiometry_map is None:
@@ -434,7 +434,7 @@ def get_decomposed_uid_mapping(
 
     best_matches = decompose_by_reactions(list(reaction_ids))
 
     print(f"Final decomposed_uid_mapping shape: {decomposed_uid_mapping.shape}")
-    print(f"Stoichiometry column summary:")
+    print("Stoichiometry column summary:")
     if "stoichiometry" in decomposed_uid_mapping.columns:
         print(decomposed_uid_mapping["stoichiometry"].value_counts())