diff --git a/src/decomposed_uid_mapping.py b/src/decomposed_uid_mapping.py index 384f0e5..85fd5b0 100644 --- a/src/decomposed_uid_mapping.py +++ b/src/decomposed_uid_mapping.py @@ -7,4 +7,7 @@ "component_id_or_reference_entity_id": pd.Int64Dtype(), "input_or_output_uid": str, "input_or_output_reactome_id": pd.Int64Dtype(), + "stoichiometry": pd.Int64Dtype(), + "entity_stoichiometry": pd.Int64Dtype(), # For original entity stoichiometry + "component_stoichiometry": pd.Int64Dtype(), # For complex component stoichiometry } diff --git a/src/logic_network_generator.py b/src/logic_network_generator.py index 3ae850c..b767ef6 100755 --- a/src/logic_network_generator.py +++ b/src/logic_network_generator.py @@ -10,7 +10,9 @@ graph: Graph = Graph(uri, auth=("neo4j", "test")) -def _get_reactome_id_from_hash(decomposed_uid_mapping: pd.DataFrame, hash_value: str) -> int: +def get_reactome_id_from_hash( + decomposed_uid_mapping: pd.DataFrame, hash_value: str +) -> int: """Extract reactome_id for a given hash from decomposed_uid_mapping.""" return decomposed_uid_mapping.loc[ decomposed_uid_mapping["uid"] == hash_value, "reactome_id" @@ -18,161 +20,175 @@ def _get_reactome_id_from_hash(decomposed_uid_mapping: pd.DataFrame, hash_value: def create_reaction_id_map( - decomposed_uid_mapping: pd.DataFrame, - reaction_ids: List[int], - best_matches: pd.DataFrame + decomposed_uid_mapping: pd.DataFrame, + reaction_ids: List[int], + best_matches: pd.DataFrame, ) -> pd.DataFrame: """Create a mapping between reaction UIDs, reactome IDs, and input/output hashes.""" - + reaction_id_map_column_types = { "uid": str, "reactome_id": pd.Int64Dtype(), "input_hash": str, "output_hash": str, } - + print("Checking best_matches contents:") - + rows = [] for _, match in best_matches.iterrows(): - incomming_hash = match["incomming"] + incoming_hash = match["incoming"] outgoing_hash = match["outgoing"] - reactome_id = _get_reactome_id_from_hash(decomposed_uid_mapping, incomming_hash) - + reactome_id = get_reactome_id_from_hash(decomposed_uid_mapping, incoming_hash) + row = { "uid": str(uuid.uuid4()), "reactome_id": int(reactome_id), - "input_hash": incomming_hash, + "input_hash": incoming_hash, "output_hash": outgoing_hash, } print("row") print(row) rows.append(row) - + reaction_id_map = pd.DataFrame(rows).astype(reaction_id_map_column_types) - + return reaction_id_map def create_uid_reaction_connections( - reaction_id_map: pd.DataFrame, - best_matches: pd.DataFrame, - decomposed_uid_mapping: pd.DataFrame + reaction_id_map: pd.DataFrame, + best_matches: pd.DataFrame, + decomposed_uid_mapping: pd.DataFrame, ) -> pd.DataFrame: """Create connections between reaction UIDs based on best matches.""" - + reactome_id_to_uid_mapping = dict( zip(reaction_id_map["reactome_id"], reaction_id_map["uid"]) ) - + uid_reaction_connections_data = [] - + for _, match in best_matches.iterrows(): - incomming_hash = match["incomming"] + incoming_hash = match["incoming"] outgoing_hash = match["outgoing"] - + # Get reactome IDs for both hashes - preceding_reaction_id = _get_reactome_id_from_hash(decomposed_uid_mapping, incomming_hash) - following_reaction_id = _get_reactome_id_from_hash(decomposed_uid_mapping, outgoing_hash) - + preceding_reaction_id = get_reactome_id_from_hash( + decomposed_uid_mapping, incoming_hash + ) + following_reaction_id = get_reactome_id_from_hash( + decomposed_uid_mapping, outgoing_hash + ) + # Get corresponding UIDs preceding_uid = reactome_id_to_uid_mapping.get(preceding_reaction_id) following_uid = reactome_id_to_uid_mapping.get(following_reaction_id) - + # Only add connection if both UIDs exist if preceding_uid is not None and following_uid is not None: - uid_reaction_connections_data.append({ - "preceding_uid": preceding_uid, - "following_uid": following_uid - }) - - return pd.DataFrame(uid_reaction_connections_data) + uid_reaction_connections_data.append( + {"preceding_uid": preceding_uid, "following_uid": following_uid} + ) + + # Fix: Always create DataFrame with proper columns, even if empty + if uid_reaction_connections_data: + return pd.DataFrame(uid_reaction_connections_data) + else: + # Return empty DataFrame with correct column structure + return pd.DataFrame(columns=["preceding_uid", "following_uid"]) def _execute_regulator_query( - graph: Graph, - query: str, - reaction_uuid: str, - function_name: str + graph: Graph, query: str, reaction_uuid: str, function_name: str ) -> List[Dict[str, Any]]: """Execute a regulator query and return processed results.""" try: result = graph.run(query) regulators = [] - + for record in result: regulator_uuid = str(uuid.uuid4()) - regulators.append({ - "reaction": reaction_uuid, - "PhysicalEntity": regulator_uuid, - "edge_type": "regulator", - "uuid": regulator_uuid, - "reaction_uuid": reaction_uuid, - }) - + regulators.append( + { + "reaction": reaction_uuid, + "PhysicalEntity": regulator_uuid, + "edge_type": "regulator", + "uuid": regulator_uuid, + "reaction_uuid": reaction_uuid, + } + ) + return regulators - + except Exception as e: logger.error(f"Error in {function_name}", exc_info=True) raise e def get_catalysts_for_reaction(reaction_id_map: DataFrame, graph: Graph) -> DataFrame: - """Get catalysts for reactions using Neo4j graph queries.""" + """Get catalysts for reactions with stoichiometry using Neo4j graph queries.""" catalyst_list = [] - + for _, row in reaction_id_map.iterrows(): reaction_id = row["reactome_id"] reaction_uuid = row["uid"] - + query = ( - f"MATCH (reaction:ReactionLikeEvent{{dbId: {reaction_id}}})-[:catalystActivity]->(catalystActivity:CatalystActivity)-[:physicalEntity]->(catalyst:PhysicalEntity) " - f"RETURN reaction.dbId AS reaction_id, catalyst.dbId AS catalyst_id, 'catalyst' AS edge_type" + f"MATCH (reaction:ReactionLikeEvent{{dbId: {reaction_id}}})-[r:catalystActivity]->(catalystActivity:CatalystActivity)-[:physicalEntity]->(catalyst:PhysicalEntity) " + f"RETURN reaction.dbId AS reaction_id, catalyst.dbId AS catalyst_id, " + f"COALESCE(r.stoichiometry, 1) AS stoichiometry, 'catalyst' AS edge_type" ) - + try: data = graph.run(query).data() for item in data: item["uuid"] = str(uuid.uuid4()) item["reaction_uuid"] = reaction_uuid catalyst_list.extend(data) - + except Exception as e: logger.error("Error in get_catalysts_for_reaction", exc_info=True) raise e - + return pd.DataFrame( catalyst_list, - columns=["reaction_id", "catalyst_id", "edge_type", "uuid", "reaction_uuid"], + columns=[ + "reaction_id", + "catalyst_id", + "stoichiometry", + "edge_type", + "uuid", + "reaction_uuid", + ], ) def get_positive_regulators_for_reaction( - reaction_id_mapping: DataFrame, - graph: Graph + reaction_id_mapping: DataFrame, graph: Graph ) -> DataFrame: """Get positive regulators for reactions using Neo4j graph queries.""" regulators_list = [] - + for _, row in reaction_id_mapping.iterrows(): reaction_id = row["reactome_id"] reaction_uuid = row["uid"] - + if pd.isna(reaction_uuid): logger.error(f"No UUID found for reaction ID {reaction_id}") continue - + query = ( f"MATCH (reaction)-[:regulatedBy]->(regulator:PositiveRegulation)-[:regulator]->(pe:PhysicalEntity) " f"WHERE reaction.dbId = {reaction_id} " "RETURN reaction.dbId as reaction, pe.dbId as PhysicalEntity" ) - + regulators = _execute_regulator_query( graph, query, reaction_uuid, "get_positive_regulators_for_reaction" ) regulators_list.extend(regulators) - + return pd.DataFrame( regulators_list, columns=["reaction", "PhysicalEntity", "edge_type", "uuid", "reaction_uuid"], @@ -181,31 +197,30 @@ def get_positive_regulators_for_reaction( def get_negative_regulators_for_reaction( - reaction_id_mapping: DataFrame, - graph: Graph + reaction_id_mapping: DataFrame, graph: Graph ) -> DataFrame: """Get negative regulators for reactions using Neo4j graph queries.""" regulators_list = [] - + for _, row in reaction_id_mapping.iterrows(): reaction_id = row["reactome_id"] reaction_uuid = row["uid"] - + if pd.isna(reaction_uuid): logger.error(f"No UUID found for reaction ID {reaction_id}") continue - + query = ( f"MATCH (reaction)-[:regulatedBy]->(regulator:NegativeRegulation)-[:regulator]->(pe:PhysicalEntity) " f"WHERE reaction.dbId = {reaction_id} " "RETURN reaction.dbId as reaction, pe.dbId as PhysicalEntity" ) - + regulators = _execute_regulator_query( graph, query, reaction_uuid, "get_negative_regulators_for_reaction" ) regulators_list.extend(regulators) - + return pd.DataFrame( regulators_list, columns=["reaction", "PhysicalEntity", "edge_type", "uuid", "reaction_uuid"], @@ -213,29 +228,35 @@ def get_negative_regulators_for_reaction( ) -def _get_non_null_values(df: pd.DataFrame, column: str) -> List[Any]: +def get_non_null_values(df: pd.DataFrame, column: str) -> List[Any]: """Extract non-null values from a DataFrame column.""" return [value for value in df[column].tolist() if pd.notna(value)] -def _get_hash_for_reaction(reaction_id_map: pd.DataFrame, uid: str, hash_type: str) -> str: +def get_hash_for_reaction( + reaction_id_map: pd.DataFrame, uid: str, hash_type: str +) -> str: """Get input_hash or output_hash for a given reaction UID.""" - return reaction_id_map.loc[ - reaction_id_map["uid"] == uid, hash_type - ].iloc[0] + return reaction_id_map.loc[reaction_id_map["uid"] == uid, hash_type].iloc[0] -def _extract_uid_and_reactome_values(decomposed_uid_mapping: pd.DataFrame, hash_value: str) -> tuple: +def extract_uid_and_reactome_values( + decomposed_uid_mapping: pd.DataFrame, hash_value: str +) -> tuple: """Extract UID and Reactome ID values for a given hash.""" filtered_rows = decomposed_uid_mapping[decomposed_uid_mapping["uid"] == hash_value] - - uid_values = _get_non_null_values(filtered_rows, "input_or_output_uid") - reactome_id_values = _get_non_null_values(filtered_rows, "input_or_output_reactome_id") - + + uid_values = get_non_null_values(filtered_rows, "input_or_output_uid") + reactome_id_values = get_non_null_values( + filtered_rows, "input_or_output_reactome_id" + ) + return uid_values, reactome_id_values -def _assign_uuids(reactome_ids: List[str], reactome_id_to_uuid: Dict[str, str]) -> List[str]: +def assign_uuids( + reactome_ids: List[str], reactome_id_to_uuid: Dict[str, str] +) -> List[str]: """Assign UUIDs to Reactome IDs, creating new ones if they don't exist.""" return [ reactome_id_to_uuid.setdefault(reactome_id, str(uuid.uuid4())) @@ -243,7 +264,7 @@ def _assign_uuids(reactome_ids: List[str], reactome_id_to_uuid: Dict[str, str]) ] -def _determine_edge_properties(input_uid_values: List[Any]) -> tuple: +def determine_edge_properties(input_uid_values: List[Any]) -> tuple: """Determine and_or and edge_type based on input UID values.""" if input_uid_values: return "and", "input" @@ -251,23 +272,25 @@ def _determine_edge_properties(input_uid_values: List[Any]) -> tuple: return "or", "output" -def _add_pathway_connections( - input_uuids: List[str], - output_uuids: List[str], - and_or: str, +def add_pathway_connections( + input_uuids: List[str], + output_uuids: List[str], + and_or: str, edge_type: str, - pathway_logic_network_data: List[Dict[str, Any]] + pathway_logic_network_data: List[Dict[str, Any]], ) -> None: """Add all input-output connections to the pathway network data.""" for input_uuid in input_uuids: for output_uuid in output_uuids: - pathway_logic_network_data.append({ - "source_id": input_uuid, - "target_id": output_uuid, - "pos_neg": "pos", - "and_or": and_or, - "edge_type": edge_type, - }) + pathway_logic_network_data.append( + { + "source_id": input_uuid, + "target_id": output_uuid, + "pos_neg": "pos", + "and_or": and_or, + "edge_type": edge_type, + } + ) def extract_inputs_and_outputs( @@ -279,37 +302,55 @@ def extract_inputs_and_outputs( reactome_id_to_uuid: Dict[str, str], pathway_logic_network_data: List[Dict[str, Any]], ) -> None: - """Extract inputs and outputs for reactions and add them to the pathway network.""" - + """Extract inputs and outputs with stoichiometry support.""" + for reaction_uid in reaction_uids: - # Extract input information - input_hash = _get_hash_for_reaction(reaction_id_map, reaction_uid, "input_hash") - input_uid_values, input_reactome_id_values = _extract_uid_and_reactome_values( + # Extract input information WITH stoichiometry + input_hash = get_hash_for_reaction(reaction_id_map, reaction_uid, "input_hash") + input_uid_values, input_reactome_id_values = extract_uid_and_reactome_values( decomposed_uid_mapping, input_hash ) - + + # NEW: Extract stoichiometry for inputs + input_stoichiometry = extract_stoichiometry_for_entity( + decomposed_uid_mapping, input_hash, input_reactome_id_values + ) + # Process preceding reactions (outputs) preceding_uids = uid_reaction_connections[ uid_reaction_connections["following_uid"] == reaction_uid ]["preceding_uid"].tolist() - + for preceding_uid in preceding_uids: - # Extract output information - output_hash = _get_hash_for_reaction(reaction_id_map, preceding_uid, "output_hash") - output_uid_values, output_reactome_id_values = _extract_uid_and_reactome_values( - decomposed_uid_mapping, output_hash + # Extract output information WITH stoichiometry + output_hash = get_hash_for_reaction( + reaction_id_map, preceding_uid, "output_hash" + ) + output_uid_values, output_reactome_id_values = ( + extract_uid_and_reactome_values(decomposed_uid_mapping, output_hash) ) - + + # NEW: Extract stoichiometry for outputs + output_stoichiometry = extract_stoichiometry_for_entity( + decomposed_uid_mapping, output_hash, output_reactome_id_values + ) + # Assign UUIDs - input_uuids = _assign_uuids(input_reactome_id_values, reactome_id_to_uuid) - output_uuids = _assign_uuids(output_reactome_id_values, reactome_id_to_uuid) - + input_uuids = assign_uuids(input_reactome_id_values, reactome_id_to_uuid) + output_uuids = assign_uuids(output_reactome_id_values, reactome_id_to_uuid) + # Determine edge properties - and_or, edge_type = _determine_edge_properties(input_uid_values) - - # Add connections to pathway network - _add_pathway_connections( - input_uuids, output_uuids, and_or, edge_type, pathway_logic_network_data + and_or, edge_type = determine_edge_properties(input_uid_values) + + # Add connections with stoichiometry + add_pathway_connections_with_stoichiometry( + input_uuids, + output_uuids, + input_stoichiometry, + output_stoichiometry, + and_or, + edge_type, + pathway_logic_network_data, ) @@ -322,23 +363,32 @@ def append_regulators( and_or: str, edge_type: str, ) -> None: - """Append regulatory relationships to the pathway network.""" - + """Append regulatory relationships with stoichiometry.""" + regulator_configs = [ (catalyst_map, "pos", "catalyst"), (negative_regulator_map, "neg", "regulator"), (positive_regulator_map, "pos", "regulator"), ] - + for map_df, pos_neg, edge_type_override in regulator_configs: for _, row in map_df.iterrows(): - pathway_logic_network_data.append({ - "source_id": row["uuid"], - "target_id": row["reaction_uuid"], - "pos_neg": pos_neg, - "and_or": and_or, - "edge_type": edge_type_override, - }) + stoichiometry = row.get( + "stoichiometry", 1 + ) # Get stoichiometry if available + + pathway_logic_network_data.append( + { + "source_id": row["uuid"], + "target_id": row["reaction_uuid"], + "pos_neg": pos_neg, + "and_or": and_or, + "edge_type": edge_type_override, + "source_stoichiometry": stoichiometry, + "target_stoichiometry": 1, # Reactions are typically 1:1 with regulators + "stoichiometry_ratio": 1.0, + } + ) def _calculate_reaction_statistics(reaction_connections: pd.DataFrame) -> None: @@ -348,12 +398,14 @@ def _calculate_reaction_statistics(reaction_connections: pd.DataFrame) -> None: reaction_connections["preceding_reaction_id"] ) ] - + num_reactions_without_preceding = len(reactions_without_preceding_events) num_total_reactions = len(reaction_connections) - + if num_total_reactions > 0: - percentage_without_preceding = (num_reactions_without_preceding / num_total_reactions) * 100 + percentage_without_preceding = ( + num_reactions_without_preceding / num_total_reactions + ) * 100 print("Percentage of reactions without preceding events") print(percentage_without_preceding) @@ -361,7 +413,7 @@ def _calculate_reaction_statistics(reaction_connections: pd.DataFrame) -> None: def _print_regulator_statistics( positive_regulator_map: pd.DataFrame, negative_regulator_map: pd.DataFrame, - catalyst_map: pd.DataFrame + catalyst_map: pd.DataFrame, ) -> None: """Print statistics about regulators and catalysts.""" print( @@ -371,54 +423,122 @@ def _print_regulator_statistics( ) +def extract_stoichiometry_for_entity( + decomposed_uid_mapping: pd.DataFrame, + hash_value: str, + entity_reactome_ids: List[str], +) -> Dict[str, int]: + """Extract stoichiometry information for entities.""" + stoichiometry_map = {} + + for reactome_id in entity_reactome_ids: + # Get stoichiometry from decomposed mapping + filtered_rows = decomposed_uid_mapping[ + (decomposed_uid_mapping["uid"] == hash_value) + & (decomposed_uid_mapping["input_or_output_reactome_id"] == reactome_id) + ] + + if not filtered_rows.empty: + stoichiometry = filtered_rows["stoichiometry"].iloc[0] + stoichiometry_map[reactome_id] = ( + stoichiometry if pd.notna(stoichiometry) else 1 + ) + else: + stoichiometry_map[reactome_id] = 1 + + return stoichiometry_map + + +def add_pathway_connections_with_stoichiometry( + input_uuids: List[str], + output_uuids: List[str], + input_stoichiometry: Dict[str, int], + output_stoichiometry: Dict[str, int], + and_or: str, + edge_type: str, + pathway_logic_network_data: List[Dict[str, Any]], +) -> None: + """Add connections with stoichiometry information.""" + + for input_uuid in input_uuids: + for output_uuid in output_uuids: + # Calculate stoichiometry ratio + input_stoich = input_stoichiometry.get(input_uuid, 1) + output_stoich = output_stoichiometry.get(output_uuid, 1) + stoich_ratio = output_stoich / input_stoich if input_stoich > 0 else 1.0 + + pathway_logic_network_data.append( + { + "source_id": input_uuid, + "target_id": output_uuid, + "pos_neg": "pos", + "and_or": and_or, + "edge_type": edge_type, + "source_stoichiometry": input_stoich, + "target_stoichiometry": output_stoich, + "stoichiometry_ratio": stoich_ratio, + } + ) + + def create_pathway_logic_network( decomposed_uid_mapping: pd.DataFrame, reaction_connections: pd.DataFrame, best_matches: Any, ) -> pd.DataFrame: - """Create a pathway logic network from decomposed UID mappings and reaction connections.""" - logger.debug("Adding reaction pairs to pathway_logic_network") + """Create a pathway logic network with stoichiometry support.""" - # Initialize data structures + # Enhanced column structure columns = { "source_id": pd.Series(dtype="Int64"), "target_id": pd.Series(dtype="Int64"), "pos_neg": pd.Series(dtype="str"), "and_or": pd.Series(dtype="str"), "edge_type": pd.Series(dtype="str"), + "source_stoichiometry": pd.Series(dtype="Int64"), # NEW + "target_stoichiometry": pd.Series(dtype="Int64"), # NEW + "stoichiometry_ratio": pd.Series(dtype="float"), # NEW } pathway_logic_network_data = [] - + # Extract unique reaction IDs reaction_ids = pd.unique( reaction_connections[["preceding_reaction_id", "following_reaction_id"]] .stack() .dropna() ) - + # Calculate and print statistics _calculate_reaction_statistics(reaction_connections) - + # Create mappings and connections - reaction_id_map = create_reaction_id_map(decomposed_uid_mapping, reaction_ids, best_matches) + reaction_id_map = create_reaction_id_map( + decomposed_uid_mapping, reaction_ids, best_matches + ) catalyst_map = get_catalysts_for_reaction(reaction_id_map, graph) - negative_regulator_map = get_negative_regulators_for_reaction(reaction_id_map, graph) - positive_regulator_map = get_positive_regulators_for_reaction(reaction_id_map, graph) - + negative_regulator_map = get_negative_regulators_for_reaction( + reaction_id_map, graph + ) + positive_regulator_map = get_positive_regulators_for_reaction( + reaction_id_map, graph + ) + uid_reaction_connections = create_uid_reaction_connections( reaction_id_map, best_matches, decomposed_uid_mapping ) - + reaction_uids = pd.unique( uid_reaction_connections[["preceding_uid", "following_uid"]].stack().dropna() ) - + # Print regulator statistics - _print_regulator_statistics(positive_regulator_map, negative_regulator_map, catalyst_map) - + _print_regulator_statistics( + positive_regulator_map, negative_regulator_map, catalyst_map + ) + # Process reactions and regulators reactome_id_to_uuid = {} - + for reaction_uid in reaction_uids: extract_inputs_and_outputs( reaction_uid, @@ -429,7 +549,7 @@ def create_pathway_logic_network( reactome_id_to_uuid, pathway_logic_network_data, ) - + and_or = "" edge_type = "" append_regulators( @@ -441,22 +561,25 @@ def create_pathway_logic_network( and_or, edge_type, ) - + # Create final DataFrame - pathway_logic_network = pd.DataFrame(pathway_logic_network_data, columns=columns.keys()) - + pathway_logic_network = pd.DataFrame( + pathway_logic_network_data, columns=columns.keys() + ) + # Find root inputs and terminal outputs root_inputs = find_root_inputs(pathway_logic_network) terminal_outputs = find_terminal_outputs(pathway_logic_network) - + print( f"root_inputs: {root_inputs}\n" f"terminal_outputs: {terminal_outputs}\n" f"pathway_logic_network: {pathway_logic_network}" ) - + return pathway_logic_network + def find_root_inputs(pathway_logic_network): root_inputs = pathway_logic_network[ (pathway_logic_network["source_id"].notnull()) diff --git a/src/neo4j_connector.py b/src/neo4j_connector.py index 66bf4fb..6218645 100755 --- a/src/neo4j_connector.py +++ b/src/neo4j_connector.py @@ -65,18 +65,23 @@ def get_labels(entity_id: int) -> List[str]: raise -def get_complex_components(entity_id: int) -> Set[int]: +def get_complex_components_with_stoichiometry(entity_id: int) -> List[Dict[str, Any]]: query_get_components_template: str = """ - MATCH (entity)-[:hasComponent]->(component) + MATCH (entity)-[r:hasComponent]->(component) WHERE entity.dbId = %s - RETURN collect(component.dbId) AS component_ids + RETURN component.dbId AS component_id, + COALESCE(r.stoichiometry, 1) AS stoichiometry """ query: str = query_get_components_template % entity_id try: - return set(graph.run(query).data()[0]["component_ids"]) + result = graph.run(query).data() + print(f"Complex {entity_id} components with stoichiometry: {result}") + return result except Exception: - logger.error("Error in get_complex_components", exc_info=True) + logger.error( + "Error in get_complex_components_with_stoichiometry", exc_info=True + ) raise @@ -111,19 +116,26 @@ def get_reactions(pathway_id: int, taxon_id: str) -> List[int]: raise -def get_reaction_input_output_ids(reaction_id: int, input_or_output: str) -> Set[int]: +def get_reaction_input_output_ids_with_stoichiometry( + reaction_id: int, input_or_output: str +) -> List[Dict[str, Any]]: query_template: str = """ - MATCH (reaction)-[:%s]-(io) + MATCH (reaction)-[r:%s]->(io) WHERE (reaction:Reaction OR reaction:ReactionLikeEvent) AND reaction.dbId=%s - RETURN COLLECT(io.dbId) AS io_ids + RETURN io.dbId AS entity_id, + COALESCE(r.stoichiometry, 1) AS stoichiometry """ relation_type: str = "input" if input_or_output == "input" else "output" query: str = query_template % (relation_type, reaction_id) try: - return set(graph.run(query).data()[0]["io_ids"]) + result = graph.run(query).data() + print(f"Reaction {reaction_id} {input_or_output}s with stoichiometry: {result}") + return result except Exception: - logger.error("Error in get_reaction_input_output_ids", exc_info=True) + logger.error( + "Error in get_reaction_input_output_ids_with_stoichiometry", exc_info=True + ) raise @@ -164,3 +176,21 @@ def contains_reference_gene_product_molecule_or_isoform(entity_id: int) -> bool: exc_info=True, ) raise e + + +def get_catalyst_stoichiometry(reaction_id: int) -> List[Dict[str, Any]]: + query_template: str = """ + MATCH (reaction:ReactionLikeEvent)-[r:catalystActivity]->(ca:CatalystActivity)-[:physicalEntity]->(catalyst:PhysicalEntity) + WHERE reaction.dbId = %s + RETURN catalyst.dbId AS catalyst_id, + COALESCE(r.stoichiometry, 1) AS stoichiometry + """ + query: str = query_template % reaction_id + + try: + result = graph.run(query).data() + print(f"Reaction {reaction_id} catalysts with stoichiometry: {result}") + return result + except Exception: + logger.error("Error in get_catalyst_stoichiometry", exc_info=True) + raise diff --git a/src/pathway_generator.py b/src/pathway_generator.py index 53440e0..8677679 100755 --- a/src/pathway_generator.py +++ b/src/pathway_generator.py @@ -42,9 +42,7 @@ def generate_pathway_file( [decomposed_uid_mapping, best_matches_list] = get_decomposed_uid_mapping( pathway_id, reaction_connections ) - best_matches = pd.DataFrame( - best_matches_list, columns=["incomming", "outgoing"] - ) + best_matches = pd.DataFrame(best_matches_list, columns=["incoming", "outgoing"]) decomposed_uid_mapping.to_csv(decomposed_uid_mapping_file, index=False) best_matches.to_csv(best_matches_file, index=False) diff --git a/src/reaction_generator.py b/src/reaction_generator.py index ba5fc79..b3c428e 100755 --- a/src/reaction_generator.py +++ b/src/reaction_generator.py @@ -2,7 +2,7 @@ import itertools import uuid import warnings -from typing import Any, Dict, List, Set, Tuple +from typing import Any, Dict, List, Set, Tuple, Optional import pandas as pd @@ -11,9 +11,9 @@ from src.decomposed_uid_mapping import decomposed_uid_mapping_column_types from src.neo4j_connector import ( contains_reference_gene_product_molecule_or_isoform, - get_complex_components, + get_complex_components_with_stoichiometry, # MODIFIED IMPORT get_labels, - get_reaction_input_output_ids, + get_reaction_input_output_ids_with_stoichiometry, # MODIFIED IMPORT get_reference_entity_id, get_set_members, ) @@ -31,18 +31,34 @@ ReactomeID = str DataFrameRow = Dict[str, Any] +# Enhanced functional equivalence groups configuration +FUNCTIONAL_EQUIVALENT_GROUPS = { + # Ubiquitin - classic case of functionally equivalent isoforms + "68524": { + "group_name": "ubiquitin_isoforms", + "strategy": "use_canonical", + "canonical_representative": "68524", + "description": "Ubiquitin isoforms - all serve identical protein degradation signaling function", + "biological_rationale": "Multiple ubiquitin genes produce identical/near-identical proteins", + }, +} + +# MODIFIED: Add stoichiometry columns to the schema +enhanced_decomposed_uid_mapping_column_types = { + **decomposed_uid_mapping_column_types, + "stoichiometry": pd.Int64Dtype(), + "entity_stoichiometry": pd.Int64Dtype(), + "component_stoichiometry": pd.Int64Dtype(), +} + decomposed_uid_mapping = pd.DataFrame( - columns=decomposed_uid_mapping_column_types.keys() -).astype( # type: ignore - decomposed_uid_mapping_column_types -) + columns=enhanced_decomposed_uid_mapping_column_types.keys() +).astype(enhanced_decomposed_uid_mapping_column_types) reference_entity_dict: Dict[str, str] = {} def get_component_id_or_reference_entity_id(reactome_id): - global reference_entity_dict - if reactome_id in reference_entity_dict: component_id = reference_entity_dict[reactome_id] return component_id @@ -61,12 +77,35 @@ def is_valid_uuid(identifier: Any) -> bool: return True if len(identifier) == 64 else False +def is_functionally_equivalent_entity(entity_id: str) -> Optional[Dict[str, Any]]: + """Check if entity is part of a functional equivalence group.""" + return FUNCTIONAL_EQUIVALENT_GROUPS.get(entity_id) + + +def apply_functional_equivalence_strategy( + entity_id: str, functional_group: Dict[str, Any], original_result: Set[str] +) -> Set[str]: + """Apply functional equivalence strategy to entity decomposition.""" + strategy = functional_group.get("strategy", "use_canonical") + + if strategy == "use_canonical": + canonical_rep = functional_group.get("canonical_representative", entity_id) + return {canonical_rep} + else: + return original_result + + def get_broken_apart_ids( - broken_apart_members: list[set[str]], reactome_id: ReactomeID + broken_apart_members: list[set[str]], + reactome_id: ReactomeID, + stoichiometry_map: Dict[str, int] = None, # NEW PARAMETER ) -> Set[UID]: - """Get broken apart IDs.""" + """Get broken apart IDs with stoichiometry support.""" global decomposed_uid_mapping + if stoichiometry_map is None: + stoichiometry_map = {} + uids: Set[UID] if any(isinstance(member, set) for member in broken_apart_members): new_broken_apart_members = [] @@ -77,22 +116,33 @@ def get_broken_apart_ids( new_broken_apart_members.append({member}) iterproduct_components = list(itertools.product(*new_broken_apart_members)) + total_combinations = len(iterproduct_components) + + if total_combinations > 1000: + logger.warning( + f"Large combination set for reaction {reactome_id}: {total_combinations} combinations" + ) + iterproduct_components_as_sets = [ set(map(str, item)) for item in iterproduct_components ] uids = get_uids_for_iterproduct_components( - iterproduct_components_as_sets, reactome_id + iterproduct_components_as_sets, reactome_id, stoichiometry_map ) else: uid = str(uuid.uuid4()) rows: List[DataFrameRow] = [] row: DataFrameRow + for member in broken_apart_members: if is_valid_uuid(member): component_ids = decomposed_uid_mapping.loc[ decomposed_uid_mapping["uid"] == member, "component_id" ].tolist() for component_id in component_ids: + # NEW: Add stoichiometry information + stoichiometry = stoichiometry_map.get(component_id, 1) + row = { "uid": uid, "component_id": component_id, @@ -102,35 +152,52 @@ def get_broken_apart_ids( ), "input_or_output_uid": member, "input_or_output_reactome_id": None, + "stoichiometry": stoichiometry, # NEW + "entity_stoichiometry": stoichiometry, # NEW + "component_stoichiometry": 1, # NEW } rows.append(row) else: + # NEW: Add stoichiometry information + stoichiometry = stoichiometry_map.get(member, 1) + row = { "uid": uid, "component_id": member, "reactome_id": reactome_id, "component_id_or_reference_entity_id": get_component_id_or_reference_entity_id( - component_id + member ), "input_or_output_uid": None, "input_or_output_reactome_id": member, + "stoichiometry": stoichiometry, # NEW + "entity_stoichiometry": stoichiometry, # NEW + "component_stoichiometry": 1, # NEW } rows.append(row) uids = {uid} + decomposed_uid_mapping = pd.concat([decomposed_uid_mapping, pd.DataFrame(rows)]) return uids def get_uids_for_iterproduct_components( - iterproduct_components: List[Set[ComponentID]], reactome_id: ReactomeID + iterproduct_components: List[Set[ComponentID]], + reactome_id: ReactomeID, + stoichiometry_map: Dict[str, int] = None, # NEW PARAMETER ) -> Set[UID]: """Get UID for iterproduct components.""" global decomposed_uid_mapping + if stoichiometry_map is None: + stoichiometry_map = {} + uids: Set[UID] = set() for component in iterproduct_components: component_to_input_or_output: Dict[ComponentID, InputOutputID] = {} + component_stoichiometries: Dict[ComponentID, int] = {} # NEW + for item in component: if is_valid_uuid(item): selected_rows = decomposed_uid_mapping.loc[ @@ -139,12 +206,21 @@ def get_uids_for_iterproduct_components( for index, selected_row in selected_rows.iterrows(): component_id = selected_row["component_id"] component_to_input_or_output[component_id] = item + # NEW: Get stoichiometry from existing data + component_stoichiometries[component_id] = selected_row.get( + "stoichiometry", 1 + ) else: component_to_input_or_output[item] = item + # NEW: Get stoichiometry from the map + component_stoichiometries[item] = stoichiometry_map.get(item, 1) - uid = hashlib.sha256( - str(sorted(component_to_input_or_output.values())).encode() - ).hexdigest() + # MODIFIED: Include stoichiometry in UID calculation + uid_data = [ + (comp, io, component_stoichiometries.get(comp, 1)) + for comp, io in sorted(component_to_input_or_output.items()) + ] + uid = hashlib.sha256(str(uid_data).encode()).hexdigest() rows: List[DataFrameRow] = [] for component_id, input_or_output_id in component_to_input_or_output.items(): @@ -154,6 +230,10 @@ def get_uids_for_iterproduct_components( input_or_output_reactome_id = ( input_or_output_id if not is_valid_uuid(input_or_output_id) else None ) + + # NEW: Get stoichiometry information + stoichiometry = component_stoichiometries.get(component_id, 1) + row: DataFrameRow = { "uid": uid, "component_id": component_id, @@ -163,6 +243,9 @@ def get_uids_for_iterproduct_components( ), "input_or_output_uid": input_or_output_uid, "input_or_output_reactome_id": input_or_output_reactome_id, + "stoichiometry": stoichiometry, # NEW + "entity_stoichiometry": stoichiometry, # NEW + "component_stoichiometry": stoichiometry, # NEW } rows.append(row) @@ -173,7 +256,25 @@ def get_uids_for_iterproduct_components( def break_apart_entity(entity_id: int) -> Set[str]: - """Break apart entity.""" + """Break apart entity with functional equivalence support.""" + global decomposed_uid_mapping + + entity_str = str(entity_id) + + # Check for functional equivalence rules first + functional_group = is_functionally_equivalent_entity(entity_str) + if functional_group: + original_result = break_apart_entity_standard(entity_id) + return apply_functional_equivalence_strategy( + entity_str, functional_group, original_result + ) + + # Use original decomposition logic + return break_apart_entity_standard(entity_id) + + +def break_apart_entity_standard(entity_id: int) -> Set[str]: + """Standard break_apart_entity logic with stoichiometry support.""" global decomposed_uid_mapping labels = get_labels(entity_id) @@ -197,12 +298,12 @@ def break_apart_entity(entity_id: int) -> Set[str]: contains_thing = contains_reference_gene_product_molecule_or_isoform(entity_id) if contains_thing: return set([str(entity_id)]) + member_ids = get_set_members(entity_id) member_list: List[str] = [] for member_id in member_ids: members = break_apart_entity(member_id) - if isinstance(members, set): member_list.extend(members) else: @@ -212,13 +313,27 @@ def break_apart_entity(entity_id: int) -> Set[str]: elif "Complex" in labels: broken_apart_members: List[Set[str]] = [] - member_ids = get_complex_components(entity_id) + + # MODIFIED: Get components with stoichiometry + component_data = get_complex_components_with_stoichiometry(entity_id) + print(f"Complex {entity_id} component data: {component_data}") + + component_stoichiometry = {} + member_ids = [] + + for component_info in component_data: + component_id = component_info["component_id"] + stoichiometry = component_info["stoichiometry"] + component_stoichiometry[str(component_id)] = stoichiometry + member_ids.append(component_id) for member_id in member_ids: members = break_apart_entity(member_id) broken_apart_members.append(members) - return get_broken_apart_ids(broken_apart_members, str(entity_id)) + return get_broken_apart_ids( + broken_apart_members, str(entity_id), component_stoichiometry + ) elif any( entity_label in labels @@ -232,7 +347,6 @@ def break_apart_entity(entity_id: int) -> Set[str]: "SimpleEntity", ] ): - return {str(entity_id)} else: @@ -241,25 +355,49 @@ def break_apart_entity(entity_id: int) -> Set[str]: def decompose_by_reactions(reaction_ids: List[int]) -> List[Any]: - """Decompose by reactions.""" + """Decompose by reactions with stoichiometry support.""" global decomposed_uid_mapping - logger.debug("Decomposing reactions") + logger.debug("Decomposing reactions with stoichiometry") all_best_matches = [] for reaction_id in reaction_ids: - input_ids = get_reaction_input_output_ids(reaction_id, "input") + print(f"Processing reaction {reaction_id}") + + # MODIFIED: Get inputs with stoichiometry + input_data = get_reaction_input_output_ids_with_stoichiometry( + reaction_id, "input" + ) + input_ids = [item["entity_id"] for item in input_data] + input_stoichiometry = { + str(item["entity_id"]): item["stoichiometry"] for item in input_data + } + + print(f"Reaction {reaction_id} inputs: {input_ids}") + print(f"Input stoichiometry: {input_stoichiometry}") + broken_apart_input_id = [break_apart_entity(input_id) for input_id in input_ids] input_combinations = get_broken_apart_ids( - broken_apart_input_id, str(reaction_id) + broken_apart_input_id, str(reaction_id), input_stoichiometry ) - output_ids = get_reaction_input_output_ids(reaction_id, "output") + # MODIFIED: Get outputs with stoichiometry + output_data = get_reaction_input_output_ids_with_stoichiometry( + reaction_id, "output" + ) + output_ids = [item["entity_id"] for item in output_data] + output_stoichiometry = { + str(item["entity_id"]): item["stoichiometry"] for item in output_data + } + + print(f"Reaction {reaction_id} outputs: {output_ids}") + print(f"Output stoichiometry: {output_stoichiometry}") + broken_apart_output_id = [ break_apart_entity(output_id) for output_id in output_ids ] output_combinations = get_broken_apart_ids( - broken_apart_output_id, str(reaction_id) + broken_apart_output_id, str(reaction_id), output_stoichiometry ) [best_matches, _] = find_best_reaction_match( @@ -274,10 +412,13 @@ def decompose_by_reactions(reaction_ids: List[int]) -> List[Any]: def get_decomposed_uid_mapping( pathway_id: str, reaction_connections: pd.DataFrame ) -> Tuple[pd.DataFrame, List[Any]]: - """Get decomposed UID mapping.""" + """Get decomposed UID mapping with stoichiometry support.""" global decomposed_uid_mapping - decomposed_uid_mapping.drop(decomposed_uid_mapping.index, inplace=True) + # MODIFIED: Use enhanced column types + decomposed_uid_mapping = pd.DataFrame( + columns=enhanced_decomposed_uid_mapping_column_types.keys() + ).astype(enhanced_decomposed_uid_mapping_column_types) reaction_ids = pd.unique( reaction_connections[ @@ -287,6 +428,14 @@ def get_decomposed_uid_mapping( reaction_ids = reaction_ids[~pd.isna(reaction_ids)] # removing NA value from list reaction_ids = reaction_ids.astype(int).tolist() # converting to integer + + print(f"Processing {len(reaction_ids)} reactions for pathway {pathway_id}") + best_matches = decompose_by_reactions(list(reaction_ids)) + print(f"Final decomposed_uid_mapping shape: {decomposed_uid_mapping.shape}") + print("Stoichiometry column summary:") + if "stoichiometry" in decomposed_uid_mapping.columns: + print(decomposed_uid_mapping["stoichiometry"].value_counts()) + return (decomposed_uid_mapping, best_matches)