Source code for ecoli.analysis.causality_network.build_network

"""
BuildNetwork
============

Constructs a network representations of simulation components from sim_data,
and generates files for node lists and edge lists.

Adding new nodes to the network:
--------------------------------

To add a new type of nodes to the network (either a state or process), you need
to write a new function within this file (build_network.py), which goes through
all of the instances of the new node type, and for each instance creates a
node:

.. code-block:: python

    new_node = Node()

adds attributes (``**attr``), which include "node_class", "node_type", "node_id",
"name", and "synonyms":

.. code-block:: python

    new_node.read_attributes(**attr)

and appends the node to the node list:

.. code-block:: python

    self.node_list.append(new_node)

The relevant edges that connect the new node to other nodes also need to be
specified:

.. code-block:: python

    new_edge = Edge("Edge Type")

The source and destination ids for that edge are added with an attribute:

.. code-block:: python

    attr = {
            'src_id': source_id,
            'dst_id': destination_id,
            }

    new_edge.read_attributes(**attr)

and the edge is then added to the edge list:

.. code-block:: python

    self.edge_list.append(new_edge)

With a complete node and edge list, you are ready to add dynamics data to each
node. This is done in read_dynamics.py. You first need to choose appropriate
dynamics data to represents that node's activity, and make sure it is saved in
a listener. read_dynamics.py uses saved listener output to load dynamics into
each node.

read_dynamics.py might require a new function to the dynamics data if it is of
a new node type, specified in the TYPE_TO_READER_FUNCTION dictionary. When the
node list is read, nodes of the new type will be passed into the new function,
which assigns that node dynamics from listener output:

.. code-block:: python

    node.read_dynamics(dynamics, dynamics_units)
"""

from collections import Counter
import numpy as np
import re
import os
import pickle
import json
from typing import Optional


from ecoli.analysis.causality_network.network_components import (
    Node,
    Edge,
    NODELIST_FILENAME,
    EDGELIST_FILENAME,
    NODE_LIST_HEADER,
    EDGE_LIST_HEADER,
    NODELIST_JSON,
    EDGELIST_JSON,
)

# Suffixes that are added to the node IDs of a particular type of node
NODE_ID_SUFFIX = {
    "transcription": "_TRS",
    "maturation": "_MAT",
    "translation": "_TRL",
    "regulation": "_REG",
}

# URL template
URL_TEMPLATE = (
    "https://ecocyc.org/ECOLI/substring-search?type=NIL&object={0}&"
    "quickSearch=Quick+Search"
)
URL_TEMPLATE_COMPOUND = "https://ecocyc.org/compound?orgid=ECOLI&id={0}"

"""
The following groups of molecules participate in multiple processes and are
thus identified here to prevent the addition of duplicate nodes.

Note:
	Identifying multi-process participatory molecules in this way is not
	required because --check_sanity checks for duplicate nodes. However,
	identifying such molecules here can streamline network building by
	eliminating the need to search through nodes that were added previously.

TODO:
	Future proof by programmatically finding multi-process participatory
	molecules (maybe dictionary of booleans).
"""
# Proteins that are reactants or products of a metabolic reaction
PROTEINS_IN_METABOLISM = [
    "EG50003-MONOMER[c]",
    "PHOB-MONOMER[c]",
    "PTSI-MONOMER[c]",
    "PTSH-MONOMER[c]",
]

# Equilibrium complexes that are formed from deactivated equilibrium reactions,
# but are reactants in a complexation reaction
EQUILIBRIUM_COMPLEXES_IN_COMPLEXATION = [
    "CPLX0-7701[c]",
    "CPLX0-7677[c]",
    "MONOMER0-1781[c]",
    "CPLX0-7702[c]",
]

# Metabolites that are used as ligands in equilibrium, but do not participate
# in any metabolic reactions
METABOLITES_ONLY_IN_EQUILIBRIUM = ["CPD-7[c]"]

# Molecules in 2CS (two component system) reactions that are not proteins
NONPROTEIN_MOLECULES_IN_2CS = [
    "ATP[c]",
    "ADP[c]",
    "WATER[c]",
    "Pi[c]",
    "PROTON[c]",
    "PHOSPHO-PHOB[c]",
]

COMPARTMENTS = {
    "n": "nucleoid",
    "j": "projection",
    "w": "cell wall",
    "c": "cytoplasm",
    "e": "extracellular",
    "m": "membrane",
    "o": "outer membrane",
    "p": "periplasm",
    "l": "pilus",
    "i": "inner membrane",
    "s": "flagellum",
}



[docs]
def molecule_compartment(molecule):
    match = re.match(r".+\[(.)]$", molecule)
    if match:
        return COMPARTMENTS.get(match.groups()[0])




[docs]
class BuildNetwork(object):
    """
    Constructs a causality network of simulation components, namely states and
    processes, of a whole-cell simulation using sim_data. Writes two files
    (node list and edge list) that are subsequently used by the dynamics reader
    to extract simulation results, and for the visual representation of the
    network.
    """

    def __init__(self, sim_data_file, output_dir, check_sanity=False):
        """
        TODO: have check_sanity looks for disconnected nodes, and edges
        with non-existent nodes.

        Args:
            sim_data_file: path to the variant sim_data pickle file used for
                building the network.
            output_dir: output directory for the node list and edge list files.
            check_sanity: if set to True, checks if there are any nodes with
                duplicate IDs in the network.
        """
        # Open simulation data and save as attribute
        with open(sim_data_file, "rb") as f:
            self.sim_data = pickle.load(f)

        self.output_dir = output_dir
        self.check_sanity = check_sanity

        self.common_names = self.sim_data.common_names

        self.node_list = []
        self.edge_list = []


[docs]
    def run(self):
        """
        Build the network and write node/edge list files.
        """
        self._build_network()
        self._write_json()

        # self._write_files()


[docs]
    def build_nodes_and_edges(self):
        """Build the network and return the node and edge lists."""
        self._build_network()
        return self._node_list(), self._edge_list()



[docs]
    def _build_network(self):
        """
        Add nodes and edges to the node/edge lists, and check for network
        sanity (optional).
        """
        # Add global nodes to the node list
        self._add_global_nodes()

        # Add state/process-specific nodes and edges to the node and edge list
        self._add_genes()
        self._add_transcription_and_transcripts()
        self._add_rna_maturation_and_mature_transcripts()
        self._add_translation_and_monomers()
        self._add_complexation_and_complexes()
        self._add_metabolism_and_metabolites()
        self._add_equilibrium()
        self._add_regulation()
        self._remove_hanging_edges()

        # Check for network sanity (optional)
        if self.check_sanity:
            self._find_duplicate_nodes()



[docs]
    def _node_list(self):
        return [node.to_dict() for node in self.node_list]



[docs]
    def _edge_list(self):
        def edge_dict(edge):
            return {
                "src_node_id": edge.src_id,
                "dst_node_id": edge.dst_id,
                "stoichiometry": edge.stoichiometry,
                "process": edge.process,
            }

        return [edge_dict(edge) for edge in self.edge_list]



[docs]
    def _write_files(self):
        """
        Write node/edge list as separate .tsv files.
        """
        # Open node list file
        with open(
            os.path.join(self.output_dir, NODELIST_FILENAME), "w"
        ) as nodelist_file:
            # Write header row
            nodelist_file.write(NODE_LIST_HEADER + "\n")

            # Write one row for each node
            for node in self.node_list:
                node.write_nodelist(nodelist_file)

        # Open edge list file
        with open(
            os.path.join(self.output_dir, EDGELIST_FILENAME), "w"
        ) as edgelist_file:
            # Write header row
            edgelist_file.write(EDGE_LIST_HEADER + "\n")

            # Write one row for each edge
            for edge in self.edge_list:
                edge.write_edgelist(edgelist_file)



[docs]
    def _write_json(self):
        """Write node and edge lists as json files."""
        nodes = self._node_list()
        node_json = json.dumps(nodes)
        node_path = os.path.join(self.output_dir, NODELIST_JSON)
        print("writing {} nodes to node file {}".format(len(nodes), node_path))
        with open(node_path, "w") as node_file:
            node_file.write(node_json)

        edges = self._edge_list()
        edge_json = json.dumps(edges)
        edge_path = os.path.join(self.output_dir, EDGELIST_JSON)
        print("writing {} edges to edge file {}".format(len(edges), edge_path))
        with open(edge_path, "w") as edge_file:
            edge_file.write(edge_json)



[docs]
    def _add_global_nodes(self):
        """
        Add global state nodes to the node list.
        """
        # Add total cell mass node to node list
        mass_node = Node()
        attr = {
            "node_class": "State",
            "node_type": "Global",
            "node_id": "cell_mass",
            "name": "Total cell mass",
        }
        mass_node.read_attributes(**attr)

        # Add total cell volume node to node list
        volume_node = Node()
        attr = {
            "node_class": "State",
            "node_type": "Global",
            "node_id": "cell_volume",
            "name": "Total cell volume",
        }
        volume_node.read_attributes(**attr)

        self.node_list.extend([mass_node, volume_node])



[docs]
    def _add_genes(self):
        """
        Add gene state nodes to the node list.
        """
        # Loop through all genes (in the order listed in transcription)
        for gene_id in self.sim_data.process.transcription.cistron_data["gene_id"]:
            # Initialize a single gene node
            gene_node = Node()

            # Get name and synonyms for gene
            gene_name = self.common_names.get_common_name(gene_id)
            gene_synonyms = self.common_names.get_synonyms(gene_id)

            # Get URL for gene
            gene_url = URL_TEMPLATE.format(gene_id)

            attr = {
                "node_class": "State",
                "node_type": "Gene",
                "node_id": gene_id,
                "name": gene_name,
                "synonyms": gene_synonyms,
                "url": gene_url,
                "location": COMPARTMENTS["n"],
            }

            gene_node.read_attributes(**attr)

            # Append gene node to node_list
            self.node_list.append(gene_node)



[docs]
    def _add_transcription_and_transcripts(self):
        """
        Add transcription process nodes and transcript state nodes to the node
        list, and edges connected to the transcription nodes to the edge list.
        """
        ntp_ids = self.sim_data.molecule_groups.ntps
        ppi_id = self.sim_data.molecule_ids.ppi
        rnap_id = self.sim_data.molecule_ids.full_RNAP
        all_gene_ids = self.sim_data.process.transcription.cistron_data["gene_id"]

        # Loop through all transcripts (in the order listed in transcription)
        for rna_id in self.sim_data.process.transcription.rna_data["id"]:
            # Initialize a single transcript node
            rna_node = Node()

            # Add attributes to the node
            # Add common name and synonyms
            rna_id_no_compartment = rna_id[:-3]
            rna_name = self.common_names.get_common_name(rna_id_no_compartment)
            rna_synonyms = self.common_names.get_synonyms(rna_id_no_compartment)

            attr = {
                "node_class": "State",
                "node_type": "RNA",
                "node_id": rna_id,
                "name": rna_name,
                "synonyms": rna_synonyms,
                "location": molecule_compartment(rna_id),
            }

            rna_node.read_attributes(**attr)

            # Append transcript node to node_list
            self.node_list.append(rna_node)

            # Initialize a single transcription node for each TU
            transcription_node = Node()

            # Add attributes to the node
            transcription_id = rna_id_no_compartment + NODE_ID_SUFFIX["transcription"]
            transcription_name = rna_name + " transcription"
            if isinstance(rna_synonyms, list):
                transcription_synonyms = [x + " transcription" for x in rna_synonyms]
            else:
                transcription_synonyms = [rna_synonyms]

            attr = {
                "node_class": "Process",
                "node_type": "Transcription",
                "node_id": transcription_id,
                "name": transcription_name,
                "synonyms": transcription_synonyms,
                "location": molecule_compartment(transcription_id),
            }
            transcription_node.read_attributes(**attr)

            # Append transcription node to node_list
            self.node_list.append(transcription_node)

            # Get IDs of genes that constitute the transcript
            constituent_gene_ids = [
                all_gene_ids[i]
                for i in self.sim_data.process.transcription.rna_id_to_cistron_indexes(
                    rna_id
                )
            ]

            # Add edge from gene to transcription node
            for gene_id in constituent_gene_ids:
                self._append_edge("Transcription", gene_id, transcription_id)

            # Add edge from transcription to transcript node
            self._append_edge("Transcription", transcription_id, rna_id)

            # Add edges from NTPs to transcription nodes
            for ntp_id in ntp_ids:
                self._append_edge("Transcription", ntp_id, transcription_id)

            # Add edge from transcription to Ppi
            self._append_edge("Transcription", transcription_id, ppi_id)

            # Add edges from RNA polymerases to transcription
            self._append_edge("Transcription", rnap_id, transcription_id)



[docs]
    def _add_rna_maturation_and_mature_transcripts(self):
        """
        Add RNA maturation process nodes and mature transcript state nodes to
        the node list, and edges connected to these nodes to the edge list.
        """
        transcription = self.sim_data.process.transcription
        mature_rna_ids = transcription.mature_rna_data["id"]
        unprocessed_rna_ids = transcription.rna_data["id"][
            transcription.rna_data["is_unprocessed"]
        ]
        rna_maturation_stoich_matrix = transcription.rna_maturation_stoich_matrix
        rna_maturation_enzyme_matrix = transcription.rna_maturation_enzyme_matrix
        rna_maturation_enzymes = transcription.rna_maturation_enzymes
        ntp_ids = self.sim_data.molecule_groups.ntps
        ppi_id = self.sim_data.molecule_ids.ppi

        # Loop through all mature transcripts (in the order listed in
        # transcription)
        for mature_rna_id in mature_rna_ids:
            # Initialize a single transcript node
            rna_node = Node()

            # Add attributes to the node
            # Add common name and synonyms
            rna_id_no_compartment = mature_rna_id[:-3]
            rna_name = self.common_names.get_common_name(rna_id_no_compartment)
            rna_synonyms = self.common_names.get_synonyms(rna_id_no_compartment)

            attr = {
                "node_class": "State",
                "node_type": "RNA",
                "node_id": mature_rna_id,
                "name": rna_name,
                "synonyms": rna_synonyms,
                "location": molecule_compartment(mature_rna_id),
            }

            rna_node.read_attributes(**attr)

            # Append transcript node to node_list
            self.node_list.append(rna_node)

        for i, unprocessed_rna_id in enumerate(unprocessed_rna_ids):
            # Initialize a single RNA maturation node for each unprocessed transcipt
            rna_maturation_node = Node()

            # Add attributes to the node
            rna_id_no_compartment = unprocessed_rna_id[:-3]
            rna_name = self.common_names.get_common_name(rna_id_no_compartment)
            rna_synonyms = self.common_names.get_synonyms(rna_id_no_compartment)

            maturation_id = rna_id_no_compartment + NODE_ID_SUFFIX["maturation"]
            maturation_name = rna_name + " maturation"
            if isinstance(rna_synonyms, list):
                maturation_synonyms = [x + " maturation" for x in rna_synonyms]
            else:
                maturation_synonyms = [rna_synonyms]

            attr = {
                "node_class": "Process",
                "node_type": "RNA Maturation",
                "node_id": maturation_id,
                "name": maturation_name,
                "synonyms": maturation_synonyms,
            }
            rna_maturation_node.read_attributes(**attr)

            # Append transcription node to node_list
            self.node_list.append(rna_maturation_node)

            # Get IDs of mature RNAs that are generated from this transcript
            constituent_transcript_ids = [
                mature_rna_ids[i]
                for i in rna_maturation_stoich_matrix.getcol(i).nonzero()[0]
            ]

            # Add edge from the maturation node to mature transcripts
            for rna_id in constituent_transcript_ids:
                self._append_edge("RNA Maturation", maturation_id, rna_id)

            # Add edge from the unprocessed rna to the maturation node
            self._append_edge("RNA Maturation", unprocessed_rna_id, maturation_id)

            # Add edges from maturation node to NTPs
            for ntp_id in ntp_ids:
                self._append_edge("RNA Maturation", maturation_id, ntp_id)

            # Add edge from ppi to maturation node
            self._append_edge("RNA Maturation", ppi_id, maturation_id)

            # Get indexes of enzymes that catalyze the maturation
            enzyme_indexes = np.where(rna_maturation_enzyme_matrix[i, :])[0]

            # Add edge from each enzyme to maturation node
            for enzyme_index in enzyme_indexes:
                self._append_edge(
                    "RNA Maturation",
                    rna_maturation_enzymes[enzyme_index],
                    maturation_id,
                )



[docs]
    def _add_translation_and_monomers(self):
        """
        Add translation process nodes and protein (monomer) state nodes to the
        node list, and edges connected to the translation nodes to the edge
        list.
        """
        # Create nodes for amino acids
        aa_ids = self.sim_data.molecule_groups.amino_acids
        gtp_id = "GTP[c]"
        gdp_id = "GDP[c]"
        water_id = self.sim_data.molecule_ids.water
        ppi_id = self.sim_data.molecule_ids.ppi

        ribosome_subunit_ids = [
            self.sim_data.molecule_ids.s30_full_complex,
            self.sim_data.molecule_ids.s50_full_complex,
        ]
        all_rna_ids = self.sim_data.process.transcription.rna_data["id"]

        # Construct dictionary to get corresponding gene IDs from RNA IDs
        rna_id_to_gene_id = {}
        for cistron_id, gene_id in zip(
            self.sim_data.process.transcription.cistron_data["id"],
            self.sim_data.process.transcription.cistron_data["gene_id"],
        ):
            rna_id_to_gene_id[cistron_id] = gene_id

        # Loop through all translatable genes
        for monomer_id, cistron_id in zip(
            self.sim_data.process.translation.monomer_data["id"],
            self.sim_data.process.translation.monomer_data["cistron_id"],
        ):
            gene_id = rna_id_to_gene_id[cistron_id]

            # Initialize a single protein node
            protein_node = Node()

            # Add attributes to the node
            monomer_id_no_compartment = monomer_id[:-3]
            monomer_name = self.common_names.get_common_name(monomer_id_no_compartment)
            monomer_synonyms = self.common_names.get_synonyms(monomer_id_no_compartment)
            gene_name = self.common_names.get_common_name(gene_id)
            gene_synonyms = self.common_names.get_synonyms(gene_id)

            attr = {
                "node_class": "State",
                "node_type": "Protein",
                "node_id": monomer_id,
                "name": monomer_name,
                "synonyms": monomer_synonyms,
                "url": URL_TEMPLATE.format(gene_id),
                "location": molecule_compartment(monomer_id),
            }

            protein_node.read_attributes(**attr)

            # Append protein node to node_list
            self.node_list.append(protein_node)

            # Initialize a single translation node for each transcript
            translation_node = Node()

            # Add attributes to the node
            translation_id = gene_id + NODE_ID_SUFFIX["translation"]
            translation_name = gene_name + " translation"
            if isinstance(gene_synonyms, list):
                translation_synonyms = [x + " translation" for x in gene_synonyms]
            else:
                translation_synonyms = [gene_synonyms]

            attr = {
                "node_class": "Process",
                "node_type": "Translation",
                "node_id": translation_id,
                "name": translation_name,
                "synonyms": translation_synonyms,
                "url": URL_TEMPLATE.format(gene_id),
                "location": molecule_compartment(translation_id),
            }
            translation_node.read_attributes(**attr)

            # Append translation node to node_list
            self.node_list.append(translation_node)

            # Add edges from all transcripts that encode for the protein to
            # the translation node
            rna_ids = [
                all_rna_ids[i]
                for i in self.sim_data.process.transcription.cistron_id_to_rna_indexes(
                    cistron_id
                )
            ]

            for rna_id in rna_ids:
                self._append_edge("Translation", rna_id, translation_id)

            # Add edge from translation to monomer node
            self._append_edge("Translation", translation_id, monomer_id)

            # Add edges from amino acids to translation node
            for aa_id in aa_ids:
                self._append_edge("Translation", aa_id, translation_id)

            # Add edges from other reactants to translation node
            for reactant_id in [gtp_id, water_id]:
                self._append_edge("Translation", reactant_id, translation_id)

            # Add edges from translation to other product nodes
            for product_id in [gdp_id, ppi_id, water_id]:
                self._append_edge("Translation", translation_id, product_id)

            # Add edges from ribosome subunits to translation node
            for subunit_id in ribosome_subunit_ids:
                self._append_edge("Translation", subunit_id, translation_id)



[docs]
    def _add_complexation_and_complexes(self):
        """
        Add complexation process nodes and complex state nodes to the node
        list, and edges connected to the complexation nodes to the edge
        list.
        """
        # List of all complex IDs and reaction IDs
        complex_ids = (
            self.sim_data.process.complexation.ids_complexes
            + EQUILIBRIUM_COMPLEXES_IN_COMPLEXATION
        )
        reaction_ids = self.sim_data.process.complexation.ids_reactions

        molecule_ids = self.sim_data.process.complexation.molecule_names
        stoich_matrix = self.sim_data.process.complexation.stoich_matrix()

        # Loop through all complexation reactions
        for reaction_index, reaction_id in enumerate(reaction_ids):
            # Initialize a single complexation node for each complexation reaction
            complexation_node = Node()

            # Add attributes to the node
            attr = {
                "node_class": "Process",
                "node_type": "Complexation",
                "node_id": reaction_id,
                "name": reaction_id,
                "url": URL_TEMPLATE.format(reaction_id.replace("_RXN", "")),
                "location": molecule_compartment(reaction_id),
            }
            complexation_node.read_attributes(**attr)

            # Append node to node_list
            self.node_list.append(complexation_node)

            # Get reaction stoichiometry from stoichimetric matrix
            stoich_vector = stoich_matrix[:, reaction_index]
            molecule_indices = np.where(stoich_vector)[0]
            stoich_coeffs = stoich_vector[molecule_indices]

            # Loop through all proteins participating in the reaction
            for molecule_index, stoich in zip(molecule_indices, stoich_coeffs):
                # Add complexation edges
                # Note: the direction of the edge is determined by the sign of the
                # stoichiometric coefficient.
                if stoich > 0:
                    self._append_edge(
                        "Complexation",
                        reaction_id,
                        molecule_ids[molecule_index],
                        stoich,
                    )
                else:
                    self._append_edge(
                        "Complexation",
                        molecule_ids[molecule_index],
                        reaction_id,
                        stoich,
                    )

        for complex_id in complex_ids:
            # Initialize a single complex node for each complex
            complex_node = Node()

            # Add attributes to the node
            complex_id_no_compartment = complex_id[:-3]
            complex_name = self.common_names.get_common_name(complex_id_no_compartment)
            complex_synonyms = self.common_names.get_synonyms(complex_id_no_compartment)

            attr = {
                "node_class": "State",
                "node_type": "Complex",
                "node_id": complex_id,
                "name": complex_name,
                "synonyms": complex_synonyms,
                "url": URL_TEMPLATE.format(complex_id_no_compartment),
                "location": molecule_compartment(complex_id),
            }

            complex_node.read_attributes(**attr)

            # Append node to node_list
            self.node_list.append(complex_node)



[docs]
    def _add_metabolism_and_metabolites(self):
        """
        Add metabolism process nodes and metabolite state nodes to the node
        list, add edges connected to the metabolism nodes to the edge list.
        Note: forward and reverse reactions are represented as separate nodes.
        """
        # Get all reaction stoichiometry from sim_data
        reaction_stoich = self.sim_data.process.metabolism.reaction_stoich

        # Get reaction to catalyst dict from sim_data
        reaction_catalysts = self.sim_data.process.metabolism.reaction_catalysts

        # get transport reactions and remove from metabolism
        transport_reactions = set(self.sim_data.process.metabolism.transport_reactions)

        # Initialize list of metabolite IDs
        metabolite_ids = []

        # Loop through all reactions
        for reaction_id, stoich_dict in reaction_stoich.items():
            node_type = "Metabolism"

            # make a transport node if the reaction is in transport_reactions
            if reaction_id in transport_reactions:
                node_type = "Transport"

            # Initialize a single metabolism node for each reaction
            metabolism_node = Node()

            # Get URL for metabolism reaction
            if reaction_id.startswith("TRANS"):
                reaction_url_tag = "-".join(reaction_id.split("-")[:3])
            elif reaction_id.startswith("RXN"):
                reaction_url_tag = "-".join(reaction_id.split("-")[:2])
            else:
                reaction_url_tag = reaction_id.split("-RXN")[0] + "-RXN"
            reaction_url = URL_TEMPLATE.format(
                reaction_url_tag.replace(" (reverse)", "")
            )

            # Add attributes to the node
            attr = {
                "node_class": "Process",
                "node_type": node_type,
                "node_id": reaction_id,
                "name": reaction_id,
                "url": reaction_url,
                "location": molecule_compartment(reaction_id),
            }
            metabolism_node.read_attributes(**attr)

            # Append node to node_list
            self.node_list.append(metabolism_node)

            # Get list of proteins that catalyze this reaction
            catalyst_list = reaction_catalysts.get(reaction_id, [])

            # Add an edge from each catalyst to the metabolism node
            for catalyst in catalyst_list:
                self._append_edge(node_type, catalyst, reaction_id)

            # Loop through all metabolites participating in the reaction
            for metabolite, stoich in stoich_dict.items():
                # Add metabolites that were not encountered
                if metabolite not in metabolite_ids:
                    metabolite_ids.append(metabolite)

                # Add Metabolism edges
                # Note: the direction of the edge is determined by the sign of the
                # stoichiometric coefficient.
                if stoich > 0:
                    self._append_edge(node_type, reaction_id, metabolite, stoich)
                else:
                    self._append_edge(node_type, metabolite, reaction_id, stoich)

        # Add specific charging reactions
        # TODO (Travis): add charged/uncharged tRNA as RNA not metabolites?
        transcription = self.sim_data.process.transcription
        uncharged_trnas = transcription.uncharged_trna_names
        charging_stoich = transcription.charging_stoich_matrix().T
        charging_molecules = np.array(transcription.charging_molecules)
        synthetases = np.array(transcription.synthetase_names)
        trna_to_synthetase = transcription.aa_from_trna.T.dot(
            transcription.aa_from_synthetase
        )

        for stoich, trna, synth_idx in zip(
            charging_stoich, uncharged_trnas, trna_to_synthetase
        ):
            rxn = "{} net charging".format(trna[:-3])

            charging_node = Node()
            attr = {
                "node_class": "Process",
                "node_type": "Charging",
                "node_id": rxn,
                "name": rxn,
                "location": molecule_compartment(trna),
            }
            charging_node.read_attributes(**attr)

            # Append node to node_list
            self.node_list.append(charging_node)

            for synthetase in synthetases[synth_idx != 0]:
                self._append_edge("Charging", synthetase, rxn, 1)

            # Loop through all metabolites participating in the reaction
            mol_idx = np.where(stoich != 0)[0]
            for mol, direction in zip(charging_molecules[mol_idx], stoich[mol_idx]):
                # Add metabolites that were not encountered
                if mol != trna and mol not in metabolite_ids:
                    metabolite_ids.append(mol)

                # Add Charging edges
                # Note: the direction of the edge is determined by the sign of the
                # stoichiometric coefficient.
                if direction > 0:
                    self._append_edge("Charging", rxn, mol, direction)
                else:
                    self._append_edge("Charging", mol, rxn, direction)

        # Loop through all metabolites
        for metabolite_id in metabolite_ids:
            # Skip proteins - they should have already been added
            if metabolite_id in PROTEINS_IN_METABOLISM:
                continue

            # Initialize a single metabolite node for each metabolite
            metabolite_node = Node()

            # Add attributes to the node
            metabolite_id_no_compartment = metabolite_id[:-3]
            metabolite_name = self.common_names.get_common_name(
                metabolite_id_no_compartment
            )
            metabolite_synonyms = self.common_names.get_synonyms(
                metabolite_id_no_compartment
            )

            attr = {
                "node_class": "State",
                "node_type": "Metabolite",
                "node_id": metabolite_id,
                "name": metabolite_name,
                "synonyms": metabolite_synonyms,
                "url": URL_TEMPLATE_COMPOUND.format(metabolite_id_no_compartment),
                "location": molecule_compartment(metabolite_id),
            }

            metabolite_node.read_attributes(**attr)

            # Append node to node_list
            self.node_list.append(metabolite_node)



[docs]
    def _add_equilibrium(self):
        """
        Add equilibrium nodes to the node list, and add edges connected to the
        equilibrium nodes to the edge list.
        """
        # Get equilibrium-specific data from sim_data
        equilibrium_molecule_ids = self.sim_data.process.equilibrium.molecule_names
        equilibrium_reaction_ids = self.sim_data.process.equilibrium.rxn_ids
        equilibrium_stoich_matrix = self.sim_data.process.equilibrium.stoich_matrix()

        # Get IDs of complexes that were already added
        complexation_complex_ids = self.sim_data.process.complexation.ids_complexes

        # Get list of complex IDs in equilibrium
        equilibrium_complex_ids = self.sim_data.process.equilibrium.ids_complexes

        # Loop through each equilibrium reaction
        for reaction_index, reaction_id in enumerate(equilibrium_reaction_ids):
            # Initialize a single equilibrium node for each equilibrium reaction
            equilibrium_node = Node()

            # Add attributes to the node
            reaction_name = reaction_id[:-4] + " equilibrium rxn"
            attr = {
                "node_class": "Process",
                "node_type": "Equilibrium",
                "node_id": reaction_id,
                "name": reaction_name,
                "location": molecule_compartment(reaction_id),
            }
            equilibrium_node.read_attributes(**attr)

            # Append new node to node_list
            self.node_list.append(equilibrium_node)

            # Extract column corresponding to reaction in the stoichiometric matrix
            equilibrium_stoich_matrix_column = equilibrium_stoich_matrix[
                :, reaction_index
            ]

            # Loop through each element in column
            for molecule_index, stoich in enumerate(equilibrium_stoich_matrix_column):
                if stoich == 0:
                    continue

                molecule_id = equilibrium_molecule_ids[molecule_index]

                # Add Equilibrium edges
                # Note: the direction of the edge is determined by the sign of the
                # stoichiometric coefficient.
                if stoich > 0:
                    self._append_edge("Equilibrium", reaction_id, molecule_id, stoich)
                else:
                    self._append_edge("Equilibrium", molecule_id, reaction_id, stoich)

        # Get 2CS-specific data from sim_data
        tcs_molecule_ids = self.sim_data.process.two_component_system.molecule_names
        tcs_reaction_ids = self.sim_data.process.two_component_system.rxn_ids
        tcs_stoich_matrix = self.sim_data.process.two_component_system.stoich_matrix()

        # Initialize list of complex IDs in 2CS
        # TODO (ggsun): add this to sim_data
        tcs_complex_ids = []

        # Get lists of monomers that were already added
        monomer_ids = list(self.sim_data.process.translation.monomer_data["id"])

        # Loop through each 2CS reaction
        for reaction_index, reaction_id in enumerate(tcs_reaction_ids):
            # Initialize a single equilibrium node for each equilibrium reaction
            equilibrium_node = Node()

            # Add attributes to the node
            reaction_name = reaction_id[:-4] + " 2CS rxn"
            attr = {
                "node_class": "Process",
                "node_type": "Equilibrium",
                "node_id": reaction_id,
                "name": reaction_name,
                "location": molecule_compartment(reaction_id),
            }
            equilibrium_node.read_attributes(**attr)

            # Append new node to node_list
            self.node_list.append(equilibrium_node)

            # Extract column corresponding to reaction in the stoichiometric matrix
            tcs_stoich_matrix_column = tcs_stoich_matrix[:, reaction_index]

            # Loop through each element in column
            for molecule_index, stoich in enumerate(tcs_stoich_matrix_column):
                if stoich == 0:
                    continue

                molecule_id = tcs_molecule_ids[molecule_index]

                if molecule_id not in monomer_ids + NONPROTEIN_MOLECULES_IN_2CS:
                    tcs_complex_ids.append(molecule_id)

                # Add Equilibrium edges
                # Note: the direction of the edge is determined by the sign of the
                # stoichiometric coefficient.
                if stoich > 0:
                    self._append_edge("Equilibrium", reaction_id, molecule_id, stoich)
                else:
                    self._append_edge("Equilibrium", molecule_id, reaction_id, stoich)

        # Add new complexes that were encountered here
        for complex_id in list(set(equilibrium_complex_ids + tcs_complex_ids)):
            if complex_id in complexation_complex_ids:
                continue

            # Initialize a single complex node for each complex
            complex_node = Node()

            # Add attributes to the node
            complex_id_no_compartment = complex_id[:-3]
            complex_name = self.common_names.get_common_name(complex_id_no_compartment)

            attr = {
                "node_class": "State",
                "node_type": "Complex",
                "node_id": complex_id,
                "name": complex_name,
                "location": molecule_compartment(complex_id),
            }
            complex_node.read_attributes(**attr)

            # Append node to node_list
            self.node_list.append(complex_node)

        # Loop through metabolites that only appear in equilibrium
        for metabolite_id in METABOLITES_ONLY_IN_EQUILIBRIUM:
            # Initialize a single metabolite node for each metabolite
            metabolite_node = Node()

            # Add attributes to the node
            metabolite_id_no_compartment = metabolite_id[:-3]
            metabolite_name = self.common_names.get_common_name(
                metabolite_id_no_compartment
            )

            attr = {
                "node_class": "State",
                "node_type": "Metabolite",
                "node_id": metabolite_id,
                "name": metabolite_name,
                "location": molecule_compartment(metabolite_id),
            }
            metabolite_node.read_attributes(**attr)

            # Append node to node_list
            self.node_list.append(metabolite_node)



[docs]
    def _add_regulation(self):
        """
        Add regulation nodes with to the node list, and add edges connected to
        the regulation nodes to the edge list.
        """
        # Get list of transcription factor IDs and transcription unit IDs
        tf_ids = self.sim_data.process.transcription_regulation.tf_ids
        rna_ids = self.sim_data.process.transcription.rna_data["id"]
        cistron_ids = self.sim_data.process.transcription.cistron_data["id"]

        # Get delta_prob matrix from sim_data
        delta_prob = self.sim_data.process.transcription_regulation.delta_prob

        # Build dict that maps TFs to indexes of transcription units they
        # regulate
        TF_to_TU_idx = {}

        for i, tf in enumerate(tf_ids):
            TF_to_TU_idx[tf] = delta_prob["deltaI"][delta_prob["deltaJ"] == i]

        # Build dict that maps RNA IDs to gene IDs
        cistron_id_to_gene_id = {}

        for cistron_id, gene_id in zip(
            self.sim_data.process.replication.gene_data["cistron_id"],
            self.sim_data.process.replication.gene_data["name"],
        ):
            cistron_id_to_gene_id[cistron_id] = gene_id

        # Loop through all TFs
        for tf_id in tf_ids:
            # Add TF bound to DNA
            bound_tf_id = f"{tf_id}-bound"
            tf_node = Node()
            attr = {
                "node_class": "Process",
                "node_type": "TF Binding",
                "node_id": bound_tf_id,
                "name": bound_tf_id,
                "location": "c",
            }
            tf_node.read_attributes(**attr)
            self.node_list.append(tf_node)
            self._append_edge("Regulation", tf_id + "[c]", bound_tf_id)

            # Get IDs of RNAs that are regulated by the TF
            regulated_rna_ids = rna_ids[TF_to_TU_idx[tf_id]]

            for regulated_rna_id in regulated_rna_ids:
                regulated_cistron_ids = [
                    cistron_ids[i]
                    for i in self.sim_data.process.transcription.rna_id_to_cistron_indexes(
                        regulated_rna_id
                    )
                ]

                for regulated_cistron_id in regulated_cistron_ids:
                    # Find corresponding ID of gene
                    gene_id = cistron_id_to_gene_id[regulated_cistron_id]

                    # Initialize a single regulation node for each TF-gene pair
                    regulation_node = Node()

                    # Add attributes to the node
                    reg_id = tf_id + "_" + gene_id + NODE_ID_SUFFIX["regulation"]
                    reg_name = tf_id + "-" + gene_id + " gene regulation"
                    attr = {
                        "node_class": "Process",
                        "node_type": "Regulation",
                        "node_id": reg_id,
                        "name": reg_name,
                        "location": molecule_compartment(reg_id),
                    }
                    regulation_node.read_attributes(**attr)

                    self.node_list.append(regulation_node)

                    # Add edge from TF to this regulation node
                    self._append_edge("Regulation", bound_tf_id, reg_id)

                    # Add edge from this regulation node to the gene
                    self._append_edge("Regulation", reg_id, gene_id)



[docs]
    def _find_duplicate_nodes(self):
        """
        Identify nodes that have duplicate IDs.
        """
        counters = Counter([node.get_node_id() for node in self.node_list])
        duplicate_ids = {node_id for node_id in counters if counters[node_id] > 1}

        # Print duplicate node IDs that were found
        if len(duplicate_ids) > 0:
            raise ValueError(
                "%d node IDs have duplicates: %s" % (len(duplicate_ids), duplicate_ids)
            )



[docs]
    def _remove_hanging_edges(self):
        """
        Remove edges that are not connected to existing nodes.
        """
        # Get set of all node IDs
        node_ids = {node.node_id for node in self.node_list}
        disconnected_edge_indexes = []

        # Find edges whose source and destination nodes are not defined
        for index, edge in enumerate(self.edge_list):
            if edge.src_id not in node_ids or edge.dst_id not in node_ids:
                disconnected_edge_indexes.append(index)

        # Remove these edges
        for index in disconnected_edge_indexes[::-1]:
            self.edge_list.pop(index)



[docs]
    def _append_edge(
        self, type_: str, src: str, dst: str, stoichiometry: Optional[str | int] = ""
    ):
        """
        Helper function for appending new nodes to the network.
        """
        edge = Edge(type_)
        edge.read_attributes(src_id=src, dst_id=dst, stoichiometry=stoichiometry)
        self.edge_list.append(edge)