Source code for reconstruction.ecoli.dataclasses.getter_functions

"""
SimulationData getter functions
"""

import itertools
import re

from Bio.Seq import Seq
import numpy as np
import numpy.typing as npt

from reconstruction.ecoli.dataclasses.molecule_groups import POLYMERIZED_FRAGMENT_PREFIX
from wholecell.utils import units

EXCLUDED_RNA_TYPES = {"pseudo", "phantom"}

# Mapping of compartment IDs to abbreviations for compartments undefined in
# flat/compartments.tsv
UNDEFINED_COMPARTMENT_IDS_TO_ABBREVS = {
    "CCI-PERI-BAC-GN": "p",
    "CCI-PM-BAC-NEG-GN": "i",
    "CCO-PM-BAC-ACT": "i",
    "CCO-OUT": "e",
    "CCO-IN": "c",
    "NIL": "c",
    "CCO-CW-BAC-NEG": "o",
    "CCO-CE-BAC": "m",
    "CCO-BAC-NUCLEOID": "c",
    "CCO-RIBOSOME": "c",
}

IGNORED_DNA_SITE_TYPES = {
    "dna-binding-site",
    "phage-attachment-site",
    "rep-element",
    "repeat-region",
}



[docs]
class TranscriptionDirectionError(Exception):
    pass




[docs]
class UnknownMolecularWeightError(Exception):
    pass




[docs]
class UnknownModifiedRnaError(Exception):
    pass




[docs]
class InvalidProteinComplexError(Exception):
    pass




[docs]
class GetterFunctions(object):
    """getterFunctions"""

    _compartment_tag = re.compile(r"\[[a-z]]")

    def __init__(self, raw_data, sim_data):
        self._n_submass_indexes = len(sim_data.submass_name_to_index)
        self._submass_name_to_index = sim_data.submass_name_to_index

        self._build_sequences(raw_data)
        self._build_all_masses(raw_data, sim_data)
        self._build_compartments(raw_data, sim_data)
        self._build_genomic_coordinates(raw_data)


[docs]
    def get_sequences(self, ids: list[str] | np.ndarray) -> list[str]:
        """
        Return a list of sequences of the molecules with the given IDs.
        """
        assert isinstance(ids, (list, np.ndarray))
        return [self._sequences[mol_id] for mol_id in ids]



[docs]
    def get_mass(self, mol_id: str):
        """
        Return the total mass of the molecule with a given ID.
        """
        assert isinstance(mol_id, str)
        return (
            self._mass_units
            * self._all_total_masses[self._compartment_tag.sub("", mol_id)]
        )



[docs]
    def get_masses(self, mol_ids: list | np.ndarray):
        """
        Return an array of total masses of the molecules with the given IDs.
        """
        assert isinstance(mol_ids, (list, np.ndarray))
        masses = [
            self._all_total_masses[self._compartment_tag.sub("", mol_id)]
            for mol_id in mol_ids
        ]
        return self._mass_units * np.array(masses)



[docs]
    def get_submass_array(self, mol_id: str):
        """
        Return the submass array of the molecule with a given ID.
        """
        assert isinstance(mol_id, str)
        return (
            self._mass_units
            * self._all_submass_arrays[self._compartment_tag.sub("", mol_id)]
        )



[docs]
    def get_compartment(self, mol_id: str) -> list[list[str]]:
        """
        Returns the list of one-letter codes for the compartments that the
        molecule with the given ID can exist in.
        """
        assert isinstance(mol_id, str)
        return self._all_compartments[mol_id]



[docs]
    def get_compartments(self, ids: list[str] | np.ndarray) -> list[list[str]]:
        """
        Returns a list of the list of one-letter codes for the compartments
        that each of the molecules with the given IDs can exist in.
        """
        assert isinstance(ids, (list, np.ndarray))
        return [self._all_compartments[x] for x in ids]



[docs]
    def get_compartment_tag(self, id_: str) -> str:
        """
        Look up a molecule id and return a compartment suffix tag like '[c]'.
        """
        return f"[{self._all_compartments[id_][0]}]"



[docs]
    def is_valid_molecule(self, mol_id: str) -> bool:
        """
        Returns True if the molecule with the given ID is a valid molecule (has
        both a submass array and a compartment tag).
        """
        return mol_id in self._all_submass_arrays and mol_id in self._all_compartments



[docs]
    def get_all_valid_molecules(self) -> list[str]:
        """
        Returns a list of all molecule IDs with assigned submass arrays and
        compartments.
        """
        return sorted(self._all_submass_arrays.keys() & self._all_compartments.keys())



[docs]
    def get_genomic_coordinates(self, site_id: str) -> tuple[int, int]:
        """
        Returns the genomic coordinates of the left and right ends of a DNA site
        given the ID of the site.
        """
        assert isinstance(site_id, str)
        return self._all_genomic_coordinates[site_id]



[docs]
    def get_miscrnas_with_singleton_tus(self) -> list[str]:
        """
        Returns a list of all miscRNA IDs with corresponding single-gene
        transcription units.
        """
        return sorted(self._miscrna_id_to_singleton_tu_id.keys())



[docs]
    def get_singleton_tu_id(self, rna_id: str) -> str:
        """
        Returns the ID of the single-gene transcription unit corresponding to
        the given miscRNA ID, if such a transcription unit exists. This is
        necessary to replace some references to miscRNA IDs in complexation or
        equilibrium reactions with their corresponding TU IDs, which are the
        molecules that are actually transcribed.
        """
        assert isinstance(rna_id, str)
        return self._miscrna_id_to_singleton_tu_id[rna_id]



[docs]
    def _build_sequences(self, raw_data):
        """
        Builds sequences of RNAs and proteins.
        """
        self._sequences = {}
        self._build_rna_sequences(raw_data)
        self._build_protein_sequences(raw_data)



[docs]
    def _build_rna_sequences(self, raw_data):
        """
        Builds nucleotide sequences of each transcription unit using the genome
        sequence and the left and right end positions.
        """
        genome_sequence = raw_data.genome_sequence

        def parse_sequence(tu_id, left_end_pos, right_end_pos, direction):
            """
            Parses genome sequence to get transcription unit sequence, given
            left and right end positions and transcription direction (Note:
            the left and right end positions in the raw data files are given as
            1-indexed coordinates)
            """
            if direction == "+":
                return genome_sequence[left_end_pos - 1 : right_end_pos].transcribe()
            elif direction == "-":
                return (
                    genome_sequence[left_end_pos - 1 : right_end_pos]
                    .reverse_complement()
                    .transcribe()
                )
            else:
                raise TranscriptionDirectionError(
                    f"Unidentified transcription direction given for {tu_id}"
                )

        # Get set of valid gene IDs that have positions on the chromosome, and
        # is not a pseudogene or a phantom gene
        gene_id_to_rna_type = {rna["gene_id"]: rna["type"] for rna in raw_data.rnas}
        valid_gene_ids = {
            gene["id"]
            for gene in raw_data.genes
            if gene["left_end_pos"] is not None
            and gene["right_end_pos"] is not None
            and gene_id_to_rna_type[gene["id"]] not in EXCLUDED_RNA_TYPES
        }

        # Set of gene IDs that are covered by listed transcription units
        covered_gene_ids = set()

        # Keep track of gene tuples of TUs to remove duplicate TUs that cover
        # the same set of genes but have different left & right end coordinates.
        # The first TU in the list that covers the given set of genes is always
        # selected over later TUs. Later TUs are discarded, but their evidence
        # codes are added to the list of evidence codes of the first TU.
        gene_tuple_to_tu_index = {}

        # Add sequences from transcription_units file
        gene_id_to_left_end_pos = {
            gene["id"]: gene["left_end_pos"] for gene in raw_data.genes
        }
        gene_id_to_right_end_pos = {
            gene["id"]: gene["right_end_pos"] for gene in raw_data.genes
        }
        gene_id_to_rna_id = {gene["id"]: gene["rna_ids"][0] for gene in raw_data.genes}
        gene_id_to_direction = {
            gene["id"]: gene["direction"] for gene in raw_data.genes
        }
        self._miscrna_id_to_singleton_tu_id = {}

        for i, tu in enumerate(raw_data.transcription_units):
            # Get list of genes in TU after excluding invalid genes
            gene_tuple = tuple(
                sorted(
                    [gene_id for gene_id in tu["genes"] if gene_id in valid_gene_ids]
                )
            )

            # Skip TUs with only invalid genes
            if len(gene_tuple) == 0:
                continue

            # Skip duplicate TUs but compile all evidence codes
            if gene_tuple in gene_tuple_to_tu_index:
                evidence_list = raw_data.transcription_units[
                    gene_tuple_to_tu_index[gene_tuple]
                ]["evidence"]
                new_evidence = tu["evidence"]

                # Add nonduplicate evidence codes to original list
                evidence_list.extend(list(set(new_evidence) - set(evidence_list)))
                continue
            else:
                gene_tuple_to_tu_index[gene_tuple] = i

            # Use gene coordinates if left and right end positions are not given
            if not isinstance(tu["left_end_pos"], int):
                tu["left_end_pos"] = min(
                    [gene_id_to_left_end_pos[gene_id] for gene_id in tu["genes"]]
                )
            if not isinstance(tu["right_end_pos"], int):
                tu["right_end_pos"] = max(
                    [gene_id_to_right_end_pos[gene_id] for gene_id in tu["genes"]]
                )

            # Keep track of genes that are covered
            covered_gene_ids |= set(tu["genes"])

            # Add mapping from miscRNA IDs to single-gene TU IDs
            if len(gene_tuple) == 1 and gene_id_to_rna_type[gene_tuple[0]] == "miscRNA":
                self._miscrna_id_to_singleton_tu_id[
                    gene_id_to_rna_id[gene_tuple[0]]
                ] = tu["id"]

            self._sequences[tu["id"]] = parse_sequence(
                tu["id"], tu["left_end_pos"], tu["right_end_pos"], tu["direction"]
            )

        # Add sequences of individual RNAs that are not part of any
        # transcription unit (these genes are assumed to be transcribed as
        # monocistronic transcription units)
        rna_id_to_gene_id = {gene["rna_ids"][0]: gene["id"] for gene in raw_data.genes}
        all_rna_ids = sorted(set([rna["id"] for rna in raw_data.rnas]))

        for rna_id in all_rna_ids:
            # Skip RNAs without associated genes
            if rna_id not in rna_id_to_gene_id:
                continue

            gene_id = rna_id_to_gene_id[rna_id]

            # Skip excluded genes
            if gene_id not in valid_gene_ids:
                continue

            # Skip mRNAs that are already covered by transcription units
            if gene_id in covered_gene_ids and gene_id_to_rna_type[gene_id] == "mRNA":
                continue

            left_end_pos = gene_id_to_left_end_pos[gene_id]
            right_end_pos = gene_id_to_right_end_pos[gene_id]

            self._sequences[rna_id] = parse_sequence(
                rna_id, left_end_pos, right_end_pos, gene_id_to_direction[gene_id]
            )



[docs]
    def _build_protein_sequences(self, raw_data):
        """
        Builds the amino acid sequences of each protein monomer using sequences
        in raw_data.
        """
        for protein in raw_data.proteins:
            if protein["seq"] is not None:
                # Remove internal stop codons
                self._sequences[protein["id"]] = Seq(protein["seq"].replace("*", ""))



[docs]
    def _build_submass_array(
        self, mw: float, submass_name: str
    ) -> npt.NDArray[np.float64]:
        """
        Converts a scalar molecular weight value to an array of submasses,
        given the name of the submass category that the molecule belongs to.
        """
        mw_array = np.zeros(self._n_submass_indexes)
        mw_array[self._submass_name_to_index[submass_name]] = mw
        return mw_array



[docs]
    def _build_all_masses(self, raw_data, sim_data):
        """
        Builds dictionary of molecular weights keyed with the IDs of molecules.
        Depending on the type of the molecule, weights are either pulled
        directly from raw data, or dynamically calculated from the existing
        data.
        """
        self._all_submass_arrays = {}

        metabolites_with_masses = [
            met for met in raw_data.metabolites if met["mw"] is not None
        ]

        # Water is the only metabolite with a separate submass classification
        self._all_submass_arrays.update(
            {
                met["id"]: (
                    self._build_submass_array(met["mw"], "metabolite")
                    if met["id"] != sim_data.molecule_ids.water[:-3]
                    else self._build_submass_array(met["mw"], "water")
                )
                for met in metabolites_with_masses
            }
        )

        # These updates can be dependent on metabolite masses
        self._all_submass_arrays.update(
            self._build_polymerized_subunit_masses(sim_data)
        )
        self._all_submass_arrays.update(self._build_rna_masses(raw_data, sim_data))
        self._all_submass_arrays.update(self._build_protein_masses(raw_data, sim_data))
        self._all_submass_arrays.update(
            self._build_full_chromosome_mass(raw_data, sim_data)
        )

        # These updates can be dependent on the masses calculated above
        self._all_submass_arrays.update(self._build_modified_rna_masses(raw_data))
        self._all_submass_arrays.update(self._build_protein_complex_masses(raw_data))

        # This update can be dependent on the masses calculated above
        self._all_submass_arrays.update(
            self._build_modified_protein_masses(raw_data, sim_data)
        )

        self._mass_units = units.g / units.mol

        # Build dictionary of total masses of each molecule
        self._all_total_masses = {
            mol_id: mass_array.sum()
            for (mol_id, mass_array) in self._all_submass_arrays.items()
        }



[docs]
    def _build_polymerized_subunit_masses(self, sim_data):
        """
        Builds dictionary of molecular weights of polymerized subunits (NTPs,
        dNTPs, and amino acids) by subtracting the end weights from the weights
        of original subunits.
        """
        polymerized_subunit_masses = {}

        def add_subunit_masses(subunit_ids, end_group_id):
            """
            Add subunit masses to dictionary given the IDs of the original
            subunits and the end group.
            """
            # Subtract mw of end group from each polymer subunit
            end_group_mw = self._all_submass_arrays[end_group_id[:-3]].sum()
            polymerized_subunit_mws = [
                self._all_submass_arrays[met_id[:-3]].sum() - end_group_mw
                for met_id in subunit_ids
            ]

            # Add to dictionary with prefixed ID
            polymerized_subunit_masses.update(
                {
                    POLYMERIZED_FRAGMENT_PREFIX
                    + subunit_id[:-3]: self._build_submass_array(mw, "metabolite")
                    for subunit_id, mw in zip(subunit_ids, polymerized_subunit_mws)
                }
            )

        add_subunit_masses(sim_data.molecule_groups.ntps, sim_data.molecule_ids.ppi)
        add_subunit_masses(sim_data.molecule_groups.dntps, sim_data.molecule_ids.ppi)
        add_subunit_masses(
            sim_data.molecule_groups.amino_acids, sim_data.molecule_ids.water
        )

        return polymerized_subunit_masses



[docs]
    def _build_rna_masses(self, raw_data, sim_data):
        """
        Builds dictionary of molecular weights of RNAs keyed with the RNA IDs.
        Molecular weights are calculated from the RNA sequence and the weights
        of polymerized NTPs.
        """
        rnas_with_seqs = [
            rna["id"]
            for rna in itertools.chain(raw_data.rnas, raw_data.transcription_units)
            if rna["id"] in self._sequences
        ]

        # Get RNA nucleotide compositions
        rna_seqs = self.get_sequences(rnas_with_seqs)
        nt_counts = []
        for seq in rna_seqs:
            nt_counts.append(
                [seq.count(letter) for letter in sim_data.ntp_code_to_id_ordered.keys()]
            )
        nt_counts = np.array(nt_counts)

        # Calculate molecular weights
        ppi_mw = self._all_submass_arrays[sim_data.molecule_ids.ppi[:-3]].sum()
        polymerized_ntp_mws = np.array(
            [
                self._all_submass_arrays[met_id[:-3]].sum()
                for met_id in sim_data.molecule_groups.polymerized_ntps
            ]
        )

        mws = nt_counts.dot(polymerized_ntp_mws) + ppi_mw  # Add end weight

        # Map monocistronic RNA IDs to RNA types
        rna_id_to_type = {rna["id"]: rna["type"] for rna in raw_data.rnas}

        # Add polycistronic RNAs to mapping
        gene_id_to_rna_id = {gene["id"]: gene["rna_ids"][0] for gene in raw_data.genes}

        for tu in raw_data.transcription_units:
            if tu["id"] not in self._sequences:
                continue
            tu_rna_types = [
                rna_id_to_type[gene_id_to_rna_id[gene]]
                for gene in tu["genes"]
                if rna_id_to_type[gene_id_to_rna_id[gene]] not in EXCLUDED_RNA_TYPES
            ]
            if len(set(tu_rna_types)) > 1 and set(tu_rna_types) != {"rRNA", "tRNA"}:
                raise ValueError(
                    f"Transcription unit {tu['id']} includes genes"
                    f" that encode for two or more different types of RNAs."
                    f" Such transcription units are not supported by this"
                    f" version of the model with the exception of rRNA"
                    f" transcription units with tRNA genes."
                )

            if len(tu_rna_types) == 1:
                rna_id_to_type[tu["id"]] = tu_rna_types[0]
            else:
                # Hybrid RNAs are set to have nonspecific mass
                rna_id_to_type[tu["id"]] = "nonspecific_RNA"

        return {
            rna_id: self._build_submass_array(mw, rna_id_to_type[rna_id])
            for (rna_id, mw) in zip(rnas_with_seqs, mws)
        }



[docs]
    def _build_protein_masses(self, raw_data, sim_data):
        """
        Builds dictionary of molecular weights of protein monomers keyed with
        the protein IDs. Molecular weights are calculated from the protein
        sequence and the weights of polymerized amino acids.
        """
        proteins_with_seqs = [
            protein["id"]
            for protein in raw_data.proteins
            if protein["id"] in self._sequences
        ]

        # Get protein amino acid compositions
        protein_seqs = self.get_sequences(proteins_with_seqs)
        aa_counts = []
        for seq in protein_seqs:
            aa_counts.append(
                [
                    seq.count(letter)
                    for letter in sim_data.amino_acid_code_to_id_ordered.keys()
                ]
            )
        aa_counts = np.array(aa_counts)

        # Calculate molecular weights
        water_mw = self._all_submass_arrays[sim_data.molecule_ids.water[:-3]].sum()
        polymerized_aa_mws = np.array(
            [
                self._all_submass_arrays[met_id[:-3]].sum()
                for met_id in sim_data.molecule_groups.polymerized_amino_acids
            ]
        )
        mws = aa_counts.dot(polymerized_aa_mws) + water_mw  # Add end weight

        return {
            protein_id: self._build_submass_array(mw, "protein")
            for (protein_id, mw) in zip(proteins_with_seqs, mws)
        }



[docs]
    def _build_full_chromosome_mass(self, raw_data, sim_data):
        """
        Calculates the mass of the full chromosome from its sequence and the
        weigths of polymerized dNTPs.
        """
        # Get chromosome dNTP compositions
        chromosome_seq = raw_data.genome_sequence
        forward_strand_nt_counts = np.array(
            [
                chromosome_seq.count(letter)
                for letter in sim_data.dntp_code_to_id_ordered.keys()
            ]
        )
        reverse_strand_nt_counts = np.array(
            [
                chromosome_seq.reverse_complement().count(letter)
                for letter in sim_data.dntp_code_to_id_ordered.keys()
            ]
        )

        # Calculate molecular weight
        polymerized_dntp_mws = np.array(
            [
                self._all_submass_arrays[met_id[:-3]].sum()
                for met_id in sim_data.molecule_groups.polymerized_dntps
            ]
        )
        mw = float(
            np.dot(
                forward_strand_nt_counts + reverse_strand_nt_counts,
                polymerized_dntp_mws,
            )
        )

        return {
            sim_data.molecule_ids.full_chromosome[:-3]: self._build_submass_array(
                mw, "DNA"
            )
        }



[docs]
    def _build_modified_rna_masses(self, raw_data):
        """
        Builds dictionary of molecular weights of modified RNAs keyed with
        the molecule IDs. Molecular weights are calculated from the
        stoichiometries of the modification reactions.
        """
        modified_rna_masses = {}

        # Get all modified form IDs from RNA data
        all_modified_rna_ids = {
            modified_rna_id
            for rna in raw_data.rnas
            for modified_rna_id in rna["modified_forms"]
        }

        # Loop through each charging reaction
        for rxn in raw_data.trna_charging_reactions:
            # Find molecule IDs whose masses are still unknown
            unknown_mol_ids = [
                mol_id
                for mol_id in rxn["stoichiometry"].keys()
                if mol_id not in self._all_submass_arrays
            ]
            # The only molecule whose mass is unknown should be the modified
            # RNA molecule
            if len(unknown_mol_ids) != 1:
                raise UnknownMolecularWeightError(
                    "RNA modification reaction %s has two or more reactants or products with unknown molecular weights: %s"
                    % (rxn["id"], unknown_mol_ids)
                )

            # All modified RNAs must be assigned to a specific RNA
            modified_rna_id = unknown_mol_ids[0]
            if modified_rna_id not in all_modified_rna_ids:
                raise UnknownModifiedRnaError(
                    "The modified RNA molecule %s was not assigned to any RNAs."
                    % (modified_rna_id,)
                )

            # Calculate mw of the modified RNA from the stoichiometry
            mw_sum = np.zeros(self._n_submass_indexes)
            for mol_id, coeff in rxn["stoichiometry"].items():
                if mol_id == modified_rna_id:
                    modified_rna_coeff = coeff
                else:
                    mw_sum += coeff * self._all_submass_arrays[mol_id]

            modified_rna_masses[modified_rna_id] = -mw_sum / modified_rna_coeff

        return modified_rna_masses



[docs]
    def _build_protein_complex_masses(self, raw_data):
        """
        Builds dictionary of molecular weights of protein complexes keyed with
        the molecule IDs. Molecular weights are calculated from the
        stoichiometries of the complexation/equilibrium reactions. For
        complexes whose subunits are also complexes, the molecular weights are
        calculated recursively.
        """
        protein_complex_masses = {}

        # Build mapping from complex ID to its subunit stoichiometry
        complex_id_to_stoich = {}

        for rxn in itertools.chain(
            raw_data.complexation_reactions, raw_data.equilibrium_reactions
        ):
            # Get the ID of the complex and the stoichiometry of subunits
            complex_ids = []
            subunit_stoich = {}

            for mol_id, coeff in rxn["stoichiometry"].items():
                # Replace miscRNA subunit IDs with TU IDs
                if mol_id in self._miscrna_id_to_singleton_tu_id:
                    mol_id = self.get_singleton_tu_id(mol_id)

                # Assume coefficients given as "null" equate to -1
                if coeff is None:
                    coeff = -1

                if coeff == 1:
                    complex_ids.append(mol_id)
                elif coeff < 0:
                    subunit_stoich.update({mol_id: -coeff})

                # Each molecule should either be a complex with the coefficient
                # 1 or a subunit with a negative coefficient
                else:
                    raise InvalidProteinComplexError(
                        "Reaction %s contains an invalid molecule %s with stoichiometric coefficient %.1f."
                        % (rxn["id"], mol_id, coeff)
                    )

            # Each reaction should only produce a single protein complex
            if len(complex_ids) != 1:
                raise InvalidProteinComplexError(
                    "Reaction %s does not produce a single protein complex."
                    % (rxn["id"],)
                )

            complex_id_to_stoich[complex_ids[0]] = subunit_stoich

        def get_mw(mol_id):
            """
            Recursive function that returns the molecular weight given a
            molecule ID.
            """
            # Look for molecule ID in existing dictionaries, and return
            # molecular weight array if found
            if mol_id in self._all_submass_arrays:
                return self._all_submass_arrays[mol_id]
            elif mol_id in protein_complex_masses:
                return protein_complex_masses[mol_id]

            # If molecule ID does not exist, recursively add up the molecular
            # weights of the molecule's subunits
            else:
                # Each complex molecule should have a corresponding subunit
                # stoichiometry
                if mol_id not in complex_id_to_stoich:
                    raise InvalidProteinComplexError(
                        "Complex %s is not being produced by any complexation or equilibrium reaction."
                        % (mol_id,)
                    )
                mw_sum = np.zeros(self._n_submass_indexes)

                for subunit_id, coeff in complex_id_to_stoich[mol_id].items():
                    mw_sum += coeff * get_mw(subunit_id)

                return mw_sum

        # Get molecular weights of each protein complex
        for complex_id in complex_id_to_stoich.keys():
            protein_complex_masses.update({complex_id: get_mw(complex_id)})

        return protein_complex_masses



[docs]
    def _build_modified_protein_masses(self, raw_data, sim_data):
        """
        Builds dictionary of molecular weights of modified proteins keyed with
        the molecule IDs. Molecular weights are calculated from the
        stoichiometries of the modification reactions.
        """
        modified_protein_masses = {}

        # List IDs of all metabolites involved in 2CS
        ALL_2CS_METABOLITES = ["ATP", "ADP", "Pi", "WATER", "PROTON"]

        # Map types of modified proteins to their phosphorylation reactions that
        # can be used to calculate their molecular weights
        PROTEIN_TYPE_TO_PHOSPHORYLATION_RXN_ID = {
            "PHOSPHO-HK": "POS-HK-PHOSPHORYLATION_RXN",
            "PHOSPHO-HK-LIGAND": "POS-LIGAND-BOUND-HK-PHOSPHORYLATION_RXN",
            "PHOSPHO-RR": "POS-RR-DEPHOSPHORYLATION_RXN",
        }

        # Get stoichiometries of all 2CS reaction types
        all_2cs_reaction_stoichs = {
            rxn["id"]: {mol["molecule"]: mol["coeff"] for mol in rxn["stoichiometry"]}
            for rxn in raw_data.two_component_system_templates
        }

        all_2cs_systems = [sys["molecules"] for sys in raw_data.two_component_systems]

        for system in all_2cs_systems:
            for mol_type, mol_id in system.items():
                # Skip molecules that are not modified proteins
                if mol_type not in PROTEIN_TYPE_TO_PHOSPHORYLATION_RXN_ID:
                    continue
                else:
                    # Get reaction stoichiometry of phosphorylation reaction
                    phosphorylation_reaction = PROTEIN_TYPE_TO_PHOSPHORYLATION_RXN_ID[
                        mol_type
                    ]
                    reaction_stoich = all_2cs_reaction_stoichs[phosphorylation_reaction]

                    # Initialize submass array
                    mw = np.zeros(self._n_submass_indexes)
                    coeff_modified_protein = None

                    # Calculate mass such that reaction is mass balanced
                    for mol, coeff in reaction_stoich.items():
                        if mol == mol_type:
                            coeff_modified_protein = coeff
                            continue
                        elif mol in ALL_2CS_METABOLITES:
                            mw += -coeff * self._all_submass_arrays[mol]
                        else:
                            mw += -coeff * self._all_submass_arrays[system[mol]]

                    mw = mw / coeff_modified_protein

                    # Remove water submass and add to metabolite submass
                    if mw[sim_data.submass_name_to_index["water"]] != 0:
                        mw[sim_data.submass_name_to_index["metabolite"]] += mw[
                            sim_data.submass_name_to_index["water"]
                        ]
                        mw[sim_data.submass_name_to_index["water"]] = 0

                    modified_protein_masses[mol_id] = mw

        return modified_protein_masses



[docs]
    def _build_compartments(self, raw_data, sim_data):
        self._all_compartments = {}
        all_compartments = [comp["abbrev"] for comp in raw_data.compartments]

        compartment_ids_to_abbreviations = {
            comp["id"]: comp["abbrev"] for comp in raw_data.compartments
        }
        # TODO (ggsun): Add some of these to list of compartments?
        compartment_ids_to_abbreviations.update(UNDEFINED_COMPARTMENT_IDS_TO_ABBREVS)

        # RNAs, modified RNAs, and full chromosomes only localize to the
        # cytosol
        self._all_compartments.update(
            {
                rna["id"]: ["c"]
                for rna in itertools.chain(raw_data.rnas, raw_data.transcription_units)
                if rna["id"] in self._sequences
            }
        )
        self._all_compartments.update(
            {
                modified_rna_id: ["c"]
                for rna in raw_data.rnas
                for modified_rna_id in rna["modified_forms"]
                if modified_rna_id in self._all_submass_arrays
            }
        )
        self._all_compartments.update(
            {sim_data.molecule_ids.full_chromosome[:-3]: ["c"]}
        )

        # Proteins are assumed to localize to a single compartment. If there is
        # experimental evidence for the protein existing in the cytosol, the
        # protein is assigned to the cytosol. If there are multiple
        # experimentally determined localizations other than the cytosol, the
        # first compartment given in the list is used. If an experimentally
        # determined localization doesn't exist, the first compartment given in
        # the list of computationally determined compartments is used. If no
        # compartment information is given, the protein is assumed to localize
        # to the cytosol.
        for protein in raw_data.proteins:
            exp_compartment = protein["experimental_compartment"]
            comp_compartment = protein["computational_compartment"]

            if (
                "CCO-CYTOSOL" in exp_compartment
                or len(exp_compartment) + len(comp_compartment) == 0
            ):
                compartment = "CCO-CYTOSOL"
            elif len(exp_compartment) > 0:
                compartment = exp_compartment[0]
            else:
                compartment = comp_compartment[0]

            self._all_compartments.update(
                {protein["id"]: [compartment_ids_to_abbreviations[compartment]]}
            )

        # Modified proteins localize to the single compartment specified in raw
        # data for each protein
        self._all_compartments.update(
            {
                protein["id"]: [protein["compartment"]]
                for protein in raw_data.modified_proteins
            }
        )

        # Metabolites localize to the cytosol plus any other compartment that
        # is specified in the list of metabolic reactions, or the periplasm if
        # the metabolite is a 2CS ligand
        metabolite_compartments = {met["id"]: ["c"] for met in raw_data.metabolites}

        # Loop through list of metabolic reactions and add missing compartments
        # for each metabolite
        for rxn in raw_data.metabolic_reactions:
            for met in rxn["stoichiometry"].keys():
                met_id, compartment, _ = re.split(r"[\[\]]", met)
                compartment_tag = compartment_ids_to_abbreviations[compartment]

                if (
                    met_id in metabolite_compartments
                    and compartment_tag not in metabolite_compartments[met_id]
                ):
                    metabolite_compartments[met_id].append(compartment_tag)

        # Add periplasm as valid compartment for 2CS ligands
        two_component_system_ligands = [
            system["molecules"]["LIGAND"] for system in raw_data.two_component_systems
        ]

        for met in two_component_system_ligands:
            if "p" not in metabolite_compartments[met]:
                metabolite_compartments[met].append("p")

        self._all_compartments.update(metabolite_compartments)

        # Polymerized subunits can localize to all compartments being modeled
        self._all_compartments.update(
            {
                subunit_id[:-3]: all_compartments
                for subunit_id in sim_data.molecule_groups.polymerized_subunits
            }
        )

        # Sort compartment abbreviations based on complexation priority: when
        # a molecule in a compartment with a higher complexation priority
        # forms a complex with a molecule in a compartment with low priority,
        # the complex is assigned to the higher-priority compartment.
        complexation_priorities = np.array(
            [comp["complexation_priority"] for comp in raw_data.compartments]
        )
        all_compartments_sorted = [
            all_compartments[i] for i in np.argsort(complexation_priorities)
        ]

        # Protein complexes are localized based on the compartments of their
        # subunits
        self._all_compartments.update(
            self._build_protein_complex_compartments(raw_data, all_compartments_sorted)
        )



[docs]
    def _build_protein_complex_compartments(self, raw_data, all_compartments_sorted):
        """
        Builds dictionary of compartment tags for protein complexes keyed with
        the molecule IDs. Each complex is assigned to the compartment that
        contains a subunit of the complex and has the highest complexation
        priority. Compartment tags of subunits that are also protein complexes
        are determined recursively. Compartments of metabolite subunits are not
        considered since metabolites are assigned to all compartments that are
        being modeled.
        """
        protein_complex_compartments = {}
        metabolite_ids = {met["id"] for met in raw_data.metabolites}

        # Build mapping from complex ID to its subunit IDs
        complex_id_to_subunit_ids = {}

        for rxn in itertools.chain(
            raw_data.complexation_reactions, raw_data.equilibrium_reactions
        ):
            # Get the ID of the complex and the stoichiometry of subunits
            complex_id = None
            subunit_ids = []

            for mol_id, coeff in rxn["stoichiometry"].items():
                # Replace miscRNA subunit IDs with TU IDs
                if mol_id in self._miscrna_id_to_singleton_tu_id:
                    mol_id = self.get_singleton_tu_id(mol_id)

                if coeff is None:
                    coeff = -1

                if coeff == 1:
                    complex_id = mol_id
                elif coeff < 0:
                    subunit_ids.append(mol_id)

            complex_id_to_subunit_ids[complex_id] = subunit_ids

        def get_compartment(mol_id):
            """
            Recursive function that returns a compartment tag given a molecule ID.
            """
            # Look for molecule ID in existing dictionaries, and return
            # compartment tag if found
            if mol_id in self._all_compartments:
                return self._all_compartments[mol_id]
            elif mol_id in protein_complex_compartments:
                return protein_complex_compartments[mol_id]

            # If molecule ID does not exist, return the subunit compartment
            # with the highest priority
            else:
                all_subunit_compartments = []
                for subunit_id in complex_id_to_subunit_ids[mol_id]:
                    # Skip metabolites (can exist in all compartments)
                    if subunit_id in metabolite_ids:
                        continue
                    else:
                        all_subunit_compartments.extend(get_compartment(subunit_id))

                if len(all_subunit_compartments) == 0:
                    raise InvalidProteinComplexError(
                        "Complex %s is composed of only metabolites." % (mol_id,)
                    )

                # Return compartment with highest complexation priority
                for loc in all_compartments_sorted:
                    if loc in all_subunit_compartments:
                        return [loc]

        # Get compartments of each protein complex
        for complex_id in complex_id_to_subunit_ids.keys():
            protein_complex_compartments.update(
                {complex_id: get_compartment(complex_id)}
            )

        return protein_complex_compartments



[docs]
    def _build_genomic_coordinates(self, raw_data):
        """
        Builds a dictionary of genomic coordinates of DNA sites. Keys are the
        IDs of the sites, and the values are tuples of the left-end coordinate
        and the right-end coordinate of the site. Sites whose types are included
        in IGNORED_DNA_SITE_TYPES are ignored.
        """
        self._all_genomic_coordinates = {
            site["id"]: (site["left_end_pos"], site["right_end_pos"])
            for site in raw_data.dna_sites
            if site["type"] not in IGNORED_DNA_SITE_TYPES
        }