Source code for reconstruction.ecoli.dataclasses.process.transcription

"""
SimulationData for transcription process

TODO: add mapping of tRNA to charged tRNA if allowing more than one modified form of tRNA and separate mappings for tRNA and charged tRNA to AA
TODO: handle ppGpp and DksA-ppGpp regulation separately
"""

from functools import cache
from typing import cast

import numpy as np

from scipy.sparse import csr_matrix
import sympy as sp

from reconstruction.ecoli.dataclasses.getter_functions import EXCLUDED_RNA_TYPES
from .replication import MAX_TIMESTEP
from ecoli.library.schema import bulk_name_to_idx, counts
from wholecell.utils import data, fitting, units
from wholecell.utils.fast_nonnegative_least_squares import fast_nnls
from wholecell.utils.fitting import normalize
from wholecell.utils.unit_struct_array import UnitStructArray
from wholecell.utils.polymerize import polymerize
from wholecell.utils.random import make_elongation_rates


RNA_SEQ_ANALYSIS = "rsem_tpm"
PPGPP_CONC_UNITS = units.umol / units.L
PRINT_VALUES = False  # print values for supplemental table if True



[docs]
class TranscriptionDirectionError(Exception):
    pass




[docs]
class Transcription(object):
    """
    SimulationData for the transcription process
    """

    def __init__(self, raw_data, sim_data):
        self._build_ppgpp_regulation(raw_data, sim_data)
        self._build_oric_terc_coordinates(raw_data, sim_data)
        self._build_cistron_data(raw_data, sim_data)
        self._build_rna_data(raw_data, sim_data)
        self._build_mature_rna_data(raw_data, sim_data)
        self._build_transcription(raw_data, sim_data)
        self._build_charged_trna(raw_data, sim_data)
        self._build_attenuation(raw_data, sim_data)
        self._build_elongation_rates(raw_data, sim_data)
        self._build_new_gene_data(raw_data, sim_data)

    def __getstate__(self):
        """Return the state to pickle with transcriptionSequences removed and
        only storing data from transcriptionSequences with pad values stripped.
        """

        state = data.dissoc_strict(self.__dict__, ("transcription_sequences",))
        state["sequences"] = np.array(
            [seq[seq != polymerize.PAD_VALUE] for seq in self.transcription_sequences],
            dtype=object,
        )
        state["sequence_shape"] = self.transcription_sequences.shape
        return state

    def __setstate__(self, state):
        """Restore transcriptionSequences and remove processed versions of the data."""
        sequences = state.pop("sequences")
        sequence_shape = state.pop("sequence_shape")
        self.__dict__.update(state)

        self.transcription_sequences = np.full(
            sequence_shape, polymerize.PAD_VALUE, dtype=np.int8
        )
        for i, seq in enumerate(sequences):
            self.transcription_sequences[i, : len(seq)] = seq


[docs]
    def _build_ppgpp_regulation(self, raw_data, sim_data):
        """
        Determine which genes are regulated by ppGpp and store the fold
        change in expression associated with each gene.

        Attributes set:
                ppgpp_regulated_genes (ndarray[str]): cistron ID of regulated genes
                ppgpp_fold_changes (ndarray[float]): log2 fold change for each gene
                        in ppgpp_regulated_genes
                _ppgpp_growth_parameters: parameters for interpolate.splev
                        to estimate growth rate from ppGpp concentration
        """

        def read_value(d, k):
            """Handle empty values from raw_data as 0"""
            val = d[k]
            return 0 if val == "" else val

        # Flag to check when ppGpp regulated expression has been set
        # see set_ppgpp_expression()
        self._ppgpp_expression_set = False

        # Determine KM for ppGpp binding to RNAP
        self._solve_ppgpp_km(raw_data, sim_data)

        # Read regulation data from raw_data
        # Treats ppGpp and DksA-ppGpp regulation the same
        gene_to_rna = {g["symbol"]: g["rna_ids"][0] for g in raw_data.genes}
        rna_id_to_rna_type = {r["id"]: r["type"] for r in raw_data.rnas}
        regulation = {}
        for reg in raw_data.ppgpp_regulation:
            # Convert to regulated RNA
            gene = reg["Gene"]
            rna = gene_to_rna.get(gene, None)
            if rna is None or rna_id_to_rna_type[rna] in EXCLUDED_RNA_TYPES:
                continue

            # Add additional gene symbols for matching FC data
            curated_gene = reg["Curated Gene"]
            if gene != curated_gene:
                gene_to_rna[curated_gene] = rna

            # Update value (some genes are repeated in raw_data))
            direction = read_value(reg, "ppGpp") + read_value(reg, "DksA-ppGpp")
            regulation[rna] = regulation.get(rna, 0) + direction

        # Read fold change data from raw_data
        ## Categories A-D are statistically significant fold changes
        ## Categories E-G are not significant or data is not usable
        valid_categories = {"A", "B", "C", "D"}
        sample_time = (
            5  # Could also be 10 (5 min minimizes downstream regulation impacts)
        )
        sample_id = "1+2+ {} min".format(
            sample_time
        )  # Column contains FC data for given time
        rna_fold_changes = {}
        for fc in raw_data.ppgpp_fc:
            # Convert to regulated RNA
            gene = fc["Gene"]
            rna = gene_to_rna.get(gene, None)
            if rna is None:
                continue

            category = fc["{} Category".format(sample_id)]
            if category not in valid_categories:
                continue
            rna_fold_changes[rna] = fc[sample_id]

        # Store arrays of regulation
        regulated_genes = []
        regulation_direction = []
        fold_changes = []
        for rna in sorted(regulation):
            reg_dir = regulation[rna]
            fc_dir = rna_fold_changes.get(rna, 0)

            # Ignore inconsistent regulatory directions
            if reg_dir == 0:
                continue

            # Use default value if annotated direction does not match data direction
            if reg_dir * fc_dir < 0:
                fc_dir = 0

            regulated_genes.append(rna)
            regulation_direction.append(np.sign(reg_dir))
            fold_changes.append(fc_dir)
        self.ppgpp_regulated_genes = np.array(regulated_genes)
        regulation_direction = np.array(regulation_direction)

        # Replace fold changes without data with the average
        fold_changes = np.array(fold_changes)
        average_positive_fc = fold_changes[fold_changes > 0].mean()
        fold_changes[(fold_changes == 0) & (regulation_direction < 0)] = (
            self._fit_ppgpp_fc
        )
        fold_changes[(fold_changes == 0) & (regulation_direction > 0)] = (
            average_positive_fc
        )
        self.ppgpp_fold_changes = fold_changes

        # Predict growth rate from ppGpp level
        # Transforms selected for good fit and to keep the growth rate positive
        # even at high ppGpp concentrations.
        per_dry_mass_to_per_volume = (
            sim_data.constants.cell_density * sim_data.mass.cell_dry_mass_fraction
        )
        ppgpp = np.array(
            [
                (d["ppGpp_conc"] * per_dry_mass_to_per_volume).asNumber(
                    PPGPP_CONC_UNITS
                )
                for d in raw_data.growth_rate_dependent_parameters
            ]
        )
        growth_rates = np.log(2) / np.array(
            [
                d["doublingTime"].asNumber(units.s)
                for d in raw_data.growth_rate_dependent_parameters
            ]
        )
        self._ppgpp_growth_parameters = fitting.fit_linearized_transforms(
            ppgpp, growth_rates, x_fun=["none"], y_fun=["1/sqrt"]
        )

        if PRINT_VALUES:
            print(
                "Supplement value (KM): {:.1f}".format(np.sqrt(self._ppgpp_km_squared))
            )
            print(
                "Supplement value (FC): [{:.2f}, {:.2f}]".format(
                    fold_changes.min(), fold_changes.max()
                )
            )
            print("Supplement value (FC-): {:.2f}".format(self._fit_ppgpp_fc))
            print("Supplement value (FC+): {:.2f}".format(average_positive_fc))

        # Calculate the expected active fraction when RNAP is bound to ppGpp or free
        doubling_times = units.min * np.linspace(25, 100, 10)
        ppgpp = sim_data.growth_rate_parameters.get_ppGpp_conc(doubling_times)
        fraction_active = sim_data.growth_rate_parameters.get_fraction_active_rnap(
            doubling_times
        )
        fraction_bound = self.fraction_rnap_bound_ppgpp(ppgpp)
        A = np.vstack((fraction_bound, 1 - fraction_bound)).T
        self.fraction_active_rnap_bound, self.fraction_active_rnap_free = (
            np.linalg.lstsq(A, fraction_active, rcond=None)[0]
        )
        assert 0 < self.fraction_active_rnap_bound < 1
        assert 0 < self.fraction_active_rnap_free < 1



[docs]
    def _build_oric_terc_coordinates(self, raw_data, sim_data):
        """
        Builds coordinates of oriC and terC that are used when calculating
        genomic positions of cistrons and RNAs relative to the origin
        """
        # Get coordinates of oriC and terC
        oric_left, oric_right = sim_data.getter.get_genomic_coordinates(
            sim_data.molecule_ids.oriC_site
        )
        terc_left, terc_right = sim_data.getter.get_genomic_coordinates(
            sim_data.molecule_ids.terC_site
        )
        self._oric_coordinate = round((oric_left + oric_right) / 2)
        self._terc_coordinate = round((terc_left + terc_right) / 2)
        self._genome_length = len(raw_data.genome_sequence)



[docs]
    def _build_cistron_data(self, raw_data, sim_data):
        """
        Build cistron-associated simulation data from raw data. Cistrons are
        sections of RNAs that encode for a specific polypeptide. A single RNA
        molecule may contain one or more cistrons.
        """
        # Get list of all cistrons with an associated gene and right and left
        # end positions
        cistron_id_to_gene_id = {
            gene["rna_ids"][0]: gene["id"] for gene in raw_data.genes
        }
        gene_id_to_left_end_pos = {
            gene["id"]: gene["left_end_pos"] for gene in raw_data.genes
        }
        gene_id_to_right_end_pos = {
            gene["id"]: gene["right_end_pos"] for gene in raw_data.genes
        }

        all_cistrons = [
            rna
            for rna in raw_data.rnas
            if rna["id"] in cistron_id_to_gene_id
            and gene_id_to_left_end_pos[cistron_id_to_gene_id[rna["id"]]] is not None
            and gene_id_to_right_end_pos[cistron_id_to_gene_id[rna["id"]]] is not None
            and rna["type"] not in EXCLUDED_RNA_TYPES
        ]
        all_cistron_ids = [cistron["id"] for cistron in all_cistrons]

        # Load gene IDs associated with each cistron
        gene_id = np.array([cistron_id_to_gene_id[rna["id"]] for rna in all_cistrons])
        gene_id_to_cistron_id = {g: c for (c, g) in cistron_id_to_gene_id.items()}

        # Calculate lengths of each cistron from their gene end positions
        cistron_lengths = np.array(
            [
                np.abs(
                    gene_id_to_right_end_pos[cistron_id_to_gene_id[cistron["id"]]]
                    - gene_id_to_left_end_pos[cistron_id_to_gene_id[cistron["id"]]]
                )
                + 1
                for cistron in all_cistrons
            ]
        )

        # Get mapping from cistron IDs to coordinate and direction
        cistron_id_to_coordinate = {}
        cistron_id_to_direction = {}
        for gene in raw_data.genes:
            cistron_id_to_direction[gene["rna_ids"][0]] = gene["direction"]
            if gene["direction"] == "+":
                cistron_id_to_coordinate[gene["rna_ids"][0]] = gene["left_end_pos"]
            else:
                cistron_id_to_coordinate[gene["rna_ids"][0]] = gene["right_end_pos"]

        # Get location of each cistron on the chromosome relative to the origin
        replication_coordinate = [
            self._get_relative_coordinates(cistron_id_to_coordinate[cistron["id"]])
            for cistron in all_cistrons
        ]

        # Get direction of each cistron
        is_forward = [
            cistron_id_to_direction[cistron["id"]] == "+" for cistron in all_cistrons
        ]

        # Get cistron nucleotide compositions
        genome_sequence = raw_data.genome_sequence

        def parse_sequence(cistron_id, left_end_pos, right_end_pos, direction):
            """
            Parses genome sequence to get the sequence of the RNA transcribed
            from the cistron, given left and right end positions and
            transcription direction (Note: the left and right end positions in
            the raw data files are given as 1-indexed coordinates)
            """
            if direction == "+":
                return genome_sequence[left_end_pos - 1 : right_end_pos].transcribe()
            elif direction == "-":
                return (
                    genome_sequence[left_end_pos - 1 : right_end_pos]
                    .reverse_complement()
                    .transcribe()
                )
            else:
                raise TranscriptionDirectionError(
                    f"Unidentified transcription direction given for {cistron_id}"
                )

        rna_seqs = [
            parse_sequence(
                cistron_id,
                gene_id_to_left_end_pos[cistron_id_to_gene_id[cistron_id]],
                gene_id_to_right_end_pos[cistron_id_to_gene_id[cistron_id]],
                cistron_id_to_direction[cistron_id],
            )
            for cistron_id in all_cistron_ids
        ]

        nt_counts = []
        for seq in rna_seqs:
            nt_counts.append(
                [seq.count(letter) for letter in sim_data.ntp_code_to_id_ordered.keys()]
            )
        nt_counts = np.array(nt_counts)

        # Calculate molecular weights of the RNAs corresponding to each cistron
        ppi_mw = sim_data.getter.get_mass(sim_data.molecule_ids.ppi[:-3]).asNumber(
            units.g / units.mol
        )
        polymerized_ntp_mws = np.array(
            [
                sim_data.getter.get_mass(met_id[:-3]).asNumber(units.g / units.mol)
                for met_id in sim_data.molecule_groups.polymerized_ntps
            ]
        )

        mws = nt_counts.dot(polymerized_ntp_mws) + ppi_mw  # Add end weight

        # Get boolean arrays for each RNA type
        is_mRNA = [rna["type"] == "mRNA" for rna in all_cistrons]
        is_miscRNA = [rna["type"] == "miscRNA" for rna in all_cistrons]
        is_rRNA = [rna["type"] == "rRNA" for rna in all_cistrons]
        is_tRNA = [rna["type"] == "tRNA" for rna in all_cistrons]

        # Load set of mRNA cistron ids
        mRNA_cistron_ids = set(np.array(all_cistron_ids)[is_mRNA])

        # Load cistron half lives
        cistron_id_to_half_life = {}
        reported_mRNA_cistron_half_lives = []

        for gene in raw_data.rna_half_lives:
            if (
                gene["id"] in gene_id_to_cistron_id
                and gene["half_life"].asNumber(units.s) > 0
            ):
                cistron_id = gene_id_to_cistron_id[gene["id"]]
                cistron_id_to_half_life[cistron_id] = gene["half_life"]
                if cistron_id in mRNA_cistron_ids:
                    reported_mRNA_cistron_half_lives.append(gene["half_life"])

        # Calculate averaged reported half life of mRNAs
        self.average_mRNA_cistron_half_life = np.mean(reported_mRNA_cistron_half_lives)

        # Half-lives of rRNAs are set to be equal to the average reported half
        # life of mRNAs
        # Note: rRNAs complexed into ribosomal subunits will not degrade, so
        # this will only significantly affect excess rRNAs
        rRNA_cistron_ids = np.array(all_cistron_ids)[is_rRNA]
        for cistron_id in rRNA_cistron_ids:
            cistron_id_to_half_life[cistron_id] = self.average_mRNA_cistron_half_life

        # Half-life of tRNAs are set to the stable RNA half life value defined
        # in sim_data.constants
        tRNA_cistron_ids = np.array(all_cistron_ids)[is_tRNA]
        for cistron_id in tRNA_cistron_ids:
            cistron_id_to_half_life[cistron_id] = (
                sim_data.constants.stable_RNA_half_life
            )

        # Get half life of each RNA cistron - if the half life is not given, use
        # the averaged reported half life of mRNAs
        cistron_half_lives = np.array(
            [
                cistron_id_to_half_life.get(
                    cistron_id, self.average_mRNA_cistron_half_life
                ).asNumber(units.s)
                for cistron_id in all_cistron_ids
            ]
        )

        # Calculate expected first-order degradation rates of each cistron
        cistron_deg_rates = np.log(2) / cistron_half_lives

        # Construct boolean arrays for ribosomal protein and RNAP-encoding
        # cistrons
        n_cistrons = len(all_cistrons)

        is_ribosomal_protein = np.zeros(n_cistrons, dtype=bool)
        is_RNAP = np.zeros(n_cistrons, dtype=bool)
        for i, cistron in enumerate(all_cistrons):
            for monomer_id in cistron["monomer_ids"]:
                if monomer_id + "[c]" in sim_data.molecule_groups.ribosomal_proteins:
                    is_ribosomal_protein[i] = True
                if monomer_id + "[c]" in sim_data.molecule_groups.RNAP_subunits:
                    is_RNAP[i] = True

        # Construct boolean arrays and index arrays for each rRNA type
        is_23S = np.zeros(n_cistrons, dtype=bool)
        is_16S = np.zeros(n_cistrons, dtype=bool)
        is_5S = np.zeros(n_cistrons, dtype=bool)
        idx_23S = []
        idx_16S = []
        idx_5S = []

        for rnaIndex, cistron in enumerate(all_cistrons):
            if cistron["id"] + "[c]" in sim_data.molecule_groups.s50_23s_rRNA:
                is_23S[rnaIndex] = True
                idx_23S.append(rnaIndex)
            if cistron["id"] + "[c]" in sim_data.molecule_groups.s30_16s_rRNA:
                is_16S[rnaIndex] = True
                idx_16S.append(rnaIndex)
            if cistron["id"] + "[c]" in sim_data.molecule_groups.s50_5s_rRNA:
                is_5S[rnaIndex] = True
                idx_5S.append(rnaIndex)

        max_cistron_id_length = max(len(rna["id"]) for rna in all_cistrons)
        max_gene_id_length = max(len(id_) for id_ in gene_id)

        cistron_data = np.zeros(
            n_cistrons,
            dtype=[
                ("id", "U{}".format(max_cistron_id_length)),
                ("gene_id", "U{}".format(max_gene_id_length)),
                ("length", "i8"),
                ("replication_coordinate", "i8"),
                ("is_forward", "bool"),
                ("mw", "f8"),
                ("deg_rate", "f8"),
                ("is_mRNA", "bool"),
                ("is_miscRNA", "bool"),
                ("is_rRNA", "bool"),
                ("is_tRNA", "bool"),
                ("is_23S_rRNA", "bool"),
                ("is_16S_rRNA", "bool"),
                ("is_5S_rRNA", "bool"),
                ("is_ribosomal_protein", "bool"),
                ("is_RNAP", "bool"),
                ("uses_corrected_seq_counts", "bool"),
                ("is_new_gene", "bool"),
            ],
        )

        cistron_data["id"] = [rna["id"] for rna in all_cistrons]
        cistron_data["gene_id"] = gene_id
        cistron_data["length"] = cistron_lengths
        cistron_data["replication_coordinate"] = replication_coordinate
        cistron_data["is_forward"] = is_forward
        cistron_data["mw"] = mws
        cistron_data["deg_rate"] = cistron_deg_rates
        cistron_data["is_mRNA"] = is_mRNA
        cistron_data["is_miscRNA"] = is_miscRNA
        cistron_data["is_rRNA"] = is_rRNA
        cistron_data["is_tRNA"] = is_tRNA
        cistron_data["is_23S_rRNA"] = is_23S
        cistron_data["is_16S_rRNA"] = is_16S
        cistron_data["is_5S_rRNA"] = is_5S
        cistron_data["is_ribosomal_protein"] = is_ribosomal_protein
        cistron_data["is_RNAP"] = is_RNAP
        cistron_data["uses_corrected_seq_counts"] = np.zeros(n_cistrons, dtype=bool)
        cistron_data["is_new_gene"] = [k.startswith("NG") for k in gene_id]

        cistron_field_units = {
            "id": None,
            "gene_id": None,
            "length": units.nt,
            "replication_coordinate": None,
            "is_forward": None,
            "mw": units.g / units.mol,
            "deg_rate": 1 / units.s,
            "is_mRNA": None,
            "is_miscRNA": None,
            "is_rRNA": None,
            "is_tRNA": None,
            "is_23S_rRNA": None,
            "is_16S_rRNA": None,
            "is_5S_rRNA": None,
            "is_ribosomal_protein": None,
            "is_RNAP": None,
            "uses_corrected_seq_counts": None,
            "is_new_gene": None,
        }

        self.cistron_data = UnitStructArray(cistron_data, cistron_field_units)
        self._cistron_id_to_index = {
            cistron_id: i for (i, cistron_id) in enumerate(self.cistron_data["id"])
        }

        # Load expression levels of individual cistrons from sequencing data
        cistron_expression = []
        cistron_id_to_gene_id = {
            gene["rna_ids"][0]: gene["id"] for gene in raw_data.genes
        }
        seq_data = {
            x["Gene"]: x[sim_data.basal_expression_condition]
            for x in getattr(raw_data.rna_seq_data, f"rnaseq_{RNA_SEQ_ANALYSIS}_mean")
        }

        cistron_rnaseq_coverage = []
        for cistron_id in self.cistron_data["id"]:
            gene_id = cistron_id_to_gene_id[cistron_id]
            # If sequencing data is not found, initialize expression to zero.
            cistron_expression.append(seq_data.get(gene_id, 0.0))
            cistron_rnaseq_coverage.append(gene_id in seq_data)

        cistron_expression = np.array(cistron_expression)
        self._cistron_is_rnaseq_covered = np.array(cistron_rnaseq_coverage)

        # Set basal expression levels of each cistron - conditional values are
        # set in the parca.
        self.cistron_expression = {}
        self.cistron_expression["basal"] = cistron_expression / cistron_expression.sum()

        # Initialize dictionary for fitted cistron expression levels. Values for
        # this dictionary are set in the parca.
        self.fit_cistron_expression = {}



[docs]
    def _build_rna_data(self, raw_data, sim_data):
        """
        Build RNA-associated simulation data from raw data.
        """
        self._basal_rna_fractions = sim_data.mass.get_basal_rna_fractions()

        # Get list of transcription units used by the model
        all_valid_tus = [
            tu
            for tu in raw_data.transcription_units
            if sim_data.getter.is_valid_molecule(tu["id"])
        ]

        # Get mapping from transcription unit IDs to list of constituent
        # cistrons
        gene_id_to_rna_id = {gene["id"]: gene["rna_ids"][0] for gene in raw_data.genes}
        tu_id_to_cistron_ids = {
            tu["id"]: [
                gene_id_to_rna_id[gene]
                for gene in tu["genes"]
                if gene_id_to_rna_id[gene] in self.cistron_data["id"]
            ]
            for tu in all_valid_tus
        }

        # Get list of cistrons that are covered by one or more TUs
        cistrons_covered_by_tus = []
        for cistrons in tu_id_to_cistron_ids.values():
            cistrons_covered_by_tus.extend(cistrons)
        cistrons_covered_by_tus = set(cistrons_covered_by_tus)

        # Compile IDs of all RNAs that should be directly transcribed in the
        # model, including RNAs for genes that are not covered by any
        # transcription units
        rna_ids = [tu["id"] for tu in all_valid_tus]
        rna_ids.extend(
            [
                cistron_id
                for cistron_id in self.cistron_data["id"]
                if sim_data.getter.is_valid_molecule(cistron_id)
                and cistron_id not in cistrons_covered_by_tus
            ]
        )
        n_rnas = len(rna_ids)

        # Build mapping matrix between transcription units and constituent
        # cistrons
        cistron_indexes = []
        rna_indexes = []
        v = []

        # Mapping from cistron ID to index
        cistron_id_to_index = {
            cistron_id: cistron_index
            for (cistron_index, cistron_id) in enumerate(self.cistron_data["id"])
        }

        for rna_index, rna_id in enumerate(rna_ids):
            if rna_id in tu_id_to_cistron_ids:
                for mc_rna_id in tu_id_to_cistron_ids[rna_id]:
                    cistron_indexes.append(cistron_id_to_index[mc_rna_id])
                    rna_indexes.append(rna_index)
                    v.append(1)
            else:
                cistron_indexes.append(cistron_id_to_index[rna_id])
                rna_indexes.append(rna_index)
                v.append(1)

        cistron_indexes = np.array(cistron_indexes)
        rna_indexes = np.array(rna_indexes)
        v = np.array(v)
        shape = (cistron_indexes.max() + 1, rna_indexes.max() + 1)

        # Build sparse mapping matrix
        self.cistron_tu_mapping_matrix = csr_matrix(
            (v, (cistron_indexes, rna_indexes)), shape=shape
        )

        # Find groups of cistrons and TUs that belong to the same operons.
        # self.operons is a list of tuples that each specify an operon with
        # a list of cistron indexes and a list of RNA indexes
        # (e.g. ([380, 379, 377], [2356, 2433]))
        visited_cistron_indexes = set()
        visited_rna_indexes = set()
        self.operons = []

        def rna_DFS(rna_index, operon_cistron_indexes, operon_rna_indexes):
            """
            Recursive function to look for indexes of RNAs (transcription units)
            and cistrons that belong to the same operon as the RNA with the
            given index.
            """
            visited_rna_indexes.add(rna_index)
            operon_rna_indexes.append(rna_index)

            for i in cistron_indexes[rna_indexes == rna_index]:
                if i not in visited_cistron_indexes:
                    cistron_DFS(i, operon_cistron_indexes, operon_rna_indexes)

        def cistron_DFS(cistron_index, operon_cistron_indexes, operon_rna_indexes):
            """
            Recursive function to look for indexes of RNAs (transcription units)
            and cistrons that belong to the same operon as the cistron with the
            given index.
            """
            visited_cistron_indexes.add(cistron_index)
            operon_cistron_indexes.append(cistron_index)

            for i in rna_indexes[cistron_indexes == cistron_index]:
                if i not in visited_rna_indexes:
                    rna_DFS(i, operon_cistron_indexes, operon_rna_indexes)

        # Loop through each RNA index
        for rna_index in range(n_rnas):
            # Search for cistrons and RNAs that can be grouped together into the
            # same operon
            if rna_index not in visited_rna_indexes:
                operon_cistron_indexes = []
                operon_rna_indexes = []
                rna_DFS(rna_index, operon_cistron_indexes, operon_rna_indexes)

                # Sort cistron indexes by coordinates
                operon_cistron_indexes = sorted(
                    operon_cistron_indexes,
                    key=lambda i: self.cistron_data["replication_coordinate"][i],
                )

                self.operons.append((operon_cistron_indexes, operon_rna_indexes))

        # Build list of all RNA IDs with compartment tags
        compartments = sim_data.getter.get_compartments(rna_ids)
        rna_ids_with_compartments = [
            f"{rna_id}[{loc[0]}]" for (rna_id, loc) in zip(rna_ids, compartments)
        ]

        # Apply RNAseq corrections to shorter genes if required by operon version
        if sim_data.operons_on:
            self._apply_rnaseq_correction()

        expression, _ = self.fit_rna_expression(self.cistron_expression["basal"])

        # TODO (Albert): should modify more when other types of hybrid RNAs
        #  are introduced (only rRNA-tRNA hybrids are included currently)
        # Determine type of each RNA
        is_mRNA = (
            self.cistron_data["is_mRNA"] @ self.cistron_tu_mapping_matrix
        ).astype(bool)
        is_miscRNA = (
            self.cistron_data["is_miscRNA"] @ self.cistron_tu_mapping_matrix
        ).astype(bool)
        is_rRNA = (
            self.cistron_data["is_rRNA"] @ self.cistron_tu_mapping_matrix
        ).astype(bool)
        # All hybrid TUs containing both rRNAs and tRNAs are assumed to be rRNAs
        includes_tRNA = (
            self.cistron_data["is_tRNA"] @ self.cistron_tu_mapping_matrix
        ).astype(bool)
        is_tRNA = np.logical_and(includes_tRNA, ~is_rRNA)
        is_rtRNA = np.logical_or(is_rRNA, is_tRNA)

        # Get boolean array for unprocessed rRNA/tRNA molecules
        mature_cistron_ids = set(
            self.cistron_data["id"][
                self.cistron_data["is_tRNA"] | self.cistron_data["is_rRNA"]
            ]
        )
        is_mature_rtRNA = np.array([rna_id in mature_cistron_ids for rna_id in rna_ids])
        is_unprocessed = is_rtRNA & ~is_mature_rtRNA

        # Determine if each RNA contains cistrons that encode for special
        # components
        includes_ribosomal_protein = (
            self.cistron_data["is_ribosomal_protein"] @ self.cistron_tu_mapping_matrix
        ).astype(bool)
        includes_RNAP = (
            self.cistron_data["is_RNAP"] @ self.cistron_tu_mapping_matrix
        ).astype(bool)

        # Build the relative abundance matrix between transcription units and
        # constituent cistrons
        cistron_indexes = []
        rna_indexes = []
        v = []

        for cistron_index, cistron_id in enumerate(self.cistron_data["id"]):
            rna_indexes_this_cistron = self.cistron_id_to_rna_indexes(cistron_id)
            v_this_cistron = np.zeros(len(rna_indexes_this_cistron))
            for i, rna_index in enumerate(rna_indexes_this_cistron):
                cistron_indexes.append(cistron_index)
                rna_indexes.append(rna_index)
                v_this_cistron[i] = expression[rna_index]

            # Assume uniform distribution if cistron is not expressed
            if v_this_cistron.sum() == 0:
                v_this_cistron[:] = 1.0 / len(v_this_cistron)
            else:
                v_this_cistron = v_this_cistron / v_this_cistron.sum()

            v.extend(v_this_cistron)

        cistron_indexes = np.array(cistron_indexes)
        rna_indexes = np.array(rna_indexes)
        v = np.array(v)
        shape = (cistron_indexes.max() + 1, rna_indexes.max() + 1)

        cistron_tu_relative_abundancy_matrix = csr_matrix(
            (v, (cistron_indexes, rna_indexes)), shape=shape
        )

        rna_deg_rates = np.zeros(n_rnas)
        # If a measured half-life for the transcription unit exists, use the
        # measured value to calculate the degradation rate
        rna_id_to_index = {
            rna_id: cistron_index for (cistron_index, rna_id) in enumerate(rna_ids)
        }
        for rna in raw_data.rna_half_lives:
            if rna["id"] in rna_id_to_index and rna["id"] not in cistron_id_to_index:
                rna_deg_rates[rna_id_to_index[rna["id"]]] = np.log(2) / rna[
                    "half_life"
                ].asNumber(units.s)

        # Get mask for RNAs with measured deg rates
        mask_measured_deg_rate = rna_deg_rates > 0

        # Set the minimum possible degradation rates of mRNAs to the minimum of
        # the measured degradation rates of mRNA cistrons
        min_deg_rates = np.zeros(n_rnas)
        cistron_deg_rates = self.cistron_data["deg_rate"].asNumber(1 / units.s)
        mRNA_cistron_deg_rates = cistron_deg_rates[self.cistron_data["is_mRNA"]]
        min_deg_rates[is_mRNA] = mRNA_cistron_deg_rates.min()
        min_deg_rates = min_deg_rates[~mask_measured_deg_rate]

        # Solve NNLS for unmeasured degredation rates, using the fact that
        # A(x + m) = b is equivalent to Ax = b - Am
        abundancy_no_measurements = cistron_tu_relative_abundancy_matrix[
            :, ~mask_measured_deg_rate
        ]
        abundancy_with_measurements = cistron_tu_relative_abundancy_matrix[
            :, mask_measured_deg_rate
        ]
        deg_rates_from_min_rates = abundancy_no_measurements.dot(min_deg_rates)
        deg_rates_from_measured_rates = abundancy_with_measurements.dot(
            rna_deg_rates[mask_measured_deg_rate]
        )
        rna_deg_rates_estimated_minus_min, _ = fast_nnls(
            abundancy_no_measurements,
            cistron_deg_rates
            - deg_rates_from_measured_rates
            - deg_rates_from_min_rates,
        )

        rna_deg_rates[~mask_measured_deg_rate] = (
            rna_deg_rates_estimated_minus_min + min_deg_rates
        )

        # Clip mRNA degradation rates that are higher than the maximum measured
        # degradation rate
        max_mRNA_deg_rate = mRNA_cistron_deg_rates.max()
        rna_deg_rates[np.logical_and(is_mRNA, rna_deg_rates > max_mRNA_deg_rate)] = (
            max_mRNA_deg_rate
        )

        # Set degradation rates of rRNAs and tRNAs from the stable RNA half life
        # value defined in sim_data.constants
        rna_deg_rates[is_rtRNA] = np.log(
            2
        ) / sim_data.constants.stable_RNA_half_life.asNumber(units.s)

        # Calculate synthesis probabilities from expression and normalize
        synth_prob = expression * (
            np.log(2) / sim_data.doubling_time.asNumber(units.s) + rna_deg_rates
        )
        synth_prob /= synth_prob.sum()

        # Load RNA sequences and molecular weights from getter functions
        rna_seqs = sim_data.getter.get_sequences(rna_ids)
        mws = sim_data.getter.get_masses(rna_ids).asNumber(units.g / units.mol)

        # Calculate the masses of component rRNA and tRNA cistrons of each RNA
        rRNA_cistron_mws = self.cistron_data["mw"].asNumber(units.g / units.mol)
        rRNA_cistron_mws[~self.cistron_data["is_rRNA"]] = 0.0
        tRNA_cistron_mws = self.cistron_data["mw"].asNumber(units.g / units.mol)
        tRNA_cistron_mws[~self.cistron_data["is_tRNA"]] = 0.0

        rRNA_mws = self.cistron_tu_mapping_matrix.T.dot(rRNA_cistron_mws)
        tRNA_mws = self.cistron_tu_mapping_matrix.T.dot(tRNA_cistron_mws)

        # Calculate lengths and nt counts from sequence
        rna_lengths = np.array([len(seq) for seq in rna_seqs])

        # Get RNA nucleotide compositions
        ntp_abbreviations = [ntp_id[0] for ntp_id in sim_data.molecule_groups.ntps]
        nt_counts = []
        for seq in rna_seqs:
            nt_counts.append([seq.count(letter) for letter in ntp_abbreviations])
        nt_counts = np.array(nt_counts)

        # Get mapping from cistron IDs to coordinate and direction
        rna_id_to_coordinate = {}
        rna_id_to_direction = {}
        for gene in raw_data.genes:
            rna_id_to_direction[gene["rna_ids"][0]] = gene["direction"]
            if gene["direction"] == "+":
                rna_id_to_coordinate[gene["rna_ids"][0]] = gene["left_end_pos"]
            else:
                rna_id_to_coordinate[gene["rna_ids"][0]] = gene["right_end_pos"]

        # Further extend the dictionaries to include mappings from transcription
        # unit IDs to coordinate and direction
        for tu in all_valid_tus:
            rna_id_to_direction[tu["id"]] = tu["direction"]
            if tu["direction"] == "+":
                rna_id_to_coordinate[tu["id"]] = tu["left_end_pos"]
            else:
                rna_id_to_coordinate[tu["id"]] = tu["right_end_pos"]

        # Get mapping from cistron IDs to lengths
        cistron_id_to_length = {
            cistron["id"]: cistron["length"] for cistron in self.cistron_data
        }

        # Get location of transcription initiation relative to origin and the
        # transcription direction for each transcription unit
        replication_coordinate = [
            self._get_relative_coordinates(rna_id_to_coordinate[rna_id])
            for rna_id in rna_ids
        ]
        is_forward = [rna_id_to_direction[rna_id] == "+" for rna_id in rna_ids]

        # Calculate relative start and end positions of each cistron within each
        # transcription unit
        all_cistron_ids = self.cistron_data["id"]
        self.cistron_start_end_pos_in_tu = {}

        for rna_idx, rna_id in enumerate(rna_ids):
            rna_coordinate = rna_id_to_coordinate[rna_id]
            constituent_cistron_indexes = self.cistron_tu_mapping_matrix.getcol(
                rna_idx
            ).nonzero()[0]

            for cistron_idx in constituent_cistron_indexes:
                cistron_id = all_cistron_ids[cistron_idx]
                cistron_coordinate = rna_id_to_coordinate[cistron_id]

                start_pos = np.abs(cistron_coordinate - rna_coordinate)
                end_pos = start_pos + cistron_id_to_length[cistron_id] - 1

                # End position should stay within length of entire RNA
                assert end_pos < rna_lengths[rna_idx]

                # Key: (index of cistron, index of RNA)
                # Value: (start position, end position)
                self.cistron_start_end_pos_in_tu[(cistron_idx, rna_idx)] = (
                    start_pos,
                    end_pos,
                )

        max_rna_id_length = max(len(id_) for id_ in rna_ids_with_compartments)

        # Get evidence codes for each transcription unit from raw data
        self.rna_id_to_evidence_codes = {
            tu["id"]: sorted(tu["evidence"]) for tu in raw_data.transcription_units
        }

        rna_data = np.zeros(
            n_rnas,
            dtype=[
                ("id", "U{}".format(max_rna_id_length)),
                ("deg_rate", "f8"),
                ("deg_rate_is_measured", "bool"),
                ("length", "i8"),
                ("counts_ACGU", "4i8"),
                ("mw", "f8"),
                ("rRNA_mw", "f8"),
                ("tRNA_mw", "f8"),
                ("Km_endoRNase", "f8"),
                ("replication_coordinate", "int64"),
                ("wt_replication_coordinate", "int64"),
                ("is_forward", "bool"),
                ("is_mRNA", "bool"),
                ("is_miscRNA", "bool"),
                ("is_rRNA", "bool"),
                ("is_tRNA", "bool"),
                ("includes_tRNA", "bool"),
                ("is_unprocessed", "bool"),
                ("includes_ribosomal_protein", "bool"),
                ("includes_RNAP", "bool"),
            ],
        )

        rna_data["id"] = rna_ids_with_compartments
        rna_data["deg_rate"] = rna_deg_rates
        rna_data["deg_rate_is_measured"] = mask_measured_deg_rate
        rna_data["length"] = rna_lengths
        rna_data["counts_ACGU"] = nt_counts
        rna_data["mw"] = mws
        rna_data["rRNA_mw"] = rRNA_mws
        rna_data["tRNA_mw"] = tRNA_mws
        rna_data["Km_endoRNase"] = np.zeros(
            len(rna_ids_with_compartments)
        )  # Set later in ParCa
        rna_data["replication_coordinate"] = replication_coordinate
        rna_data["wt_replication_coordinate"] = replication_coordinate
        rna_data["is_forward"] = is_forward
        rna_data["is_mRNA"] = is_mRNA
        rna_data["is_miscRNA"] = is_miscRNA
        rna_data["is_rRNA"] = is_rRNA
        rna_data["is_tRNA"] = is_tRNA
        rna_data["includes_tRNA"] = includes_tRNA
        rna_data["is_unprocessed"] = is_unprocessed
        rna_data["includes_ribosomal_protein"] = includes_ribosomal_protein
        rna_data["includes_RNAP"] = includes_RNAP

        field_units = {
            "id": None,
            "deg_rate": 1 / units.s,
            "deg_rate_is_measured": None,
            "length": units.nt,
            "counts_ACGU": units.nt,
            "mw": units.g / units.mol,
            "rRNA_mw": units.g / units.mol,
            "tRNA_mw": units.g / units.mol,
            "Km_endoRNase": units.mol / units.L,
            "replication_coordinate": None,
            "wt_replication_coordinate": None,
            "is_forward": None,
            "is_mRNA": None,
            "is_miscRNA": None,
            "is_rRNA": None,
            "is_tRNA": None,
            "includes_tRNA": None,
            "is_unprocessed": None,
            "includes_ribosomal_protein": None,
            "includes_RNAP": None,
        }

        self.rna_data = UnitStructArray(rna_data, field_units)
        self._rna_id_to_index = {
            rna_id: cistron_index
            for (cistron_index, rna_id) in enumerate(self.rna_data["id"])
        }

        # Set basal expression and synthesis probabilities - conditional values
        # are set in the parca.
        self.rna_expression = {}
        self.rna_synth_prob = {}
        self.rna_expression["basal"] = expression / expression.sum()
        self.rna_synth_prob["basal"] = synth_prob / synth_prob.sum()



[docs]
    def cistron_id_to_rna_indexes(self, cistron_id):
        """
        Returns the indexes of transcription units containing the given RNA
        cistron given the ID of the cistron.
        """
        return self.cistron_tu_mapping_matrix.getrow(
            self._cistron_id_to_index[cistron_id]
        ).nonzero()[1]



[docs]
    @cache
    def rna_id_to_cistron_indexes(self, rna_id):
        """
        Returns the indexes of cistrons that constitute the given transcription
        unit given the ID of the RNA transcription unit.
        """
        return self.cistron_tu_mapping_matrix.getcol(
            self._rna_id_to_index[rna_id]
        ).nonzero()[0]



[docs]
    def fit_rna_expression(self, cistron_expression):
        """
        Calculates the expression of RNA transcription units that best fits the
        given expression levels of cistrons using nonnegative least squares.
        """
        rna_exp, res = fast_nnls(self.cistron_tu_mapping_matrix, cistron_expression)
        return rna_exp, res



[docs]
    def fit_trna_expression(self, tRNA_cistron_expression):
        """
        Calculates the expression of tRNA transcription units that best fits the
        given expression levels of tRNA cistrons using nonnegative least
        squares.
        """
        tRNA_exp, res = fast_nnls(
            self.tRNA_cistron_tu_mapping_matrix, tRNA_cistron_expression
        )
        return tRNA_exp, res



[docs]
    def _get_relative_coordinates(self, coordinates):
        """
        Returns the genomic coordinates of a given gene coordinate relative
        to the origin of replication.
        """
        if coordinates < self._terc_coordinate:
            relative_coordinates = (
                self._genome_length - self._oric_coordinate + coordinates
            )
        elif coordinates < self._oric_coordinate:
            relative_coordinates = coordinates - self._oric_coordinate + 1
        else:
            relative_coordinates = coordinates - self._oric_coordinate

        return relative_coordinates



[docs]
    def _apply_rnaseq_correction(self):
        """
        Applies correction to RNAseq data for shorter genes as required when
        operon structure is included in the model.
        """
        cistron_expression = self.cistron_expression["basal"].copy()
        zero_exp_mask = cistron_expression == 0

        # Find minimum length of cistron with nonzero expression
        cistron_lengths = self.cistron_data["length"].asNumber(units.nt)
        length_threshold = cistron_lengths[~zero_exp_mask].min()

        # Get mask for cistrons that are mRNAs, shorter than the threshold
        # length, covered by RNAseq data, and with zero expression
        correction_mask = np.logical_and.reduce(
            (
                self.cistron_data["is_mRNA"],
                zero_exp_mask,
                cistron_lengths < length_threshold,
                self._cistron_is_rnaseq_covered,
            )
        )
        corrected_indexes = []

        for cistron_index in np.where(correction_mask)[0]:
            # Get indexes of cistrons in the same operon
            cistrons_in_operon = None
            rnas_in_operon = None
            for operon in self.operons:
                if cistron_index in operon[0]:
                    cistrons_in_operon = operon[0]
                    rnas_in_operon = operon[1]
                    break
            assert cistrons_in_operon is not None

            # Skip monocistronic operons
            if len(cistrons_in_operon) == 1:
                continue

            # Get cistron-TU mapping matrix for this operon
            mapping_matrix_this_operon = self.cistron_tu_mapping_matrix[
                cistrons_in_operon, :
            ][:, rnas_in_operon].toarray()

            # Remove given gene from operon and run NNLS
            pos_in_operon = cistrons_in_operon.index(cistron_index)
            mapping_matrix_gene_removed = mapping_matrix_this_operon.copy()
            mapping_matrix_gene_removed[pos_in_operon, :] = 0

            rna_exp, _ = fast_nnls(
                mapping_matrix_gene_removed, cistron_expression[cistrons_in_operon]
            )

            # Use solution to get expected expression for given gene
            exp = mapping_matrix_this_operon.dot(rna_exp)[pos_in_operon]
            cistron_expression[cistron_index] = exp
            corrected_indexes.append(cistron_index)

        # Reset cistron_expression to new values
        self.cistron_expression["basal"] = cistron_expression / cistron_expression.sum()

        # Keep record of cistrons whose expression was corrected
        self.cistron_data["uses_corrected_seq_counts"][np.array(corrected_indexes)] = (
            True
        )



[docs]
    def _build_mature_rna_data(self, raw_data, sim_data):
        """
        Build mature RNA-associated simulation data from raw data.
        """
        unprocessed_rna_indexes = np.where(self.rna_data["is_unprocessed"])[0]

        # Get IDs of all mature RNAs that are derived from unprocessed RNAs
        mature_rna_cistron_indexes = np.unique(
            self.cistron_tu_mapping_matrix[:, unprocessed_rna_indexes].nonzero()[0]
        )
        mature_rna_ids = self.cistron_data["id"][mature_rna_cistron_indexes]
        n_mature_rnas = len(mature_rna_ids)
        compartments = sim_data.getter.get_compartments(mature_rna_ids)
        mature_rna_ids_with_compartments = [
            f"{rna_id}[{loc[0]}]" for (rna_id, loc) in zip(mature_rna_ids, compartments)
        ]

        # Get stoichiometric matrix for RNA maturation process
        self.rna_maturation_stoich_matrix = self.cistron_tu_mapping_matrix[
            :, unprocessed_rna_indexes
        ][mature_rna_cistron_indexes, :]

        # Build matrix of the end positions of each mature RNA within each
        # unprocessed RNA
        self.mature_rna_end_positions = np.zeros(
            self.rna_maturation_stoich_matrix.shape
        )
        (rows, columns) = self.rna_maturation_stoich_matrix.nonzero()

        for i, j in zip(rows, columns):
            self.mature_rna_end_positions[i, j] = self.cistron_start_end_pos_in_tu[
                (mature_rna_cistron_indexes[i], unprocessed_rna_indexes[j])
            ][1]

        # Get mapping matrix from mature RNA to processing enzymes that are
        # needed to get the mature forms
        mature_rna_to_enzyme_list = {
            row["rna_id"]: row["enzymes"] for row in raw_data.rna_maturation_enzymes
        }
        mature_rna_id_to_index = {
            rna_id: i for (i, rna_id) in enumerate(mature_rna_ids)
        }

        all_enzymes = []
        mature_rna_indexes = []
        enzyme_indexes = []

        for rna_id, enzyme_list in mature_rna_to_enzyme_list.items():
            # Skip if RNA is not a mature RNA
            try:
                mature_rna_index = mature_rna_id_to_index[rna_id]
            except KeyError:
                continue

            for enzyme in enzyme_list:
                mature_rna_indexes.append(mature_rna_index)
                if enzyme in all_enzymes:
                    enzyme_indexes.append(all_enzymes.index(enzyme))
                else:
                    enzyme_indexes.append(len(all_enzymes))
                    all_enzymes.append(enzyme)

        mature_rna_indexes = np.array(mature_rna_indexes, dtype=int)
        enzyme_indexes = np.array(enzyme_indexes, dtype=int)
        mature_rna_to_enzyme_mapping_matrix = np.zeros(
            (n_mature_rnas, len(all_enzymes)), dtype=bool
        )
        mature_rna_to_enzyme_mapping_matrix[mature_rna_indexes, enzyme_indexes] = True

        # Convert to mapping matrix between unprocessed RNAs and enzymes
        self.rna_maturation_enzyme_matrix = self.rna_maturation_stoich_matrix.T.dot(
            mature_rna_to_enzyme_mapping_matrix
        ).astype(bool)
        enzyme_compartments = sim_data.getter.get_compartments(all_enzymes)
        self.rna_maturation_enzymes = [
            f"{enzyme_id}[{loc[0]}]"
            for (enzyme_id, loc) in zip(all_enzymes, enzyme_compartments)
        ]

        # Get mapping matrix between rtRNA cistrons and TUs
        rRNA_indexes = np.where(self.rna_data["is_rRNA"])[0]
        rRNA_cistron_indexes = np.where(self.cistron_data["is_rRNA"])[0]
        self.rRNA_cistron_tu_mapping_matrix = self.cistron_tu_mapping_matrix[
            :, rRNA_indexes
        ][rRNA_cistron_indexes, :]

        tRNA_indexes = np.where(self.rna_data["includes_tRNA"])[0]
        tRNA_cistron_indexes = np.where(self.cistron_data["is_tRNA"])[0]
        self.tRNA_cistron_tu_mapping_matrix = self.cistron_tu_mapping_matrix[
            :, tRNA_indexes
        ][tRNA_cistron_indexes, :]

        # Get RNA nucleotide compositions of unprocessed and processed RNAs
        unprocessed_rna_nt_counts = self.rna_data["counts_ACGU"][
            unprocessed_rna_indexes
        ].asNumber(units.nt)

        mature_rna_seqs = sim_data.getter.get_sequences(mature_rna_ids)
        lengths = np.array([len(seq) for seq in mature_rna_seqs])
        ntp_abbreviations = [ntp_id[0] for ntp_id in sim_data.molecule_groups.ntps]

        mature_rna_nt_counts = np.zeros((0, 4))
        for seq in mature_rna_seqs:
            mature_rna_nt_counts = np.vstack(
                (
                    mature_rna_nt_counts,
                    np.array([seq.count(letter) for letter in ntp_abbreviations]),
                )
            )

        # Calculate number of nucleotides that are degraded as part of the
        # maturation process for each unprocessed RNA
        degraded_nt_counts = unprocessed_rna_nt_counts.copy()
        rows, cols = self.rna_maturation_stoich_matrix.nonzero()

        for i, j in zip(rows, cols):
            degraded_nt_counts[j, :] -= mature_rna_nt_counts[i, :]

        assert np.all(degraded_nt_counts >= 0)
        self.rna_maturation_degraded_nt_counts = degraded_nt_counts

        # Get identities of each stable RNA
        is_rRNA = self.cistron_data["is_rRNA"][mature_rna_cistron_indexes]
        is_tRNA = self.cistron_data["is_tRNA"][mature_rna_cistron_indexes]
        is_23S_rRNA = self.cistron_data["is_23S_rRNA"][mature_rna_cistron_indexes]
        is_16S_rRNA = self.cistron_data["is_16S_rRNA"][mature_rna_cistron_indexes]
        is_5S_rRNA = self.cistron_data["is_5S_rRNA"][mature_rna_cistron_indexes]

        rna_deg_rates = np.zeros(n_mature_rnas)
        if sim_data.stable_rrna:
            # If stable rRNA option is on, set degradation rates of mature rRNAs
            # to the values calculated from the half-life in sim_data.constants
            rna_deg_rates[is_rRNA] = np.log(
                2
            ) / sim_data.constants.stable_RNA_half_life.asNumber(units.s)
        else:
            # Default: Set degradation rates of mature rRNAs to the average
            # reported degradation rates of mRNAs
            # Note: rRNAs complexed into ribosomal subunits will not degrade, so
            # this will only significantly affect excess rRNAs
            rna_deg_rates[is_rRNA] = np.log(
                2
            ) / self.average_mRNA_cistron_half_life.asNumber(units.s)

        # Set degradation rates of tRNAs to the values calculated from the
        # half-life in sim_data.constants
        rna_deg_rates[is_tRNA] = np.log(
            2
        ) / sim_data.constants.stable_RNA_half_life.asNumber(units.s)

        # Get MWs of mature RNA molecules
        mws = sim_data.getter.get_masses(mature_rna_ids).asNumber(units.g / units.mol)

        if n_mature_rnas > 0:
            max_rna_id_length = max(
                len(id_) for id_ in mature_rna_ids_with_compartments
            )
        else:
            max_rna_id_length = 1

        mature_rna_data = np.zeros(
            n_mature_rnas,
            dtype=[
                ("id", "U{}".format(max_rna_id_length)),
                ("deg_rate", "f8"),
                ("length", "i8"),
                ("counts_ACGU", "4i8"),
                ("mw", "f8"),
                ("Km_endoRNase", "f8"),
                ("is_rRNA", "bool"),
                ("is_tRNA", "bool"),
                ("is_23S_rRNA", "bool"),
                ("is_16S_rRNA", "bool"),
                ("is_5S_rRNA", "bool"),
            ],
        )

        mature_rna_data["id"] = mature_rna_ids_with_compartments
        mature_rna_data["deg_rate"] = rna_deg_rates
        mature_rna_data["length"] = lengths
        mature_rna_data["counts_ACGU"] = mature_rna_nt_counts
        mature_rna_data["mw"] = mws
        mature_rna_data["Km_endoRNase"] = np.zeros(
            len(mature_rna_ids_with_compartments)
        )  # Set later in ParCa
        mature_rna_data["is_rRNA"] = is_rRNA
        mature_rna_data["is_tRNA"] = is_tRNA
        mature_rna_data["is_23S_rRNA"] = is_23S_rRNA
        mature_rna_data["is_16S_rRNA"] = is_16S_rRNA
        mature_rna_data["is_5S_rRNA"] = is_5S_rRNA

        field_units = {
            "id": None,
            "deg_rate": 1 / units.s,
            "length": units.nt,
            "counts_ACGU": units.nt,
            "mw": units.g / units.mol,
            "Km_endoRNase": units.mol / units.L,
            "is_rRNA": None,
            "is_tRNA": None,
            "is_23S_rRNA": None,
            "is_16S_rRNA": None,
            "is_5S_rRNA": None,
        }

        self.mature_rna_data = UnitStructArray(mature_rna_data, field_units)



[docs]
    def _build_transcription(self, raw_data, sim_data):
        """
        Build transcription-associated simulation data from raw data.
        """
        # Load sequence data
        rna_seqs = sim_data.getter.get_sequences(
            [rna_id[:-3] for rna_id in self.rna_data["id"]]
        )

        # Construct transcription sequence matrix
        maxLen = np.int64(
            self.rna_data["length"].asNumber().max()
            # Add buffer to account for elongation by max timestep
            + MAX_TIMESTEP
            * sim_data.constants.RNAP_elongation_rate_for_stable_RNA.asNumber(
                units.nt / units.s
            )
        )

        self.transcription_sequences = np.full(
            (len(rna_seqs), maxLen), polymerize.PAD_VALUE, dtype=np.int8
        )
        ntMapping = {ntpId: i for i, ntpId in enumerate(["A", "C", "G", "U"])}
        for i, sequence in enumerate(rna_seqs):
            for j, letter in enumerate(sequence):
                self.transcription_sequences[i, j] = ntMapping[letter]

        # Calculate weights of transcript nucleotide monomers
        self.transcription_monomer_weights = (
            (
                sim_data.getter.get_masses(sim_data.molecule_groups.ntps)
                - sim_data.getter.get_masses([sim_data.molecule_ids.ppi])
            )
            / sim_data.constants.n_avogadro
        ).asNumber(units.fg)

        self.transcription_end_weight = (
            sim_data.getter.get_masses([sim_data.molecule_ids.ppi])
            / sim_data.constants.n_avogadro
        ).asNumber(units.fg)

        # Load active RNAP footprint on DNA
        molecule_id_to_footprint_sizes = {
            row["molecule_id"]: row["footprint_size"]
            for row in raw_data.footprint_sizes
        }
        try:
            self.active_rnap_footprint_size = molecule_id_to_footprint_sizes[
                sim_data.molecule_ids.full_RNAP[:-3]
            ]
        except KeyError:
            raise ValueError("DNA footprint size for RNA polymerses not found.")



[docs]
    def _build_charged_trna(self, raw_data, sim_data):
        """
        Loads information and creates data structures necessary for charging of tRNA

        Note:
                Requires self.rna_data so can't be built in translation even if some
                data structures would be more appropriate there.
        """
        # Create list of charged tRNAs
        uncharged_trna_names = [
            x + "[c]" for x in self.cistron_data["id"][self.cistron_data["is_tRNA"]]
        ]

        charged_trnas = [
            x["modified_forms"]
            for x in raw_data.rnas
            if x["id"] + "[c]" in uncharged_trna_names
        ]

        filtered_charged_trna = []
        for charged_list in charged_trnas:
            for trna in charged_list:
                # Skip modified forms so only one charged tRNA per uncharged tRNA
                if "FMET" in trna or "modified" in trna:
                    continue

                assert "c" in sim_data.getter.get_compartment(trna)
                filtered_charged_trna += [trna + "[c]"]

        self.uncharged_trna_names = uncharged_trna_names
        self.charged_trna_names = filtered_charged_trna
        assert len(self.charged_trna_names) == len(self.uncharged_trna_names)

        # Create mapping of each tRNA/charged tRNA to associated AA
        trna_dict = {
            "RNA0-300[c]": "VAL",
            "RNA0-301[c]": "LYS",
            "RNA0-302[c]": "LYS",
            "RNA0-303[c]": "LYS",
            "RNA0-304[c]": "ASN",
            "RNA0-305[c]": "ILE",
            "RNA0-306[c]": "MET",
        }
        aa_names = sim_data.molecule_groups.amino_acids
        aa_indices = {aa: i for i, aa in enumerate(aa_names)}
        trna_indices = {trna: i for i, trna in enumerate(self.uncharged_trna_names)}
        self.aa_from_trna = np.zeros((len(aa_names), len(self.uncharged_trna_names)))
        for trna in self.uncharged_trna_names:
            aa = trna[:3].upper()
            if aa == "ALA":
                aa = "L-ALPHA-ALANINE"
            elif aa == "ASP":
                aa = "L-ASPARTATE"
            elif aa == "SEL":
                aa = "L-SELENOCYSTEINE"
            elif aa == "RNA":
                aa = trna_dict[trna]

            assert "c" in sim_data.getter.get_compartment(aa)
            aa += "[c]"
            if aa in aa_names:
                aa_idx = aa_indices[aa]
                trna_idx = trna_indices[trna]
                self.aa_from_trna[aa_idx, trna_idx] = 1

        # Arrays for stoichiometry and synthetase mapping matrices
        molecules = []

        # Sparse matrix representation - i, j are row/column indices and v is value
        stoich_matrix_i = []
        stoich_matrix_j = []
        stoich_matrix_v = []

        synthetase_names = []
        synthetase_mapping_aa = []
        synthetase_mapping_syn = []
        synthetase_metabolites = {}
        # Get IDs of all metabolites
        metabolite_ids = {met["id"] for met in raw_data.metabolites}

        # Create stoichiometry matrix for charging reactions
        for reaction in raw_data.trna_charging_reactions:
            # Get uncharged tRNA name for the given reaction
            trna = None
            for mol_id in reaction["stoichiometry"].keys():
                if f"{mol_id}[c]" in self.uncharged_trna_names:
                    trna = f"{mol_id}[c]"
                    break

            if trna is None:
                continue

            trna_index = trna_indices[trna]

            # Get molecule information
            aa_idx = None
            for mol_id, coeff in reaction["stoichiometry"].items():
                if mol_id in metabolite_ids:
                    molecule_name = "{}[{}]".format(
                        mol_id,
                        "c",
                        # Assume all metabolites are in cytosol
                    )
                else:
                    molecule_name = "{}[{}]".format(
                        mol_id, sim_data.getter.get_compartment(mol_id)[0]
                    )

                if molecule_name not in molecules:
                    molecules.append(molecule_name)
                    molecule_index = len(molecules) - 1
                else:
                    molecule_index = molecules.index(molecule_name)

                aa_idx = aa_indices.get(molecule_name, aa_idx)

                # Assume coefficents given as null are -1
                if coeff is None:
                    coeff = -1
                assert coeff % 1 == 0

                stoich_matrix_i.append(molecule_index)
                stoich_matrix_j.append(trna_index)
                stoich_matrix_v.append(coeff)

            assert aa_idx is not None

            # Create mapping for synthetases catalyzing charging
            for synthetase in reaction["catalyzed_by"]:
                synthetase_metabolites[synthetase] = (
                    synthetase_metabolites.get(synthetase, set())
                    | reaction["stoichiometry"].keys()
                )
                synthetase = "{}[{}]".format(
                    synthetase, sim_data.getter.get_compartment(synthetase)[0]
                )

                if synthetase not in synthetase_names:
                    synthetase_names.append(synthetase)

                synthetase_mapping_aa.append(aa_idx)
                synthetase_mapping_syn.append(synthetase_names.index(synthetase))

        # Extract KM data for amino acids and tRNA in charging reactions
        synthetase_names_without_tag = {name[:-3] for name in synthetase_names}
        aa_names_without_tag = [aa[:-3] for aa in sim_data.molecule_groups.amino_acids]
        aa_kms = {}
        trna_kms = {}
        skipped_reactions = {"RXN-16165"}  # Not correct MET charging reaction
        for row in raw_data.metabolism_kinetics:
            # Only look at data for charging reactions
            if (
                row["reactionID"] in skipped_reactions
                or row["enzymeID"] not in synthetase_names_without_tag
            ):
                continue

            for met, km in zip(row["substrateIDs"], row["kM"]):
                if met in aa_names_without_tag:
                    # Prevent data from mismatched amino acid/synthetases being used
                    if met not in synthetase_metabolites[row["enzymeID"]]:
                        continue
                    aa_kms[met] = aa_kms.get(met, []) + [km]
                elif "tRNA" in met:
                    # Exclude suspiciously high data
                    if km > 5 * sim_data.constants.Km_synthetase_uncharged_trna:
                        continue
                    aa = met.split("-")[0]
                    if aa == "ALA":
                        aa = "L-ALPHA-ALANINE"
                    elif aa == "ASP":
                        aa = "L-ASPARTATE"
                    elif aa == "Elongation":
                        aa = "MET"
                    trna_kms[aa] = trna_kms.get(aa, []) + [km]

        # Save average KM values and use the default value if no data is available
        km_units = units.umol / units.L
        average_aa_kms = []
        average_trna_kms = []
        for aa_id in aa_names_without_tag:
            average_aa_kms.append(
                np.mean(
                    aa_kms.get(aa_id, sim_data.constants.Km_synthetase_amino_acid)
                ).asNumber(km_units)
            )
            average_trna_kms.append(
                np.mean(
                    trna_kms.get(aa_id, sim_data.constants.Km_synthetase_uncharged_trna)
                ).asNumber(km_units)
            )
        self.aa_kms = km_units * np.array(average_aa_kms)
        self.trna_kms = km_units * np.array(average_trna_kms)

        # Save matrices and related lists of names
        self._stoich_matrix_i = np.array(stoich_matrix_i)
        self._stoich_matrix_j = np.array(stoich_matrix_j)
        self._stoich_matrix_v = np.array(stoich_matrix_v)

        self.aa_from_synthetase = np.zeros((len(aa_names), len(synthetase_names)))
        self.aa_from_synthetase[synthetase_mapping_aa, synthetase_mapping_syn] = 1

        self.synthetase_names = synthetase_names
        self.charging_molecules = molecules



[docs]
    def charging_stoich_matrix(self):
        """
        Creates stoich matrix from i, j, v arrays

        Returns 2D array with rows of metabolites for each tRNA charging reaction on the column
        """
        shape = (self._stoich_matrix_i.max() + 1, self._stoich_matrix_j.max() + 1)

        out = np.zeros(shape, np.float64)
        out[self._stoich_matrix_i, self._stoich_matrix_j] = self._stoich_matrix_v

        return out



[docs]
    def _build_attenuation(self, raw_data, sim_data):
        """
        Load fold changes related to transcriptional attenuation.
        """
        # Load data from file
        aa_rna_pair_to_log_fcs = {}
        gene_symbol_to_cistron_id = {
            g["symbol"]: g["rna_ids"][0] for g in raw_data.genes
        }
        for row in raw_data.transcriptional_attenuation:
            trna_aa = row["tRNA"].split("-")[1].upper() + "[c]"
            gene = row["Target"]
            cistron_id = gene_symbol_to_cistron_id[gene]

            # Map FC to all RNAs that cover the cistron
            rna_indexes_with_cistron = self.cistron_id_to_rna_indexes(cistron_id)
            for rna_idx in rna_indexes_with_cistron:
                rna_id = self.rna_data["id"][rna_idx]
                if (trna_aa, self.rna_data["id"][rna_idx]) in aa_rna_pair_to_log_fcs:
                    aa_rna_pair_to_log_fcs[(trna_aa, rna_id)].append(row["log2 FC"])
                else:
                    aa_rna_pair_to_log_fcs[(trna_aa, rna_id)] = [row["log2 FC"]]

        aa_trnas = []
        attenuated_rnas = []
        fold_changes = []

        for (trna_aa, rna_id), all_log_fcs in aa_rna_pair_to_log_fcs.items():
            aa_trnas.append(trna_aa)
            attenuated_rnas.append(rna_id)

            # Take the average of the reported FCs of each constituent cistron
            fold_changes.append(2 ** np.mean(all_log_fcs))

        self.attenuated_rna_ids = np.unique(attenuated_rnas)

        # Convert data to matrix mapping tRNA to genes with a fold change
        trna_to_row = {t: i for i, t in enumerate(sim_data.molecule_groups.amino_acids)}
        rna_to_col = {r: i for i, r in enumerate(self.attenuated_rna_ids)}
        n_aas = len(sim_data.molecule_groups.amino_acids)
        n_rnas = len(self.attenuated_rna_ids)
        self._attenuation_rna_fold_changes = np.ones((n_aas, n_rnas))
        for trna, cistron_id, fc in zip(aa_trnas, attenuated_rnas, fold_changes):
            i = trna_to_row[trna]
            j = rna_to_col[cistron_id]
            self._attenuation_rna_fold_changes[i, j] = fc

        # Attenuated cistron index mapping
        self.attenuated_rna_indices = np.array(
            [self._rna_id_to_index[r] for r in self.attenuated_rna_ids]
        )

        # Specify location in gene where attenuation will occur
        # Currently just assumes before a transcript begins elongation (position < 1)
        # TODO: base this on specific locations for each gene
        locations = np.ones(len(self.attenuated_rna_indices))
        self.attenuation_location = {
            idx: loc for idx, loc in zip(self.attenuated_rna_indices, locations)
        }



[docs]
    def calculate_attenuation(self, sim_data, cell_specs):
        """
        Calculate constants for each attenuated gene.

        TODO:
                Calculate estimated charged tRNA concentration to use instead of all tRNA
        """

        def get_trna_conc(condition):
            spec = cell_specs[condition]
            unprocessed_trna_ids = self.rna_data["id"][self.rna_data["includes_tRNA"]]
            unprocessed_trna_idx = bulk_name_to_idx(
                unprocessed_trna_ids, spec["bulkAverageContainer"]["id"]
            )
            unprocessed_counts = counts(
                spec["bulkAverageContainer"], unprocessed_trna_idx
            )
            trna_counts = self.tRNA_cistron_tu_mapping_matrix.dot(unprocessed_counts)
            volume = (
                spec["avgCellDryMassInit"]
                / sim_data.constants.cell_density
                / sim_data.mass.cell_dry_mass_fraction
            )
            # Order of operations for conc (counts last) is to get units to work well
            conc = 1 / sim_data.constants.n_avogadro / volume * trna_counts
            return conc

        k_units = units.umol / units.L
        trna_conc = self.aa_from_trna @ get_trna_conc("with_aa").asNumber(k_units)

        # Calculate constant for stop probability
        self.attenuation_k = np.zeros_like(self._attenuation_rna_fold_changes)
        for i, j in zip(*np.where(self._attenuation_rna_fold_changes != 1)):
            k = trna_conc[i] / np.log(self._attenuation_rna_fold_changes[i, j])
            self.attenuation_k[i, j] = 1 / k
        self.attenuation_k = 1 / k_units * self.attenuation_k

        # Adjust basal synthesis probabilities to account for less synthesis
        # due to attenuation
        condition = "basal"
        basal_prob = sim_data.process.transcription_regulation.basal_prob
        delta_prob = sim_data.process.transcription_regulation.get_delta_prob_matrix()
        p_promoter_bound = np.array(
            [
                sim_data.pPromoterBound[condition][tf]
                for tf in sim_data.process.transcription_regulation.tf_ids
            ]
        )
        delta = delta_prob @ p_promoter_bound
        basal_stop_prob = self.get_attenuation_stop_probabilities(
            get_trna_conc(condition)
        )
        basal_synth_prob = (basal_prob + delta)[self.attenuated_rna_indices]
        self.attenuation_basal_prob_adjustments = basal_synth_prob * (
            1 / (1 - basal_stop_prob) - 1
        )

        # Store expected readthrough fraction for each condition to use in initial conditions
        self.attenuation_readthrough = {}
        for condition in sim_data.conditions:
            self.attenuation_readthrough[condition] = (
                1 - self.get_attenuation_stop_probabilities(get_trna_conc(condition))
            )



[docs]
    def get_attenuation_stop_probabilities(self, trna_conc):
        """
        Calculate the probability of a transcript stopping early due to attenuation.

        TODO:
                Consider a maximum stop probability factor (eg can only attenuate up to 90% of RNAs)
        """

        trna_by_aa = units.matmul(self.aa_from_trna, trna_conc)
        return 1 - np.exp(units.strip_empty_units(trna_by_aa @ self.attenuation_k))



[docs]
    def _build_elongation_rates(self, raw_data, sim_data):
        self.stable_RNA_elongation_rate = (
            sim_data.constants.RNAP_elongation_rate_for_stable_RNA.asNumber(
                units.nt / units.s
            )
        )

        # rRNAs are set to have higher elongation rates
        # TODO (ggsun): Consider adding tRNAs
        self.rRNA_indexes = np.where(self.rna_data["is_rRNA"])[0]



[docs]
    def make_elongation_rates(self, random, base, time_step, variable_elongation=False):
        return make_elongation_rates(
            random,
            self.transcription_sequences.shape[0],
            base,
            self.rRNA_indexes,
            self.stable_RNA_elongation_rate,
            time_step,
            variable_elongation,
        )



[docs]
    def _solve_ppgpp_km(self, raw_data, sim_data):
        """
        Solves for general expression rates for bound and free RNAP and
        a KM for ppGpp to RNAP based on global cellular measurements.
        Parameters are solved for at different doubling times using a
        gradient descent method to minimize the difference in expression
        of stable RNA compared to the measured RNA in a cell.
        Assumes a Hill coefficient of 2 for ppGpp binding to RNAP.

        Attributes set:
                _fit_ppgpp_fc (float): log2 fold change in stable RNA expression
                        from a fast doubling time to a slow doubling time based on
                        the rates of bound and free RNAP expression found
                _ppgpp_km_squared (float): squared and unitless KM value for
                        to limit computation needed for fraction bound
                ppgpp_km (float with mol / volume units): KM for ppGpp binding
                        to RNAP
        """

        # Data for different doubling times (100, 60, 40, 30, 24 min)
        per_dry_mass_to_per_volume = (
            sim_data.constants.cell_density * sim_data.mass.cell_dry_mass_fraction
        )
        ppgpp = (
            np.array(
                [
                    (d["ppGpp_conc"] * per_dry_mass_to_per_volume).asNumber(
                        PPGPP_CONC_UNITS
                    )
                    for d in raw_data.growth_rate_dependent_parameters
                ]
            )
            ** 2
        )
        rna = np.array([d["rnaMassFraction"] for d in raw_data.dry_mass_composition])
        mass_per_cell = np.array(
            [
                d["averageDryMass"].asNumber(units.fg)
                for d in raw_data.dry_mass_composition
            ]
        )
        rnap_per_cell = np.array(
            [d["RNAP_per_cell"] for d in raw_data.growth_rate_dependent_parameters]
        )

        # Variables for the objective
        ## a1: rate of RNA production from free RNAP
        ## a2: rate of RNA production from RNAP bound to ppGpp
        ## km: KM of ppGpp binding to RNAP
        a1s, a2s, kms = sp.symbols("a1 a2 km")

        # Create objective to minimize
        ## Objective is squared difference between RNA created with different rates for RNAP bound
        ## to ppGpp and free RNAP compared to measured RNA in the cell for each measured doubling time.
        ## Use sp.exp to prevent negative parameter values, also improves stability for larger step size.
        difference = (
            rnap_per_cell
            / mass_per_cell
            * (
                sp.exp(a1s) * (1 - ppgpp / (sp.exp(kms) + ppgpp))
                + sp.exp(a2s) * ppgpp / (sp.exp(kms) + ppgpp)
            )
            - rna
        )
        J = difference.dot(difference)

        # Convert to functions for faster performance
        dJda1 = sp.lambdify((a1s, a2s, kms), J.diff(a1s))
        dJda2 = sp.lambdify((a1s, a2s, kms), J.diff(a2s))
        dJdkm = sp.lambdify((a1s, a2s, kms), J.diff(kms))
        J = sp.lambdify((a1s, a2s, kms), J)

        # Initial parameters
        a1 = np.log(0.02)
        a2 = np.log(0.01)
        km = np.log(575)
        step_size = 1.0

        # Use gradient descent to find
        obj = J(a1, a2, km)
        old_obj = 100
        step = 0
        max_step = 1e5
        tol = 1e-6
        rel_tol = 1e-9
        while obj > tol and 1 - obj / old_obj > rel_tol:
            a1 -= dJda1(a1, a2, km) * step_size
            a2 -= dJda2(a1, a2, km) * step_size
            km -= dJdkm(a1, a2, km) * step_size

            old_obj = obj
            obj = J(a1, a2, km)

            step += 1
            if step > max_step:
                raise RuntimeError(
                    "Fitting ppGpp binding KM failed to converge."
                    " Check tolerances or maximum number of steps."
                )

        a1 = np.exp(a1)
        a2 = np.exp(a2)
        km = np.exp(km)
        f_low = ppgpp[-1] / (km + ppgpp[-1])
        fc = np.log2(a2 / (a1 * (1 - f_low) + a2 * f_low))

        self._fit_ppgpp_fc = fc
        self._ppgpp_km_squared = km
        self.ppgpp_km = np.sqrt(km) * PPGPP_CONC_UNITS



[docs]
    def get_rna_fractions(self, ppgpp):
        """
        Calculates expected RNA subgroup mass fractions based on ppGpp
        concentration. If ppGpp expression has not been set yet, uses default
        measured fractions.

        Args:
                ppgpp (float with or without mol / volume units): concentration of
                        ppGpp, if unitless, should represent the concentration of
                        PPGPP_CONC_UNITS

        Returns:
                dict[str, float]: mass fraction for each subgroup mass, values sum
                to 1
        """

        if self._ppgpp_expression_set:
            rna_exp = self.expression_from_ppgpp(ppgpp)
            cistron_exp = self.cistron_tu_mapping_matrix.dot(rna_exp)
            mass = self.cistron_data["mw"] * cistron_exp
            mass = (mass / units.sum(mass)).asNumber()

            fractions = {
                "rRNA": mass[self.cistron_data["is_rRNA"]].sum(),
                "tRNA": mass[self.cistron_data["is_tRNA"]].sum(),
                "mRNA": mass[self.cistron_data["is_mRNA"]].sum(),
            }
        else:
            fractions = self._basal_rna_fractions

        return fractions



[docs]
    def set_ppgpp_expression(self, sim_data):
        """
        Called during the parca to determine expression of each transcription
        unit for ppGpp bound and free RNAP.

        Attributes set:
                exp_ppgpp (ndarray[float]): expression for each TU when RNAP is
                        bound to ppGpp
                exp_free (ndarray[float]): expression for each TU when RNAP is not
                        bound to ppGpp
        """
        ppgpp_aa = sim_data.growth_rate_parameters.get_ppGpp_conc(
            sim_data.condition_to_doubling_time["with_aa"]
        )
        ppgpp_basal = sim_data.growth_rate_parameters.get_ppGpp_conc(
            sim_data.condition_to_doubling_time["basal"]
        )
        f_ppgpp_aa = self.fraction_rnap_bound_ppgpp(ppgpp_aa)
        f_ppgpp_basal = self.fraction_rnap_bound_ppgpp(ppgpp_basal)
        cistron_id_to_idx = {
            cistron: i for i, cistron in enumerate(self.cistron_data["id"])
        }

        # Since fold changes are reported for each cistron (gene), the FCs are
        # applied first to the expression levels of individual cistrons which
        # are converted back to TU expression levels through NNLS
        cistron_exp = self.fit_cistron_expression["basal"]

        fcs = np.zeros(len(self.cistron_data))
        for cistron_id, fc in zip(self.ppgpp_regulated_genes, self.ppgpp_fold_changes):
            fcs[cistron_id_to_idx[cistron_id]] = fc

        # Apply fold changes to expression levels of cistrons
        cistron_exp_ppgpp = (
            2**fcs * cistron_exp * (1 - f_ppgpp_aa) / (1 - f_ppgpp_basal)
        ) / (
            1
            - 2**fcs
            * (f_ppgpp_aa - f_ppgpp_basal * (1 - f_ppgpp_aa) / (1 - f_ppgpp_basal))
        )
        cistron_exp_free = (cistron_exp - cistron_exp_ppgpp * f_ppgpp_basal) / (
            1 - f_ppgpp_basal
        )
        cistron_exp_free[cistron_exp_free < 0] = (
            0  # fold change is limited by KM, can't have very high positive fold changes
        )

        # Map expression levels of cistrons to those of TUs through NNLS
        self.exp_ppgpp, _ = self.fit_rna_expression(cistron_exp_ppgpp)
        self.exp_free, _ = self.fit_rna_expression(cistron_exp_free)
        self._normalize_ppgpp_expression()

        self._ppgpp_expression_set = True



[docs]
    def adjust_polymerizing_ppgpp_expression(self, sim_data):
        """
        Adjust ppGpp expression based on fit for ribosome and RNAP physiological
        constraints using least squares fit for 3 conditions with different
        growth rates/ppGpp.

        Modifies attributes:
                exp_ppgpp (ndarray[float]): expression for each gene when RNAP
                        is bound to ppGpp, adjusted for necessary RNAP and ribosome
                        expression, normalized to 1
                exp_free (ndarray[float]): expression for each gene when RNAP
                        is not bound to ppGpp, adjusted for necessary RNAP and ribosome
                        expression, normalized to 1

        Note:
                See docs/processes/transcription_regulation.pdf for a description
                of the math used in this section.

        TODO:
                fit for all conditions and not just those specified below?
        """

        # Fraction RNAP bound to ppGpp in different conditions
        ppgpp_aa = sim_data.growth_rate_parameters.get_ppGpp_conc(
            sim_data.condition_to_doubling_time["with_aa"]
        )
        ppgpp_basal = sim_data.growth_rate_parameters.get_ppGpp_conc(
            sim_data.condition_to_doubling_time["basal"]
        )
        ppgpp_anaerobic = sim_data.growth_rate_parameters.get_ppGpp_conc(
            sim_data.condition_to_doubling_time["no_oxygen"]
        )
        f_ppgpp_aa = self.fraction_rnap_bound_ppgpp(ppgpp_aa)
        f_ppgpp_basal = self.fraction_rnap_bound_ppgpp(ppgpp_basal)
        f_ppgpp_anaerobic = self.fraction_rnap_bound_ppgpp(ppgpp_anaerobic)

        # Adjustments for TFs
        ## Probabilities need to be unnormalized to match the scale of delta prob
        ## This includes not having get_delta_prob_matrix normalized for ppGpp
        tf_adjustments = {}
        delta_prob = sim_data.process.transcription_regulation.get_delta_prob_matrix(
            ppgpp=False
        )
        adjusted_mask = (
            self.rna_data["includes_RNAP"]
            | self.rna_data["includes_ribosomal_protein"]
            | self.rna_data["is_rRNA"]
        )
        for condition in ["with_aa", "basal", "no_oxygen"]:
            p_promoter_bound = np.array(
                [
                    sim_data.pPromoterBound[condition][tf]
                    for tf in sim_data.process.transcription_regulation.tf_ids
                ]
            )
            delta = delta_prob @ p_promoter_bound
            condition_prob = (
                sim_data.process.transcription_regulation.basal_prob + delta
            )
            tf_adjustments[condition] = (
                delta[adjusted_mask] / condition_prob[adjusted_mask]
            )

        # Solve least squares fit for expression of each component of RNAP and ribosomes
        self._normalize_ppgpp_expression()  # Need to normalize first to get correct scale
        F = np.array(
            [
                [1 - f_ppgpp_aa, f_ppgpp_aa],
                [1 - f_ppgpp_basal, f_ppgpp_basal],
                [1 - f_ppgpp_anaerobic, f_ppgpp_anaerobic],
            ]
        )
        Flst = np.linalg.inv(F.T.dot(F)).dot(F.T)
        expression = np.array(
            [
                self.rna_expression["with_aa"][adjusted_mask]
                * np.fmax(0, 1 - tf_adjustments["with_aa"]),
                self.rna_expression["basal"][adjusted_mask]
                * np.fmax(0, 1 - tf_adjustments["basal"]),
                self.rna_expression["no_oxygen"][adjusted_mask]
                * np.fmax(0, 1 - tf_adjustments["no_oxygen"]),
            ]
        )
        adjusted_free, adjusted_ppgpp = Flst.dot(expression)
        self.exp_free[adjusted_mask] = adjusted_free
        self.exp_ppgpp[adjusted_mask] = adjusted_ppgpp
        self._normalize_ppgpp_expression()



[docs]
    def adjust_ppgpp_expression_for_tfs(self, sim_data):
        """
        Adjusts ppGpp regulated expression to get expression with and without
        ppGpp regulation to match in basal condition and taking into account
        the effect transcription factors will have.

        TODO:
                Should this not adjust polymerizing genes (adjusted_mask in
                        adjust_polymerizing_ppgpp_expression) since they have already
                        been adjusted for transcription factor effects?
        """

        condition = "basal"

        # Current (unnormalized) probabilities from ppGpp regulation
        ppgpp_conc = sim_data.growth_rate_parameters.get_ppGpp_conc(
            sim_data.condition_to_doubling_time[condition]
        )
        old_prob, factor = self.synth_prob_from_ppgpp(
            ppgpp_conc, sim_data.process.replication.get_average_copy_number
        )

        # Calculate the average expected effect of TFs in basal condition
        p_promoter_bound = np.array(
            [
                sim_data.pPromoterBound[condition][tf]
                for tf in sim_data.process.transcription_regulation.tf_ids
            ]
        )
        delta_prob_no_ppgpp = (
            sim_data.process.transcription_regulation.get_delta_prob_matrix(ppgpp=False)
        )
        delta_prob_with_ppgpp = (
            sim_data.process.transcription_regulation.get_delta_prob_matrix(ppgpp=True)
        )
        delta_no_ppgpp = delta_prob_no_ppgpp @ p_promoter_bound
        delta_with_ppgpp = delta_prob_with_ppgpp @ p_promoter_bound

        # Calculate the required probability to match expression without ppGpp
        new_prob = (
            normalize(self.rna_expression[condition] * factor) + delta_no_ppgpp
        ) / (1 + delta_with_ppgpp)
        new_prob[new_prob < 0] = old_prob[new_prob < 0]
        new_prob = normalize(new_prob)

        # Determine adjustments to the current ppGpp expression to scale
        # to the expected expression
        with np.errstate(invalid="ignore", divide="ignore"):
            adjustment = new_prob / old_prob
        adjustment[~np.isfinite(adjustment)] = 1

        # Scale free and bound expression and renormalize ppGpp regulated expression
        self.exp_free *= adjustment
        self.exp_ppgpp *= adjustment
        self._normalize_ppgpp_expression()



[docs]
    def _normalize_ppgpp_expression(self):
        """
        Normalize both free and ppGpp bound expression values to 1.
        """

        self.exp_free[self.exp_free < 0] = 0
        self.exp_ppgpp[self.exp_ppgpp < 0] = 0
        self.exp_free /= self.exp_free.sum()
        self.exp_ppgpp /= self.exp_ppgpp.sum()



[docs]
    def set_ppgpp_kinetics_parameters(self, init_container, constants):
        uncharged_trna_idx = bulk_name_to_idx(
            self.uncharged_trna_names, init_container["id"]
        )
        trna_counts = self.aa_from_trna @ counts(init_container, uncharged_trna_idx)
        trna_ratio = trna_counts / trna_counts.sum()
        adjustment_fraction = trna_ratio / trna_ratio.mean()

        self.KD_RelA = constants.KD_RelA_ribosome * adjustment_fraction
        self.KI_SpoT = constants.KI_SpoT_ppGpp_degradation * adjustment_fraction



[docs]
    def fraction_rnap_bound_ppgpp(self, ppgpp):
        """
        Calculates the fraction of RNAP expected to be bound to ppGpp
        at a given concentration of ppGpp.

        Args:
                ppgpp (float with or without mol / volume units): concentration of ppGpp,
                        if unitless, should represent the concentration of PPGPP_CONC_UNITS

        Returns:
                float: fraction of RNAP that will be bound to ppGpp
        """

        if units.hasUnit(ppgpp):
            ppgpp = ppgpp.asNumber(PPGPP_CONC_UNITS)

        return ppgpp**2 / (self._ppgpp_km_squared + ppgpp**2)



[docs]
    def expression_from_ppgpp(self, ppgpp):
        """
        Calculates the expression of each gene at a given concentration of ppGpp.

        Args:
                ppgpp (float with or without mol / volume units): concentration of ppGpp,
                        if unitless, should represent the concentration of PPGPP_CONC_UNITS

        Returns:
                ndarray[float]: normalized expression for each gene
        """

        f_ppgpp = self.fraction_rnap_bound_ppgpp(ppgpp)
        return normalize(self.exp_free * (1 - f_ppgpp) + self.exp_ppgpp * f_ppgpp)



[docs]
    def synth_prob_from_ppgpp(self, ppgpp, copy_number, balanced_rRNA_prob=True):
        """
        Calculates the synthesis probability of each gene at a given concentration
        of ppGpp.

        Args:
                ppgpp (float with mol / volume units): concentration of ppGpp
                copy_number (Callable[float, int]): function that gives the expected copy
                        number given a doubling time and gene replication coordinate
                balanced_rRNA_prob (bool): if True, set synthesis probabilities
                    of rRNA promoters equal to one another

        Returns
                prob (ndarray[float]): normalized synthesis probability for each gene
                factor (ndarray[float]): factor to adjust expression to probability for each gene

        Note:
                copy_number should be sim_data.process.replication.get_average_copy_number
                but saving the function handle as a class attribute prevents pickling of sim_data
                without additional handling
        """

        ppgpp = ppgpp.asNumber(PPGPP_CONC_UNITS)
        f_ppgpp = self.fraction_rnap_bound_ppgpp(ppgpp)

        y = fitting.interpolate_linearized_fit(ppgpp, *self._ppgpp_growth_parameters)
        growth = max(cast(float, y), 0.0)
        tau = np.log(2) / growth / 60
        loss = growth + self.rna_data["deg_rate"].asNumber(1 / units.s)
        # Use the wildtype replication coordinates that were used to calculate
        # exp_free and exp_ppgpp, instead of coordinates that can be adjusted
        # via variants
        n_avg_copy = copy_number(tau, self.rna_data["wt_replication_coordinate"])

        # Return values
        factor = loss / n_avg_copy
        prob = normalize(
            (self.exp_free * (1 - f_ppgpp) + self.exp_ppgpp * f_ppgpp) * factor
        )

        if balanced_rRNA_prob:
            prob[self.rna_data["is_rRNA"]] = prob[self.rna_data["is_rRNA"]].mean()

        return prob, factor



[docs]
    def get_rnap_active_fraction_from_ppGpp(self, ppgpp):
        f_ppgpp = self.fraction_rnap_bound_ppgpp(ppgpp)
        return (
            self.fraction_active_rnap_bound * f_ppgpp
            + self.fraction_active_rnap_free * (1 - f_ppgpp)
        )



[docs]
    def _build_new_gene_data(self, raw_data, sim_data):
        """
        Load baseline values for new gene expression in all simulations.
        """

        self.new_gene_expression_baselines = (
            raw_data.new_gene_data.new_gene_baseline_expression_parameters
        )