Source code for ecoli.library.initial_conditions

"""
Functions to initialize molecule states from sim_data.
"""

import numpy as np
import numpy.typing as npt
from numpy.lib import recfunctions as rfn
from typing import Any
from unum import Unum

from ecoli.library.schema import attrs, bulk_name_to_idx, create_unique_indexes, counts
from ecoli.processes.polypeptide_elongation import (
    calculate_trna_charging,
    REMOVED_FROM_CHARGING,
    MICROMOLAR_UNITS,
)
from wholecell.utils import units
from wholecell.utils.fitting import (
    countsFromMassAndExpression,
    masses_and_counts_for_homeostatic_target,
    normalize,
)
from wholecell.utils.mc_complexation import mccFormComplexesWithPrebuiltMatrices
from wholecell.utils.polymerize import computeMassIncrease
from wholecell.utils.random import stochasticRound

RAND_MAX = 2**31


[docs] def create_bulk_container( sim_data, n_seeds=1, condition=None, seed=0, ppgpp_regulation=True, trna_attenuation=True, mass_coeff=1, form_complexes=True, ): try: old_condition = sim_data.condition if condition is not None: sim_data.condition = condition media_id = sim_data.conditions[sim_data.condition]["nutrients"] exchange_data = sim_data.external_state.exchange_data_from_media(media_id) import_molecules = set( exchange_data["importUnconstrainedExchangeMolecules"] ) | set(exchange_data["importConstrainedExchangeMolecules"]) random_state = np.random.RandomState(seed=seed) # Construct bulk container ids_molecules = sim_data.internal_state.bulk_molecules.bulk_data["id"] average_container = np.array( [mol_data for mol_data in zip(ids_molecules, np.zeros(len(ids_molecules)))], dtype=[("id", ids_molecules.dtype), ("count", np.float64)], ) for n in range(n_seeds): random_state = np.random.RandomState(seed=seed + n) average_container["count"] += initialize_bulk_counts( sim_data, media_id, import_molecules, random_state, mass_coeff, ppgpp_regulation, trna_attenuation, form_complexes=form_complexes, )["count"] except Exception: raise RuntimeError( "sim_data might not be fully initialized. " "Make sure all attributes have been set before " "using this function." ) sim_data.condition = old_condition average_container["count"] = average_container["count"] / n_seeds return average_container
[docs] def initialize_bulk_counts( sim_data, media_id, import_molecules, random_state, mass_coeff, ppgpp_regulation, trna_attenuation, form_complexes=True, ): # Allocate count array to populate bulk_counts = np.zeros( len(sim_data.internal_state.bulk_molecules.bulk_data["id"]), dtype=int ) # Set protein counts from expression initialize_protein_monomers( bulk_counts, sim_data, random_state, mass_coeff, ppgpp_regulation, trna_attenuation, ) # Set RNA counts from expression initialize_rna( bulk_counts, sim_data, random_state, mass_coeff, ppgpp_regulation, trna_attenuation, ) # Set mature RNA counts initialize_mature_RNA(bulk_counts, sim_data) # Set other biomass components set_small_molecule_counts( bulk_counts, sim_data, media_id, import_molecules, mass_coeff ) # Form complexes if form_complexes: initialize_complexation(bulk_counts, sim_data, random_state) bulk_masses = sim_data.internal_state.bulk_molecules.bulk_data["mass"].asNumber( units.fg / units.mol ) / sim_data.constants.n_avogadro.asNumber(1 / units.mol) bulk_submasses = [] bulk_submass_dtypes = [] for submass, idx in sim_data.submass_name_to_index.items(): bulk_submasses.append(bulk_masses[:, idx]) bulk_submass_dtypes.append((f"{submass}_submass", np.float64)) bulk_ids = sim_data.internal_state.bulk_molecules.bulk_data.struct_array["id"] bulk_array = np.array( [mol_data for mol_data in zip(bulk_ids, bulk_counts, *bulk_submasses)], dtype=[("id", bulk_ids.dtype), ("count", int)] + bulk_submass_dtypes, ) return bulk_array
[docs] def initialize_unique_molecules( bulk_state, sim_data, cell_mass, random_state, unique_id_rng, superhelical_density, ppgpp_regulation, trna_attenuation, mechanistic_replisome, ): unique_molecules = {} # Initialize counts of full chromosomes initialize_full_chromosome(unique_molecules, sim_data, unique_id_rng) # Initialize unique molecules relevant to replication initialize_replication( bulk_state, unique_molecules, sim_data, cell_mass, mechanistic_replisome, unique_id_rng, ) # Initialize bound transcription factors initialize_transcription_factors( bulk_state, unique_molecules, sim_data, random_state ) # Initialize active RNAPs and unique molecule representations of RNAs initialize_transcription( bulk_state, unique_molecules, sim_data, random_state, unique_id_rng, ppgpp_regulation, trna_attenuation, ) # Initialize linking numbers of chromosomal segments if superhelical_density: initialize_chromosomal_segments(unique_molecules, sim_data, unique_id_rng) else: unique_molecules["chromosomal_segment"] = create_new_unique_molecules( "chromosomal_segment", 0, sim_data, unique_id_rng ) # Initialize active ribosomes initialize_translation( bulk_state, unique_molecules, sim_data, random_state, unique_id_rng ) return unique_molecules
[docs] def create_new_unique_molecules(name, n_mols, sim_data, random_state, **attrs): """ Helper function to create a new Numpy structured array with n_mols instances of the unique molecule called name. Accepts keyword arguments that become initial values for specified attributes of the new molecules. """ dtypes = list( sim_data.internal_state.unique_molecule.unique_molecule_definitions[ name ].items() ) submasses = list(sim_data.submass_name_to_index) dtypes += [(f"massDiff_{submass}", "<f8") for submass in submasses] dtypes += [("_entryState", "i1"), ("unique_index", "<i8")] unique_mols = np.zeros(n_mols, dtype=dtypes) for attr_name, attr_value in attrs.items(): unique_mols[attr_name] = attr_value unique_mols["unique_index"] = create_unique_indexes(n_mols, random_state) unique_mols["_entryState"] = 1 return unique_mols
[docs] def initialize_protein_monomers( bulk_counts, sim_data, random_state, mass_coeff, ppgpp_regulation, trna_attenuation ): monomer_mass = ( mass_coeff * sim_data.mass.get_component_masses( sim_data.condition_to_doubling_time[sim_data.condition] )["proteinMass"] / sim_data.mass.avg_cell_to_initial_cell_conversion_factor ) # TODO: unify this logic with the parca so it doesn]t fall out of step # again (look at teh calProteinCounts function) transcription = sim_data.process.transcription if ppgpp_regulation: rna_expression = sim_data.calculate_ppgpp_expression(sim_data.condition) else: rna_expression = transcription.rna_expression[sim_data.condition] if trna_attenuation: # Need to adjust expression (calculated without attenuation) by basal_adjustment # to get the expected expression without any attenuation and then multiply # by the condition readthrough probability to get the condition specific expression readthrough = transcription.attenuation_readthrough[sim_data.condition] basal_adjustment = transcription.attenuation_readthrough["basal"] rna_expression[transcription.attenuated_rna_indices] *= ( readthrough / basal_adjustment ) monomer_expression = normalize( sim_data.process.transcription.cistron_tu_mapping_matrix.dot(rna_expression)[ sim_data.relation.cistron_to_monomer_mapping ] * sim_data.process.translation.translation_efficiencies_by_monomer / ( np.log(2) / sim_data.condition_to_doubling_time[sim_data.condition].asNumber(units.s) + sim_data.process.translation.monomer_data["deg_rate"].asNumber( 1 / units.s ) ) ) n_monomers = countsFromMassAndExpression( monomer_mass.asNumber(units.g), sim_data.process.translation.monomer_data["mw"].asNumber(units.g / units.mol), monomer_expression, sim_data.constants.n_avogadro.asNumber(1 / units.mol), ) # Get indices for monomers in bulk counts array monomer_ids = sim_data.process.translation.monomer_data["id"] bulk_ids = sim_data.internal_state.bulk_molecules.bulk_data["id"] monomer_idx = bulk_name_to_idx(monomer_ids, bulk_ids) # Calculate initial counts of each monomer from mutinomial distribution bulk_counts[monomer_idx] = random_state.multinomial(n_monomers, monomer_expression)
[docs] def initialize_rna( bulk_counts, sim_data, random_state, mass_coeff, ppgpp_regulation, trna_attenuation ): """ Initializes counts of RNAs in the bulk molecule container using RNA expression data. mRNA counts are also initialized here, but is later reset to zero when the representations for mRNAs are moved to the unique molecule container. """ transcription = sim_data.process.transcription rna_mass = ( mass_coeff * sim_data.mass.get_component_masses( sim_data.condition_to_doubling_time[sim_data.condition] )["rnaMass"] / sim_data.mass.avg_cell_to_initial_cell_conversion_factor ) if ppgpp_regulation: rna_expression = sim_data.calculate_ppgpp_expression(sim_data.condition) else: rna_expression = normalize(transcription.rna_expression[sim_data.condition]) if trna_attenuation: # Need to adjust expression (calculated without attenuation) by basal_adjustment # to get the expected expression without any attenuation and then multiply # by the condition readthrough probability to get the condition specific expression readthrough = transcription.attenuation_readthrough[sim_data.condition] basal_adjustment = transcription.attenuation_readthrough["basal"] rna_expression[transcription.attenuated_rna_indices] *= ( readthrough / basal_adjustment ) rna_expression /= rna_expression.sum() n_rnas = countsFromMassAndExpression( rna_mass.asNumber(units.g), transcription.rna_data["mw"].asNumber(units.g / units.mol), rna_expression, sim_data.constants.n_avogadro.asNumber(1 / units.mol), ) # Get indices for monomers in bulk counts array rna_ids = transcription.rna_data["id"] bulk_ids = sim_data.internal_state.bulk_molecules.bulk_data["id"] rna_idx = bulk_name_to_idx(rna_ids, bulk_ids) # Calculate initial counts of each RNA from mutinomial distribution bulk_counts[rna_idx] = random_state.multinomial(n_rnas, rna_expression)
[docs] def initialize_mature_RNA(bulk_counts, sim_data): """ Initializes counts of mature RNAs in the bulk molecule container using the counts of unprocessed RNAs. Also consolidates the different variants of each rRNA molecule into the main type. """ transcription = sim_data.process.transcription rna_data = transcription.rna_data unprocessed_rna_ids = rna_data["id"][rna_data["is_unprocessed"]] bulk_ids = sim_data.internal_state.bulk_molecules.bulk_data["id"] unprocessed_rna_idx = bulk_name_to_idx(unprocessed_rna_ids, bulk_ids) # Skip if there are no unprocessed RNAs represented if len(unprocessed_rna_ids) > 0: mature_rna_ids = transcription.mature_rna_data["id"] maturation_stoich_matrix = transcription.rna_maturation_stoich_matrix mature_rna_idx = bulk_name_to_idx(mature_rna_ids, bulk_ids) # Get counts of unprocessed RNAs unprocessed_rna_counts = bulk_counts[unprocessed_rna_idx] # Assume all unprocessed RNAs are converted to mature RNAs bulk_counts[unprocessed_rna_idx] = 0 bulk_counts[mature_rna_idx] += maturation_stoich_matrix.dot( unprocessed_rna_counts ) # Get IDs of rRNAs main_23s_rRNA_id = sim_data.molecule_groups.s50_23s_rRNA[0] main_16s_rRNA_id = sim_data.molecule_groups.s30_16s_rRNA[0] main_5s_rRNA_id = sim_data.molecule_groups.s50_5s_rRNA[0] variant_23s_rRNA_ids = sim_data.molecule_groups.s50_23s_rRNA[1:] variant_16s_rRNA_ids = sim_data.molecule_groups.s30_16s_rRNA[1:] variant_5s_rRNA_ids = sim_data.molecule_groups.s50_5s_rRNA[1:] # Get indices of main and variant rRNAs main_23s_rRNA_idx = bulk_name_to_idx(main_23s_rRNA_id, bulk_ids) main_16s_rRNA_idx = bulk_name_to_idx(main_16s_rRNA_id, bulk_ids) main_5s_rRNA_idx = bulk_name_to_idx(main_5s_rRNA_id, bulk_ids) variant_23s_rRNA_idx = bulk_name_to_idx(variant_23s_rRNA_ids, bulk_ids) variant_16s_rRNA_idx = bulk_name_to_idx(variant_16s_rRNA_ids, bulk_ids) variant_5s_rRNA_idx = bulk_name_to_idx(variant_5s_rRNA_ids, bulk_ids) # Evolve states bulk_counts[main_23s_rRNA_idx] += bulk_counts[variant_23s_rRNA_idx].sum() bulk_counts[main_16s_rRNA_idx] += bulk_counts[variant_16s_rRNA_idx].sum() bulk_counts[main_5s_rRNA_idx] += bulk_counts[variant_5s_rRNA_idx].sum() bulk_counts[variant_23s_rRNA_idx] -= bulk_counts[variant_23s_rRNA_idx] bulk_counts[variant_16s_rRNA_idx] -= bulk_counts[variant_16s_rRNA_idx] bulk_counts[variant_5s_rRNA_idx] -= bulk_counts[variant_5s_rRNA_idx]
# TODO: remove checks for zero concentrations (change to assertion) # TODO: move any rescaling logic to KB/fitting
[docs] def set_small_molecule_counts( bulk_counts, sim_data, media_id, import_molecules, mass_coeff, cell_mass=None ): doubling_time = sim_data.condition_to_doubling_time[sim_data.condition] conc_dict = sim_data.process.metabolism.concentration_updates.concentrations_based_on_nutrients( media_id=media_id, imports=import_molecules ) conc_dict.update(sim_data.mass.getBiomassAsConcentrations(doubling_time)) conc_dict[sim_data.molecule_ids.ppGpp] = ( sim_data.growth_rate_parameters.get_ppGpp_conc(doubling_time) ) molecule_ids = sorted(conc_dict) bulk_ids = sim_data.internal_state.bulk_molecules.bulk_data["id"] molecule_concentrations = (units.mol / units.L) * np.array( [conc_dict[key].asNumber(units.mol / units.L) for key in molecule_ids] ) if cell_mass is None: avg_cell_fraction_mass = sim_data.mass.get_component_masses(doubling_time) other_dry_mass = ( mass_coeff * ( avg_cell_fraction_mass["proteinMass"] + avg_cell_fraction_mass["rnaMass"] + avg_cell_fraction_mass["dnaMass"] ) / sim_data.mass.avg_cell_to_initial_cell_conversion_factor ) else: small_molecule_mass = 0 * units.fg for mol in conc_dict: mol_idx = bulk_name_to_idx(mol, bulk_ids) small_molecule_mass += ( bulk_counts[mol_idx] * sim_data.getter.get_mass(mol) / sim_data.constants.n_avogadro ) other_dry_mass = cell_mass - small_molecule_mass masses_to_add, counts_to_add = masses_and_counts_for_homeostatic_target( other_dry_mass, molecule_concentrations, sim_data.getter.get_masses(molecule_ids), sim_data.constants.cell_density, sim_data.constants.n_avogadro, ) molecule_idx = bulk_name_to_idx(molecule_ids, bulk_ids) bulk_counts[molecule_idx] = counts_to_add
[docs] def initialize_complexation(bulk_counts, sim_data, random_state): molecule_names = sim_data.process.complexation.molecule_names bulk_ids = sim_data.internal_state.bulk_molecules.bulk_data["id"] molecule_idx = bulk_name_to_idx(molecule_names, bulk_ids) stoich_matrix = sim_data.process.complexation.stoich_matrix().astype( np.int64, order="F" ) molecule_counts = bulk_counts[molecule_idx] updated_molecule_counts, complexation_events = mccFormComplexesWithPrebuiltMatrices( molecule_counts, random_state.randint(1000), stoich_matrix, *sim_data.process.complexation.prebuilt_matrices, ) bulk_counts[molecule_idx] = updated_molecule_counts if np.any(updated_molecule_counts < 0): raise ValueError("Negative counts after complexation")
[docs] def initialize_full_chromosome(unique_molecules, sim_data, unique_id_rng): """ Initializes the counts of full chromosomes to one. The division_time of this initial chromosome is set to be zero for consistency. """ unique_molecules["full_chromosome"] = create_new_unique_molecules( "full_chromosome", 1, sim_data, unique_id_rng, division_time=0.0, has_triggered_division=True, domain_index=0, )
[docs] def initialize_replication( bulk_state, unique_molecules, sim_data, cell_mass, mechanistic_replisome, unique_id_rng, ): """ Initializes replication by creating an appropriate number of replication forks given the cell growth rate. This also initializes the gene dosage bulk counts using the initial locations of the forks. """ # Determine the number and location of replication forks at the start of # the cell cycle # Get growth rate constants tau = sim_data.condition_to_doubling_time[sim_data.condition].asUnit(units.min) critical_mass = sim_data.mass.get_dna_critical_mass(tau) replication_rate = sim_data.process.replication.basal_elongation_rate # Calculate length of replichore genome_length = sim_data.process.replication.genome_length replichore_length = np.ceil(0.5 * genome_length) * units.nt # Calculate the maximum number of replisomes that could be formed with # the existing counts of replisome subunits. If mechanistic_replisome option # is off, set to an arbitrary high number. replisome_trimer_idx = bulk_name_to_idx( sim_data.molecule_groups.replisome_trimer_subunits, bulk_state["id"] ) replisome_monomer_idx = bulk_name_to_idx( sim_data.molecule_groups.replisome_monomer_subunits, bulk_state["id"] ) if mechanistic_replisome: n_max_replisomes = np.min( np.concatenate( ( bulk_state["count"][replisome_trimer_idx] // 3, bulk_state["count"][replisome_monomer_idx], ) ) ) else: n_max_replisomes = 1000 # Generate arrays specifying appropriate initial replication conditions oric_state, replisome_state, domain_state = determine_chromosome_state( tau, replichore_length, n_max_replisomes, sim_data.process.replication.no_child_place_holder, cell_mass, critical_mass, replication_rate, ) n_oric = oric_state["domain_index"].size n_replisome = replisome_state["domain_index"].size n_domain = domain_state["domain_index"].size # Add OriC molecules with the proposed attributes unique_molecules["oriC"] = create_new_unique_molecules( "oriC", n_oric, sim_data, unique_id_rng, domain_index=oric_state["domain_index"] ) # Add chromosome domain molecules with the proposed attributes unique_molecules["chromosome_domain"] = create_new_unique_molecules( "chromosome_domain", n_domain, sim_data, unique_id_rng, domain_index=domain_state["domain_index"], child_domains=domain_state["child_domains"], ) if n_replisome != 0: # Update mass of replisomes if the mechanistic replisome option is set if mechanistic_replisome: replisome_trimer_subunit_masses = np.vstack( [ sim_data.getter.get_submass_array(x).asNumber( units.fg / units.count ) for x in sim_data.molecule_groups.replisome_trimer_subunits ] ) replisome_monomer_subunit_masses = np.vstack( [ sim_data.getter.get_submass_array(x).asNumber( units.fg / units.count ) for x in sim_data.molecule_groups.replisome_monomer_subunits ] ) replisome_mass_array = 3 * replisome_trimer_subunit_masses.sum( axis=0 ) + replisome_monomer_subunit_masses.sum(axis=0) replisome_protein_mass = replisome_mass_array.sum() else: replisome_protein_mass = 0.0 # Update mass to account for DNA strands that have already been # elongated. sequences = sim_data.process.replication.replication_sequences fork_coordinates = replisome_state["coordinates"] sequence_elongations = np.abs(np.repeat(fork_coordinates, 2)) mass_increase_dna = computeMassIncrease( np.tile(sequences, (n_replisome // 2, 1)), sequence_elongations, sim_data.process.replication.replication_monomer_weights.asNumber(units.fg), ) # Add active replisomes as unique molecules and set attributes unique_molecules["active_replisome"] = create_new_unique_molecules( "active_replisome", n_replisome, sim_data, unique_id_rng, domain_index=replisome_state["domain_index"], coordinates=replisome_state["coordinates"], right_replichore=replisome_state["right_replichore"], massDiff_DNA=mass_increase_dna[0::2] + mass_increase_dna[1::2], massDiff_protein=replisome_protein_mass, ) if mechanistic_replisome: # Remove replisome subunits from bulk molecules bulk_state["count"][replisome_trimer_idx] -= 3 * n_replisome bulk_state["count"][replisome_monomer_idx] -= n_replisome else: # For n_replisome = 0, still create an empty structured array with # the expected fields unique_molecules["active_replisome"] = create_new_unique_molecules( "active_replisome", n_replisome, sim_data, unique_id_rng ) # Get coordinates of all genes, promoters and DnaA boxes all_gene_coordinates = sim_data.process.transcription.cistron_data[ "replication_coordinate" ] all_promoter_coordinates = sim_data.process.transcription.rna_data[ "replication_coordinate" ] all_DnaA_box_coordinates = sim_data.process.replication.motif_coordinates[ "DnaA_box" ] # Define function that initializes attributes of sequence motifs given the # initial state of the chromosome def get_motif_attributes(all_motif_coordinates): """ Using the initial positions of replication forks, calculate attributes of unique molecules representing DNA motifs, given their positions on the genome. Args: all_motif_coordinates (ndarray): Genomic coordinates of DNA motifs, represented in a specific order Returns: motif_index: Indices of all motif copies, in the case where different indexes imply a different functional role motif_coordinates: Genomic coordinates of all motif copies motif_domain_index: Domain indexes of the chromosome domain that each motif copy belongs to """ motif_index, motif_coordinates, motif_domain_index = [], [], [] def in_bounds(coordinates, lb, ub): return np.logical_and(coordinates < ub, coordinates > lb) # Loop through all chromosome domains for domain_idx in domain_state["domain_index"]: # If the domain is the mother domain of the initial chromosome, if domain_idx == 0: if n_replisome == 0: # No replisomes - all motifs should fall in this domain motif_mask = np.ones_like(all_motif_coordinates, dtype=bool) else: # Get domain boundaries domain_boundaries = replisome_state["coordinates"][ replisome_state["domain_index"] == 0 ] # Add motifs outside of this boundary motif_mask = np.logical_or( all_motif_coordinates > domain_boundaries.max(), all_motif_coordinates < domain_boundaries.min(), ) # If the domain contains the origin, elif np.isin(domain_idx, oric_state["domain_index"]): # Get index of the parent domain parent_domain_idx = domain_state["domain_index"][ np.where(domain_state["child_domains"] == domain_idx)[0] ] # Get domain boundaries of the parent domain parent_domain_boundaries = replisome_state["coordinates"][ replisome_state["domain_index"] == parent_domain_idx ] # Add motifs inside this boundary motif_mask = in_bounds( all_motif_coordinates, parent_domain_boundaries.min(), parent_domain_boundaries.max(), ) # If the domain neither contains the origin nor the terminus, else: # Get index of the parent domain parent_domain_idx = domain_state["domain_index"][ np.where(domain_state["child_domains"] == domain_idx)[0] ] # Get domain boundaries of the parent domain parent_domain_boundaries = replisome_state["coordinates"][ replisome_state["domain_index"] == parent_domain_idx ] # Get domain boundaries of this domain domain_boundaries = replisome_state["coordinates"][ replisome_state["domain_index"] == domain_idx ] # Add motifs between the boundaries motif_mask = np.logical_or( in_bounds( all_motif_coordinates, domain_boundaries.max(), parent_domain_boundaries.max(), ), in_bounds( all_motif_coordinates, parent_domain_boundaries.min(), domain_boundaries.min(), ), ) # Append attributes to existing list motif_index.extend(np.nonzero(motif_mask)[0]) motif_coordinates.extend(all_motif_coordinates[motif_mask]) motif_domain_index.extend(np.full(motif_mask.sum(), domain_idx)) return motif_index, motif_coordinates, motif_domain_index # Use function to get attributes for promoters and DnaA boxes TU_index, promoter_coordinates, promoter_domain_index = get_motif_attributes( all_promoter_coordinates ) cistron_index, gene_coordinates, gene_domain_index = get_motif_attributes( all_gene_coordinates ) _, DnaA_box_coordinates, DnaA_box_domain_index = get_motif_attributes( all_DnaA_box_coordinates ) # Add promoters as unique molecules and set attributes # Note: the bound_TF attribute is properly initialized in the function # initialize_transcription_factors n_promoter = len(TU_index) n_tf = len(sim_data.process.transcription_regulation.tf_ids) unique_molecules["promoter"] = create_new_unique_molecules( "promoter", n_promoter, sim_data, unique_id_rng, domain_index=promoter_domain_index, coordinates=promoter_coordinates, TU_index=TU_index, bound_TF=np.zeros((n_promoter, n_tf), dtype=bool), ) # Add genes as unique molecules and set attributes n_gene = len(cistron_index) unique_molecules["gene"] = create_new_unique_molecules( "gene", n_gene, sim_data, unique_id_rng, cistron_index=cistron_index, coordinates=gene_coordinates, domain_index=gene_domain_index, ) # Add DnaA boxes as unique molecules and set attributes n_DnaA_box = len(DnaA_box_coordinates) unique_molecules["DnaA_box"] = create_new_unique_molecules( "DnaA_box", n_DnaA_box, sim_data, unique_id_rng, domain_index=DnaA_box_domain_index, coordinates=DnaA_box_coordinates, DnaA_bound=np.zeros(n_DnaA_box, dtype=bool), )
[docs] def initialize_transcription_factors( bulk_state, unique_molecules, sim_data, random_state ): """ Initialize transcription factors that are bound to the chromosome. For each type of transcription factor, this function calculates the total number of transcription factors that should be bound to the chromosome using the binding probabilities of each transcription factor and the number of available promoter sites. The calculated number of transcription factors are then distributed randomly to promoters, whose bound_TF attributes and submasses are updated correspondingly. """ # Get transcription factor properties from sim_data tf_ids = sim_data.process.transcription_regulation.tf_ids tf_to_tf_type = sim_data.process.transcription_regulation.tf_to_tf_type p_promoter_bound_TF = sim_data.process.transcription_regulation.p_promoter_bound_tf # Build dict that maps TFs to transcription units they regulate delta_prob = sim_data.process.transcription_regulation.delta_prob TF_to_TU_idx = {} for i, tf in enumerate(tf_ids): TF_to_TU_idx[tf] = delta_prob["deltaI"][delta_prob["deltaJ"] == i] # Get views into bulk molecule representations of transcription factors active_tf_view = {} inactive_tf_view = {} active_tf_view_idx = {} inactive_tf_view_idx = {} for tf in tf_ids: tf_idx = bulk_name_to_idx(tf + "[c]", bulk_state["id"]) active_tf_view[tf] = bulk_state["count"][tf_idx] active_tf_view_idx[tf] = tf_idx if tf_to_tf_type[tf] == "1CS": if tf == sim_data.process.transcription_regulation.active_to_bound[tf]: inactive_tf_idx = bulk_name_to_idx( sim_data.process.equilibrium.get_unbound(tf + "[c]"), bulk_state["id"], ) inactive_tf_view[tf] = bulk_state["count"][inactive_tf_idx] else: inactive_tf_idx = bulk_name_to_idx( sim_data.process.transcription_regulation.active_to_bound[tf] + "[c]", bulk_state["id"], ) inactive_tf_view[tf] = bulk_state["count"][inactive_tf_idx] elif tf_to_tf_type[tf] == "2CS": inactive_tf_idx = bulk_name_to_idx( sim_data.process.two_component_system.active_to_inactive_tf[tf + "[c]"], bulk_state["id"], ) inactive_tf_view[tf] = bulk_state["count"][inactive_tf_idx] inactive_tf_view_idx[tf] = inactive_tf_idx # Get masses of active transcription factors tf_indexes = [np.where(bulk_state["id"] == tf_id + "[c]")[0][0] for tf_id in tf_ids] active_tf_masses = ( sim_data.internal_state.bulk_molecules.bulk_data["mass"][tf_indexes] / sim_data.constants.n_avogadro ).asNumber(units.fg) # Get TU indices of promoters TU_index = unique_molecules["promoter"]["TU_index"] # Initialize bound_TF array bound_TF = np.zeros((len(TU_index), len(tf_ids)), dtype=bool) for tf_idx, tf_id in enumerate(tf_ids): # Get counts of transcription factors active_tf_counts = active_tf_view[tf_id] # If there are no active transcription factors at initialization, # continue to the next transcription factor if active_tf_counts == 0: continue # Compute probability of binding the promoter if tf_to_tf_type[tf_id] == "0CS": p_promoter_bound = 1.0 else: inactive_tf_counts = inactive_tf_view[tf_id] p_promoter_bound = p_promoter_bound_TF(active_tf_counts, inactive_tf_counts) # Determine the number of available promoter sites available_promoters = np.isin(TU_index, TF_to_TU_idx[tf_id]) n_available_promoters = available_promoters.sum() # Calculate the number of promoters that should be bound n_to_bind = int( stochasticRound( random_state, np.full(n_available_promoters, p_promoter_bound) ).sum() ) bound_locs = np.zeros(n_available_promoters, dtype=bool) if n_to_bind > 0: # Determine randomly which DNA targets to bind based on which of # the following is more limiting: # number of promoter sites to bind, or number of active # transcription factors bound_locs[ random_state.choice( n_available_promoters, size=min(n_to_bind, active_tf_view[tf_id]), replace=False, ) ] = True # Update count of free transcription factors bulk_state["count"][active_tf_view_idx[tf_id]] -= bound_locs.sum() # Update bound_TF array bound_TF[available_promoters, tf_idx] = bound_locs # Calculate masses of bound TFs mass_diffs = bound_TF.dot(active_tf_masses) # Reset bound_TF attribute of promoters unique_molecules["promoter"]["bound_TF"] = bound_TF # Add mass_diffs array to promoter submass for submass, i in sim_data.submass_name_to_index.items(): unique_molecules["promoter"][f"massDiff_{submass}"] = mass_diffs[:, i]
[docs] def initialize_transcription( bulk_state, unique_molecules, sim_data, random_state, unique_id_rng, ppgpp_regulation, trna_attenuation, ): """ Activate RNA polymerases as unique molecules, and distribute them along lengths of trancription units, while decreasing counts of unactivated RNA polymerases (APORNAP-CPLX[c]). Also initialize unique molecule representations of fully transcribed mRNAs and partially transcribed RNAs, using counts of mRNAs initialized as bulk molecules, and the attributes of initialized RNA polymerases. The counts of full mRNAs represented as bulk molecules are reset to zero. RNA polymerases are placed randomly across the length of each transcription unit, with the synthesis probabilities for each TU determining the number of RNA polymerases placed at each gene. """ # Load parameters rna_lengths = sim_data.process.transcription.rna_data["length"].asNumber() rna_masses = ( sim_data.process.transcription.rna_data["mw"] / sim_data.constants.n_avogadro ).asNumber(units.fg) current_media_id = sim_data.conditions[sim_data.condition]["nutrients"] frac_active_rnap = sim_data.process.transcription.rnapFractionActiveDict[ current_media_id ] inactive_rnap_idx = bulk_name_to_idx( sim_data.molecule_ids.full_RNAP, bulk_state["id"] ) inactive_RNAP_counts = bulk_state["count"][inactive_rnap_idx] rna_sequences = sim_data.process.transcription.transcription_sequences nt_weights = sim_data.process.transcription.transcription_monomer_weights end_weight = sim_data.process.transcription.transcription_end_weight replichore_lengths = sim_data.process.replication.replichore_lengths chromosome_length = replichore_lengths.sum() # Number of rnaPoly to activate n_RNAPs_to_activate = np.int64(frac_active_rnap * inactive_RNAP_counts) # Get attributes of promoters TU_index, bound_TF, domain_index_promoters = attrs( unique_molecules["promoter"], ["TU_index", "bound_TF", "domain_index"] ) # Parameters for rnaSynthProb if ppgpp_regulation: doubling_time = sim_data.condition_to_doubling_time[sim_data.condition] ppgpp_conc = sim_data.growth_rate_parameters.get_ppGpp_conc(doubling_time) basal_prob, _ = sim_data.process.transcription.synth_prob_from_ppgpp( ppgpp_conc, sim_data.process.replication.get_average_copy_number ) ppgpp_scale = basal_prob[TU_index] # Use original delta prob if no ppGpp basal prob ppgpp_scale[ppgpp_scale == 0] = 1 else: basal_prob = sim_data.process.transcription_regulation.basal_prob.copy() ppgpp_scale = 1 if trna_attenuation: basal_prob[sim_data.process.transcription.attenuated_rna_indices] += ( sim_data.process.transcription.attenuation_basal_prob_adjustments ) n_TUs = len(basal_prob) delta_prob_matrix = sim_data.process.transcription_regulation.get_delta_prob_matrix( dense=True, ppgpp=ppgpp_regulation ) # Synthesis probabilities for different categories of genes rna_synth_prob_fractions = sim_data.process.transcription.rnaSynthProbFraction rna_synth_prob_R_protein = sim_data.process.transcription.rnaSynthProbRProtein rna_synth_prob_rna_polymerase = ( sim_data.process.transcription.rnaSynthProbRnaPolymerase ) # Get coordinates and transcription directions of transcription units replication_coordinate = sim_data.process.transcription.rna_data[ "replication_coordinate" ] transcription_direction = sim_data.process.transcription.rna_data["is_forward"] # Determine changes from genetic perturbations genetic_perturbations = {} perturbations = getattr(sim_data, "genetic_perturbations", {}) if len(perturbations) > 0: probability_indexes = [ (index, sim_data.genetic_perturbations[rna_data["id"]]) for index, rna_data in enumerate(sim_data.process.transcription.rna_data) if rna_data["id"] in sim_data.genetic_perturbations ] genetic_perturbations = { "fixedRnaIdxs": [pair[0] for pair in probability_indexes], "fixedSynthProbs": [pair[1] for pair in probability_indexes], } # ID Groups idx_rRNA = np.where(sim_data.process.transcription.rna_data["is_rRNA"])[0] idx_mRNA = np.where(sim_data.process.transcription.rna_data["is_mRNA"])[0] idx_tRNA = np.where(sim_data.process.transcription.rna_data["is_tRNA"])[0] idx_rprotein = np.where( sim_data.process.transcription.rna_data["includes_ribosomal_protein"] )[0] idx_rnap = np.where(sim_data.process.transcription.rna_data["includes_RNAP"])[0] # Calculate probabilities of the RNAP binding to the promoters promoter_init_probs = basal_prob[TU_index] + ppgpp_scale * np.multiply( delta_prob_matrix[TU_index, :], bound_TF ).sum(axis=1) if len(genetic_perturbations) > 0: rescale_initiation_probs( promoter_init_probs, TU_index, genetic_perturbations["fixedSynthProbs"], genetic_perturbations["fixedRnaIdxs"], ) # Adjust probabilities to not be negative promoter_init_probs[promoter_init_probs < 0] = 0.0 promoter_init_probs /= promoter_init_probs.sum() if np.any(promoter_init_probs < 0): raise Exception("Have negative RNA synthesis probabilities") # Adjust synthesis probabilities depending on environment synth_prob_fractions = rna_synth_prob_fractions[current_media_id] # Create masks for different types of RNAs is_mRNA = np.isin(TU_index, idx_mRNA) is_tRNA = np.isin(TU_index, idx_tRNA) is_rRNA = np.isin(TU_index, idx_rRNA) is_rprotein = np.isin(TU_index, idx_rprotein) is_rnap = np.isin(TU_index, idx_rnap) is_fixed = is_tRNA | is_rRNA | is_rprotein | is_rnap # Rescale initiation probabilities based on type of RNA promoter_init_probs[is_mRNA] *= ( synth_prob_fractions["mRna"] / promoter_init_probs[is_mRNA].sum() ) promoter_init_probs[is_tRNA] *= ( synth_prob_fractions["tRna"] / promoter_init_probs[is_tRNA].sum() ) promoter_init_probs[is_rRNA] *= ( synth_prob_fractions["rRna"] / promoter_init_probs[is_rRNA].sum() ) # Set fixed synthesis probabilities for RProteins and RNAPs rescale_initiation_probs( promoter_init_probs, TU_index, np.concatenate( ( rna_synth_prob_R_protein[current_media_id], rna_synth_prob_rna_polymerase[current_media_id], ) ), np.concatenate((idx_rprotein, idx_rnap)), ) assert promoter_init_probs[is_fixed].sum() < 1.0 # Adjust for attenuation that will stop transcription after initiation if trna_attenuation: attenuation_readthrough = { idx: prob for idx, prob in zip( sim_data.process.transcription.attenuated_rna_indices, sim_data.process.transcription.attenuation_readthrough[ sim_data.condition ], ) } readthrough_adjustment = np.array( [attenuation_readthrough.get(idx, 1) for idx in TU_index] ) promoter_init_probs *= readthrough_adjustment scale_the_rest_by = ( 1.0 - promoter_init_probs[is_fixed].sum() ) / promoter_init_probs[~is_fixed].sum() promoter_init_probs[~is_fixed] *= scale_the_rest_by # normalize to length of rna init_prob_length_adjusted = promoter_init_probs * rna_lengths[TU_index] init_prob_normalized = init_prob_length_adjusted / init_prob_length_adjusted.sum() # Sample a multinomial distribution of synthesis probabilities to determine # what RNA are initialized n_initiations = random_state.multinomial(n_RNAPs_to_activate, init_prob_normalized) # Build array of transcription unit indexes for partially transcribed mRNAs # and domain indexes for RNAPs TU_index_partial_RNAs = np.repeat(TU_index, n_initiations) domain_index_rnap = np.repeat(domain_index_promoters, n_initiations) # Build arrays of starting coordinates and transcription directions starting_coordinates = replication_coordinate[TU_index_partial_RNAs] is_forward = transcription_direction[TU_index_partial_RNAs] # Randomly advance RNAPs along the transcription units # TODO (Eran): make sure there aren't any RNAPs at same location on same TU updated_lengths = np.array( random_state.rand(n_RNAPs_to_activate) * rna_lengths[TU_index_partial_RNAs], dtype=int, ) # Rescale boolean array of directions to an array of 1's and -1's. direction_rescaled = (2 * (is_forward - 0.5)).astype(np.int64) # Compute the updated coordinates of RNAPs. Coordinates of RNAPs moving in # the positive direction are increased, whereas coordinates of RNAPs moving # in the negative direction are decreased. updated_coordinates = starting_coordinates + np.multiply( direction_rescaled, updated_lengths ) # Reset coordinates of RNAPs that cross the boundaries between right and # left replichores updated_coordinates[updated_coordinates > replichore_lengths[0]] -= ( chromosome_length ) updated_coordinates[updated_coordinates < -replichore_lengths[1]] += ( chromosome_length ) # Update mass sequences = rna_sequences[TU_index_partial_RNAs] added_mass = computeMassIncrease(sequences, updated_lengths, nt_weights) added_mass[updated_lengths != 0] += end_weight # add endWeight to all new Rna # Masses of partial mRNAs are counted as mRNA mass as they are already # functional, but the masses of other types of partial RNAs are counted as # generic RNA mass. added_RNA_mass = added_mass.copy() added_mRNA_mass = added_mass.copy() is_mRNA_partial_RNAs = np.isin(TU_index_partial_RNAs, idx_mRNA) added_RNA_mass[is_mRNA_partial_RNAs] = 0 added_mRNA_mass[np.logical_not(is_mRNA_partial_RNAs)] = 0 # Add active RNAPs and get their unique indexes unique_molecules["active_RNAP"] = create_new_unique_molecules( "active_RNAP", n_RNAPs_to_activate, sim_data, unique_id_rng, domain_index=domain_index_rnap, coordinates=updated_coordinates, is_forward=is_forward, ) # Decrement counts of bulk inactive RNAPs rnap_idx = bulk_name_to_idx(sim_data.molecule_ids.full_RNAP, bulk_state["id"]) bulk_state["count"][rnap_idx] = inactive_RNAP_counts - n_RNAPs_to_activate # Add partially transcribed RNAs partial_rnas = create_new_unique_molecules( "RNA", n_RNAPs_to_activate, sim_data, unique_id_rng, TU_index=TU_index_partial_RNAs, transcript_length=updated_lengths, is_mRNA=is_mRNA_partial_RNAs, is_full_transcript=np.zeros(n_RNAPs_to_activate, dtype=bool), can_translate=is_mRNA_partial_RNAs, RNAP_index=unique_molecules["active_RNAP"]["unique_index"], massDiff_nonspecific_RNA=added_RNA_mass, massDiff_mRNA=added_mRNA_mass, ) # Get counts of mRNAs initialized as bulk molecules mRNA_ids = sim_data.process.transcription.rna_data["id"][ sim_data.process.transcription.rna_data["is_mRNA"] ] mRNA_idx = bulk_name_to_idx(mRNA_ids, bulk_state["id"]) mRNA_counts = bulk_state["count"][mRNA_idx] # Subtract number of partially transcribed mRNAs that were initialized. # Note: some mRNAs with high degradation rates have more partial mRNAs than # the expected total number of mRNAs - for these mRNAs we simply set the # initial full mRNA counts to be zero. partial_mRNA_counts = np.bincount( TU_index_partial_RNAs[is_mRNA_partial_RNAs], minlength=n_TUs )[idx_mRNA] full_mRNA_counts = (mRNA_counts - partial_mRNA_counts).clip(min=0) # Get array of TU indexes for each full mRNA TU_index_full_mRNAs = np.repeat(idx_mRNA, full_mRNA_counts) # Add fully transcribed mRNAs. The RNAP_index attribute of these molecules # are set to -1. full_rnas = create_new_unique_molecules( "RNA", len(TU_index_full_mRNAs), sim_data, unique_id_rng, TU_index=TU_index_full_mRNAs, transcript_length=rna_lengths[TU_index_full_mRNAs], is_mRNA=np.ones_like(TU_index_full_mRNAs, dtype=bool), is_full_transcript=np.ones_like(TU_index_full_mRNAs, dtype=bool), can_translate=np.ones_like(TU_index_full_mRNAs, dtype=bool), RNAP_index=np.full(TU_index_full_mRNAs.shape, -1, dtype=np.int64), massDiff_mRNA=rna_masses[TU_index_full_mRNAs], ) unique_molecules["RNA"] = np.concatenate((partial_rnas, full_rnas)) # Reset counts of bulk mRNAs to zero bulk_state["count"][mRNA_idx] = 0
[docs] def initialize_chromosomal_segments(unique_molecules, sim_data, unique_id_rng): """ Initialize unique molecule representations of chromosomal segments. All chromosomal segments are assumed to be at their relaxed states upon initialization. """ # Load parameters relaxed_DNA_base_pairs_per_turn = ( sim_data.process.chromosome_structure.relaxed_DNA_base_pairs_per_turn ) terC_index = sim_data.process.chromosome_structure.terC_dummy_molecule_index replichore_lengths = sim_data.process.replication.replichore_lengths min_coordinates = -replichore_lengths[1] max_coordinates = replichore_lengths[0] # Get attributes of replisomes, active RNAPs, chromosome domains, full # chromosomes, and oriCs (replisome_coordinates, replisome_domain_indexes, replisome_unique_indexes) = attrs( unique_molecules["active_replisome"], ["coordinates", "domain_index", "unique_index"], ) ( active_RNAP_coordinates, active_RNAP_domain_indexes, active_RNAP_unique_indexes, ) = attrs( unique_molecules["active_RNAP"], ["coordinates", "domain_index", "unique_index"] ) chromosome_domain_domain_indexes, child_domains = attrs( unique_molecules["chromosome_domain"], ["domain_index", "child_domains"] ) (full_chromosome_domain_indexes,) = attrs( unique_molecules["full_chromosome"], ["domain_index"] ) (origin_domain_indexes,) = attrs(unique_molecules["oriC"], ["domain_index"]) # Initialize chromosomal segment attributes all_boundary_molecule_indexes = np.empty((0, 2), dtype=np.int64) all_boundary_coordinates = np.empty((0, 2), dtype=np.int64) all_segment_domain_indexes = np.array([], dtype=np.int32) all_linking_numbers = np.array([], dtype=np.float64) def get_chromosomal_segment_attributes( coordinates, unique_indexes, spans_oriC, spans_terC ): """ Returns the attributes of all chromosomal segments from a continuous stretch of DNA, given the coordinates and unique indexes of all boundary molecules. """ coordinates_argsort = np.argsort(coordinates) coordinates_sorted = coordinates[coordinates_argsort] unique_indexes_sorted = unique_indexes[coordinates_argsort] # Add dummy molecule at terC if domain spans terC if spans_terC: coordinates_sorted = np.insert( coordinates_sorted, [0, len(coordinates_sorted)], [min_coordinates, max_coordinates], ) unique_indexes_sorted = np.insert( unique_indexes_sorted, [0, len(unique_indexes_sorted)], terC_index ) boundary_molecule_indexes = np.hstack( ( unique_indexes_sorted[:-1][:, np.newaxis], unique_indexes_sorted[1:][:, np.newaxis], ) ) boundary_coordinates = np.hstack( ( coordinates_sorted[:-1][:, np.newaxis], coordinates_sorted[1:][:, np.newaxis], ) ) # Remove segment that spans oriC if the domain does not span oriC if not spans_oriC: oriC_segment_index = np.where( np.sign(boundary_coordinates).sum(axis=1) == 0 )[0] assert len(oriC_segment_index) == 1 boundary_molecule_indexes = np.delete( boundary_molecule_indexes, oriC_segment_index, 0 ) boundary_coordinates = np.delete( boundary_coordinates, oriC_segment_index, 0 ) # Assumes all segments are at their relaxed state at initialization linking_numbers = ( boundary_coordinates[:, 1] - boundary_coordinates[:, 0] ) / relaxed_DNA_base_pairs_per_turn return boundary_molecule_indexes, boundary_coordinates, linking_numbers # Loop through each domain index for domain_index in chromosome_domain_domain_indexes: domain_spans_oriC = domain_index in origin_domain_indexes domain_spans_terC = domain_index in full_chromosome_domain_indexes # Get coordinates and indexes of all RNAPs on this domain RNAP_domain_mask = active_RNAP_domain_indexes == domain_index molecule_coordinates_this_domain = active_RNAP_coordinates[RNAP_domain_mask] molecule_indexes_this_domain = active_RNAP_unique_indexes[RNAP_domain_mask] # Append coordinates and indexes of replisomes on this domain, if any if not domain_spans_oriC: replisome_domain_mask = replisome_domain_indexes == domain_index molecule_coordinates_this_domain = np.concatenate( ( molecule_coordinates_this_domain, replisome_coordinates[replisome_domain_mask], ) ) molecule_indexes_this_domain = np.concatenate( ( molecule_indexes_this_domain, replisome_unique_indexes[replisome_domain_mask], ) ) # Append coordinates and indexes of parent domain replisomes, if any if not domain_spans_terC: parent_domain_index = chromosome_domain_domain_indexes[ np.where(child_domains == domain_index)[0][0] ] replisome_parent_domain_mask = ( replisome_domain_indexes == parent_domain_index ) molecule_coordinates_this_domain = np.concatenate( ( molecule_coordinates_this_domain, replisome_coordinates[replisome_parent_domain_mask], ) ) molecule_indexes_this_domain = np.concatenate( ( molecule_indexes_this_domain, replisome_unique_indexes[replisome_parent_domain_mask], ) ) # Get attributes of chromosomal segments on this domain ( boundary_molecule_indexes_this_domain, boundary_coordinates_this_domain, linking_numbers_this_domain, ) = get_chromosomal_segment_attributes( molecule_coordinates_this_domain, molecule_indexes_this_domain, domain_spans_oriC, domain_spans_terC, ) # Append to existing array of attributes all_boundary_molecule_indexes = np.vstack( [all_boundary_molecule_indexes, boundary_molecule_indexes_this_domain] ) all_boundary_coordinates = np.vstack( (all_boundary_coordinates, boundary_coordinates_this_domain) ) all_segment_domain_indexes = np.concatenate( ( all_segment_domain_indexes, np.full(len(linking_numbers_this_domain), domain_index, dtype=np.int32), ) ) all_linking_numbers = np.concatenate( (all_linking_numbers, linking_numbers_this_domain) ) # Confirm total counts of all segments n_segments = len(all_linking_numbers) assert ( n_segments == len(active_RNAP_unique_indexes) + 1.5 * len(replisome_unique_indexes) + 1 ) # Add chromosomal segments unique_molecules["chromosomal_segment"] = create_new_unique_molecules( "chromosomal_segment", n_segments, sim_data, unique_id_rng, boundary_molecule_indexes=all_boundary_molecule_indexes, boundary_coordinates=all_boundary_coordinates, domain_index=all_segment_domain_indexes, linking_number=all_linking_numbers, )
[docs] def initialize_translation( bulk_state, unique_molecules, sim_data, random_state, unique_id_rng ): """ Activate ribosomes as unique molecules, and distribute them along lengths of mRNAs, while decreasing counts of unactivated ribosomal subunits (30S and 50S). Ribosomes are placed randomly across the lengths of each mRNA. """ # Load translation parameters current_nutrients = sim_data.conditions[sim_data.condition]["nutrients"] frac_active_ribosome = sim_data.process.translation.ribosomeFractionActiveDict[ current_nutrients ] protein_sequences = sim_data.process.translation.translation_sequences protein_lengths = sim_data.process.translation.monomer_data["length"].asNumber() translation_efficiencies = normalize( sim_data.process.translation.translation_efficiencies_by_monomer ) aa_weights_incorporated = sim_data.process.translation.translation_monomer_weights end_weight = sim_data.process.translation.translation_end_weight cistron_lengths = sim_data.process.transcription.cistron_data["length"].asNumber( units.nt ) TU_ids = sim_data.process.transcription.rna_data["id"] monomer_index_to_tu_indexes = sim_data.relation.monomer_index_to_tu_indexes monomer_index_to_cistron_index = { i: sim_data.process.transcription._cistron_id_to_index[monomer["cistron_id"]] for (i, monomer) in enumerate(sim_data.process.translation.monomer_data) } # Get attributes of RNAs ( TU_index_all_RNAs, length_all_RNAs, is_mRNA, is_full_transcript_all_RNAs, unique_index_all_RNAs, ) = attrs( unique_molecules["RNA"], [ "TU_index", "transcript_length", "is_mRNA", "is_full_transcript", "unique_index", ], ) TU_index_mRNAs = TU_index_all_RNAs[is_mRNA] length_mRNAs = length_all_RNAs[is_mRNA] is_full_transcript_mRNAs = is_full_transcript_all_RNAs[is_mRNA] unique_index_mRNAs = unique_index_all_RNAs[is_mRNA] # Calculate available template lengths of each mRNA cistron from fully # transcribed mRNA transcription units TU_index_full_mRNAs = TU_index_mRNAs[is_full_transcript_mRNAs] TU_counts_full_mRNAs = np.bincount(TU_index_full_mRNAs, minlength=len(TU_ids)) cistron_counts_full_mRNAs = ( sim_data.process.transcription.cistron_tu_mapping_matrix.dot( TU_counts_full_mRNAs ) ) available_cistron_lengths = np.multiply(cistron_counts_full_mRNAs, cistron_lengths) # Add available template lengths from each partially transcribed mRNAs TU_index_incomplete_mRNAs = TU_index_mRNAs[np.logical_not(is_full_transcript_mRNAs)] length_incomplete_mRNAs = length_mRNAs[np.logical_not(is_full_transcript_mRNAs)] TU_index_to_mRNA_lengths = {} for TU_index, length in zip(TU_index_incomplete_mRNAs, length_incomplete_mRNAs): TU_index_to_mRNA_lengths.setdefault(TU_index, []).append(length) for TU_index, available_lengths in TU_index_to_mRNA_lengths.items(): cistron_indexes = sim_data.process.transcription.rna_id_to_cistron_indexes( TU_ids[TU_index] ) cistron_start_positions = np.array( [ sim_data.process.transcription.cistron_start_end_pos_in_tu[ (cistron_index, TU_index) ][0] for cistron_index in cistron_indexes ] ) for length in available_lengths: available_cistron_lengths[cistron_indexes] += np.clip( length - cistron_start_positions, 0, cistron_lengths[cistron_indexes] ) # Find number of ribosomes to activate ribosome30S_idx = bulk_name_to_idx( sim_data.molecule_ids.s30_full_complex, bulk_state["id"] ) ribosome30S = bulk_state["count"][ribosome30S_idx] ribosome50S_idx = bulk_name_to_idx( sim_data.molecule_ids.s50_full_complex, bulk_state["id"] ) ribosome50S = bulk_state["count"][ribosome50S_idx] inactive_ribosome_count = np.minimum(ribosome30S, ribosome50S) n_ribosomes_to_activate = np.int64(frac_active_ribosome * inactive_ribosome_count) # Add total available template lengths as weights and normalize protein_init_probs = normalize( available_cistron_lengths[sim_data.relation.cistron_to_monomer_mapping] * translation_efficiencies ) # Sample a multinomial distribution of synthesis probabilities to determine # which types of mRNAs are initialized n_new_proteins = random_state.multinomial( n_ribosomes_to_activate, protein_init_probs ) # Build attributes for active ribosomes protein_indexes = np.empty(n_ribosomes_to_activate, np.int64) cistron_start_positions_on_mRNA = np.empty(n_ribosomes_to_activate, np.int64) positions_on_mRNA_from_cistron_start_site = np.empty( n_ribosomes_to_activate, np.int64 ) mRNA_indexes = np.empty(n_ribosomes_to_activate, np.int64) start_index = 0 nonzero_count = n_new_proteins > 0 for protein_index, protein_counts in zip( np.arange(n_new_proteins.size)[nonzero_count], n_new_proteins[nonzero_count] ): # Set protein index protein_indexes[start_index : start_index + protein_counts] = protein_index # Get index of cistron corresponding to this protein cistron_index = monomer_index_to_cistron_index[protein_index] # Initialize list of available lengths for each transcript and the # indexes of each transcript in the list of mRNA attributes available_lengths = [] attribute_indexes = [] cistron_start_positions = [] # Distribute ribosomes among mRNAs that produce this protein, weighted # by their lengths for TU_index in monomer_index_to_tu_indexes[protein_index]: attribute_indexes_this_TU = np.where(TU_index_mRNAs == TU_index)[0] cistron_start_position = ( sim_data.process.transcription.cistron_start_end_pos_in_tu[ (cistron_index, TU_index) ][0] ) available_lengths.extend( np.clip( length_mRNAs[attribute_indexes_this_TU] - cistron_start_position, 0, cistron_lengths[cistron_index], ) ) attribute_indexes.extend(attribute_indexes_this_TU) cistron_start_positions.extend( [cistron_start_position] * len(attribute_indexes_this_TU) ) available_lengths = np.array(available_lengths) attribute_indexes = np.array(attribute_indexes) cistron_start_positions = np.array(cistron_start_positions) n_ribosomes_per_RNA = random_state.multinomial( protein_counts, normalize(available_lengths) ) # Get unique indexes of each mRNA mRNA_indexes[start_index : start_index + protein_counts] = np.repeat( unique_index_mRNAs[attribute_indexes], n_ribosomes_per_RNA ) # Get full length of this polypeptide peptide_full_length = protein_lengths[protein_index] # Randomly place ribosomes along the length of each mRNA, capped by the # mRNA length expected from the full polypeptide length to prevent # ribosomes from overshooting full peptide lengths cistron_start_positions_on_mRNA[start_index : start_index + protein_counts] = ( np.repeat(cistron_start_positions, n_ribosomes_per_RNA) ) positions_on_mRNA_from_cistron_start_site[ start_index : start_index + protein_counts ] = np.floor( random_state.rand(protein_counts) * np.repeat( np.minimum(available_lengths, peptide_full_length * 3), n_ribosomes_per_RNA, ) ) start_index += protein_counts # Calculate the lengths of the partial polypeptide, and rescale position on # mRNA to be a multiple of three using this peptide length peptide_lengths = np.floor_divide(positions_on_mRNA_from_cistron_start_site, 3) positions_on_mRNA = cistron_start_positions_on_mRNA + 3 * peptide_lengths # Update masses of partially translated proteins sequences = protein_sequences[protein_indexes] mass_increase_protein = computeMassIncrease( sequences, peptide_lengths, aa_weights_incorporated ) # Add end weight mass_increase_protein[peptide_lengths != 0] += end_weight # Add active ribosomes unique_molecules["active_ribosome"] = create_new_unique_molecules( "active_ribosome", n_ribosomes_to_activate, sim_data, unique_id_rng, protein_index=protein_indexes, peptide_length=peptide_lengths, mRNA_index=mRNA_indexes, pos_on_mRNA=positions_on_mRNA, massDiff_protein=mass_increase_protein, ) # Decrease counts of free 30S and 50S ribosomal subunits bulk_state["count"][ribosome30S_idx] = ribosome30S - n_ribosomes_to_activate bulk_state["count"][ribosome50S_idx] = ribosome50S - n_ribosomes_to_activate
[docs] def determine_chromosome_state( tau: Unum, replichore_length: Unum, n_max_replisomes: int, place_holder: int, cell_mass: Unum, critical_mass: Unum, replication_rate: float, ) -> tuple[ dict[str, npt.NDArray[np.int32]], dict[str, npt.NDArray[Any]], dict[str, npt.NDArray[np.int32]], ]: """ Calculates the attributes of oriC's, replisomes, and chromosome domains on the chromosomes at the beginning of the cell cycle. Args: tau: the doubling time of the cell (with Unum time unit) replichore_length: the amount of DNA to be replicated per fork, usually half of the genome, in base-pairs (with Unum nucleotide unit) n_max_replisomes: the maximum number of replisomes that can be formed given the initial counts of replisome subunits place_holder: placeholder value for chromosome domains without child domains cell_mass: total mass of the cell with mass units (with Unum mass unit) critical_mass: mass per oriC before replication is initiated (with Unum mass unit) replication_rate: rate of nucleotide elongation (with Unum nucleotides per time unit) Returns: Three dictionaries, each containing updates to attributes of a unique molecule type. - ``oric_state``: dictionary of the following format:: {'domain_index': a vector of integers indicating which chromosome domain the oriC sequence belongs to.} - ``replisome_state``: dictionary of the following format:: {'coordinates': a vector of integers that indicates where the replisomes are located on the chromosome relative to the origin in base pairs, 'right_replichore': a vector of boolean values that indicates whether the replisome is on the right replichore (True) or the left replichore (False), 'domain_index': a vector of integers indicating which chromosome domain the replisomes belong to. The index of the "mother" domain of the replication fork is assigned to the replisome} - ``domain_state``: dictionary of the following format:: {'domain_index': the indexes of the domains, 'child_domains': the (n_domain X 2) array of the domain indexes of the two children domains that are connected on the oriC side with the given domain.} """ # All inputs must be positive numbers unitless_tau = tau.asNumber(units.s) unitless_replichore_length = replichore_length.asNumber(units.nt) assert unitless_tau >= 0, "tau value can't be negative." assert unitless_replichore_length > 0, "replichore_length must be positive." # Convert to unitless unitless_cell_mass = cell_mass.asNumber(units.fg) unitless_critical_mass = critical_mass.asNumber(units.fg) # Calculate the maximum number of replication rounds given the maximum # count of replisomes n_max_rounds = int(np.log2(n_max_replisomes / 2 + 1)) # Calculate the number of active replication rounds n_rounds = min( n_max_rounds, max(0, int(np.ceil(np.log2(unitless_cell_mass / unitless_critical_mass)))), ) # Initialize arrays for replisomes n_replisomes = 2 * (2**n_rounds - 1) coordinates = np.zeros(n_replisomes, dtype=np.int64) right_replichore_replisome = np.zeros(n_replisomes, dtype=bool) domain_index_replisome = np.zeros(n_replisomes, dtype=np.int32) # Initialize child domain array for chromosome domains n_domains = 2 ** (n_rounds + 1) - 1 child_domains = np.full((n_domains, 2), place_holder, dtype=np.int32) # Set domain_index attribute of oriC's and chromosome domains domain_index_oric = np.arange( 2**n_rounds - 1, 2 ** (n_rounds + 1) - 1, dtype=np.int32 ) domain_index_domains = np.arange(0, n_domains, dtype=np.int32) def n_events_before_this_round(round_idx): """ Calculates the number of replication events that happen before the replication round index given as an argument. Since 2**i events happen at each round i = 0, 1, ..., the sum of the number of events before round j is 2**j - 1. """ return 2**round_idx - 1 # Loop through active replication rounds, starting from the oldest round. # If n_round = 0 skip loop entirely - no active replication round. for round_idx in np.arange(n_rounds): # Determine at which location (base) of the chromosome the replication # forks should be initialized to round_critical_mass = 2**round_idx * unitless_critical_mass growth_rate = np.log(2) / unitless_tau replication_time = ( np.log(unitless_cell_mass / round_critical_mass) / growth_rate ) # TODO: this should handle completed replication (instead of taking min) # for accuracy but will likely never start with multiple chromosomes fork_location = min( np.floor(replication_time * replication_rate), unitless_replichore_length - 1, ) # Add 2^n initiation events per round. A single initiation event # generates two replication forks. n_events_this_round = 2**round_idx # Set attributes of replisomes for this replication round coordinates[ 2 * n_events_before_this_round(round_idx) : 2 * n_events_before_this_round(round_idx + 1) ] = np.tile(np.array([fork_location, -fork_location]), n_events_this_round) right_replichore_replisome[ 2 * n_events_before_this_round(round_idx) : 2 * n_events_before_this_round(round_idx + 1) ] = np.tile(np.array([True, False]), n_events_this_round) for i, domain_index in enumerate( np.arange( n_events_before_this_round(round_idx), n_events_before_this_round(round_idx + 1), ) ): domain_index_replisome[ 2 * n_events_before_this_round(round_idx) + 2 * i : 2 * n_events_before_this_round(round_idx) + 2 * (i + 1) ] = np.repeat(domain_index, 2) # Set attributes of chromosome domains for this replication round for i, domain_index in enumerate( np.arange( n_events_before_this_round(round_idx + 1), n_events_before_this_round(round_idx + 2), 2, ) ): child_domains[n_events_before_this_round(round_idx) + i, :] = np.array( [domain_index, domain_index + 1] ) # Convert to numpy arrays and wrap into dictionaries oric_state = {"domain_index": domain_index_oric} replisome_state = { "coordinates": coordinates, "right_replichore": right_replichore_replisome, "domain_index": domain_index_replisome, } domain_state = { "child_domains": child_domains, "domain_index": domain_index_domains, } return oric_state, replisome_state, domain_state
[docs] def rescale_initiation_probs(init_probs, TU_index, fixed_synth_probs, fixed_TU_indexes): """ Rescales the initiation probabilities of each promoter such that the total synthesis probabilities of certain types of RNAs are fixed to a predetermined value. For instance, if there are two copies of promoters for RNA A, whose synthesis probability should be fixed to 0.1, each promoter is given an initiation probability of 0.05. """ for rna_idx, synth_prob in zip(fixed_TU_indexes, fixed_synth_probs): fixed_rna_mask = TU_index == rna_idx init_probs[fixed_rna_mask] = synth_prob / fixed_rna_mask.sum()
[docs] def calculate_cell_mass(bulk_state, unique_molecules, sim_data): """ Calculates cell mass in femtograms. """ bulk_submass_names = [ f"{submass}_submass" for submass in sim_data.submass_name_to_index.keys() ] cell_mass = ( bulk_state["count"] .dot(rfn.structured_to_unstructured(bulk_state[bulk_submass_names])) .sum() ) if len(unique_molecules) > 0: unique_masses = sim_data.internal_state.unique_molecule.unique_molecule_masses[ "mass" ].asNumber(units.fg / units.mol) / sim_data.constants.n_avogadro.asNumber( 1 / units.mol ) unique_ids = sim_data.internal_state.unique_molecule.unique_molecule_masses[ "id" ] unique_submass_names = [ f"massDiff_{submass}" for submass in sim_data.submass_name_to_index.keys() ] for unique_id, unique_submasses in zip(unique_ids, unique_masses): if unique_id in unique_molecules: cell_mass += ( unique_molecules[unique_id]["_entryState"].sum() * unique_submasses ).sum() cell_mass += rfn.structured_to_unstructured( unique_molecules[unique_id][unique_submass_names] ).sum() return units.fg * cell_mass
[docs] def initialize_trna_charging( bulk_state: np.ndarray, unique_molecules: dict[str, np.ndarray], sim_data: Any, variable_elongation: bool, ): """ Initializes charged tRNA from uncharged tRNA and amino acids Args: bulk_state: Structured array with IDs and counts of all bulk molecules unique_molecules: Mapping of unique molecule names to structured arrays of their current simulation states sim_data: Simulation data loaded from pickle generated by ParCa variable_elongation: Sets max elongation higher if True .. note:: Does not adjust for mass of amino acids on charged tRNA (~0.01% of cell mass) """ # Calculate cell volume for concentrations cell_volume = ( calculate_cell_mass(bulk_state, unique_molecules, sim_data) / sim_data.constants.cell_density ) counts_to_molar = 1 / (sim_data.constants.n_avogadro * cell_volume) # Get molecule views and concentrations transcription = sim_data.process.transcription aa_from_synthetase = transcription.aa_from_synthetase aa_from_trna = transcription.aa_from_trna synthetases = counts( bulk_state, bulk_name_to_idx(transcription.synthetase_names, bulk_state["id"]) ) uncharged_trna_idx = bulk_name_to_idx( transcription.uncharged_trna_names, bulk_state["id"] ) uncharged_trna = counts(bulk_state, uncharged_trna_idx) charged_trna_idx = bulk_name_to_idx( transcription.charged_trna_names, bulk_state["id"] ) charged_trna = counts(bulk_state, charged_trna_idx) aas = counts( bulk_state, bulk_name_to_idx(sim_data.molecule_groups.amino_acids, bulk_state["id"]), ) ribosome_counts = unique_molecules["active_ribosome"]["_entryState"].sum() synthetase_conc = counts_to_molar * np.dot(aa_from_synthetase, synthetases) uncharged_trna_conc = counts_to_molar * np.dot(aa_from_trna, uncharged_trna) charged_trna_conc = counts_to_molar * np.dot(aa_from_trna, charged_trna) aa_conc = counts_to_molar * aas ribosome_conc = counts_to_molar * ribosome_counts # Estimate fraction of amino acids from sequences, excluding first index for padding of -1 _, aas_in_sequences = np.unique( sim_data.process.translation.translation_sequences, return_counts=True ) f = aas_in_sequences[1:] / np.sum(aas_in_sequences[1:]) # Estimate initial charging state constants = sim_data.constants transcription = sim_data.process.transcription metabolism = sim_data.process.metabolism elongation_max = ( constants.ribosome_elongation_rate_max if variable_elongation else constants.ribosome_elongation_rate_basal ) charging_params = { "kS": constants.synthetase_charging_rate.asNumber(1 / units.s), "KMaa": transcription.aa_kms.asNumber(MICROMOLAR_UNITS), "KMtf": transcription.trna_kms.asNumber(MICROMOLAR_UNITS), "krta": constants.Kdissociation_charged_trna_ribosome.asNumber( MICROMOLAR_UNITS ), "krtf": constants.Kdissociation_uncharged_trna_ribosome.asNumber( MICROMOLAR_UNITS ), "max_elong_rate": float(elongation_max.asNumber(units.aa / units.s)), "charging_mask": np.array( [ aa not in REMOVED_FROM_CHARGING for aa in sim_data.molecule_groups.amino_acids ] ), "unit_conversion": metabolism.get_amino_acid_conc_conversion(MICROMOLAR_UNITS), } fraction_charged, *_ = calculate_trna_charging( synthetase_conc, uncharged_trna_conc, charged_trna_conc, aa_conc, ribosome_conc, f, charging_params, ) # Update counts of tRNA to match charging total_trna_counts = uncharged_trna + charged_trna charged_trna_counts = np.round( total_trna_counts * np.dot(fraction_charged, aa_from_trna) ) uncharged_trna_counts = total_trna_counts - charged_trna_counts bulk_state["count"][charged_trna_idx] = charged_trna_counts bulk_state["count"][uncharged_trna_idx] = uncharged_trna_counts