"""
Functions to initialize molecule states from sim_data.
"""
import numpy as np
import numpy.typing as npt
from numpy.lib import recfunctions as rfn
from typing import Any
from unum import Unum
from ecoli.library.schema import attrs, bulk_name_to_idx, create_unique_indexes, counts
from ecoli.processes.polypeptide_elongation import (
calculate_trna_charging,
REMOVED_FROM_CHARGING,
MICROMOLAR_UNITS,
)
from wholecell.utils import units
from wholecell.utils.fitting import (
countsFromMassAndExpression,
masses_and_counts_for_homeostatic_target,
normalize,
)
from wholecell.utils.mc_complexation import mccFormComplexesWithPrebuiltMatrices
from wholecell.utils.polymerize import computeMassIncrease
from wholecell.utils.random import stochasticRound
RAND_MAX = 2**31
[docs]
def create_bulk_container(
sim_data,
n_seeds=1,
condition=None,
seed=0,
ppgpp_regulation=True,
trna_attenuation=True,
mass_coeff=1,
form_complexes=True,
):
try:
old_condition = sim_data.condition
if condition is not None:
sim_data.condition = condition
media_id = sim_data.conditions[sim_data.condition]["nutrients"]
exchange_data = sim_data.external_state.exchange_data_from_media(media_id)
import_molecules = set(
exchange_data["importUnconstrainedExchangeMolecules"]
) | set(exchange_data["importConstrainedExchangeMolecules"])
random_state = np.random.RandomState(seed=seed)
# Construct bulk container
ids_molecules = sim_data.internal_state.bulk_molecules.bulk_data["id"]
average_container = np.array(
[mol_data for mol_data in zip(ids_molecules, np.zeros(len(ids_molecules)))],
dtype=[("id", ids_molecules.dtype), ("count", np.float64)],
)
for n in range(n_seeds):
random_state = np.random.RandomState(seed=seed + n)
average_container["count"] += initialize_bulk_counts(
sim_data,
media_id,
import_molecules,
random_state,
mass_coeff,
ppgpp_regulation,
trna_attenuation,
form_complexes=form_complexes,
)["count"]
except Exception:
raise RuntimeError(
"sim_data might not be fully initialized. "
"Make sure all attributes have been set before "
"using this function."
)
sim_data.condition = old_condition
average_container["count"] = average_container["count"] / n_seeds
return average_container
[docs]
def initialize_bulk_counts(
sim_data,
media_id,
import_molecules,
random_state,
mass_coeff,
ppgpp_regulation,
trna_attenuation,
form_complexes=True,
):
# Allocate count array to populate
bulk_counts = np.zeros(
len(sim_data.internal_state.bulk_molecules.bulk_data["id"]), dtype=int
)
# Set protein counts from expression
initialize_protein_monomers(
bulk_counts,
sim_data,
random_state,
mass_coeff,
ppgpp_regulation,
trna_attenuation,
)
# Set RNA counts from expression
initialize_rna(
bulk_counts,
sim_data,
random_state,
mass_coeff,
ppgpp_regulation,
trna_attenuation,
)
# Set mature RNA counts
initialize_mature_RNA(bulk_counts, sim_data)
# Set other biomass components
set_small_molecule_counts(
bulk_counts, sim_data, media_id, import_molecules, mass_coeff
)
# Form complexes
if form_complexes:
initialize_complexation(bulk_counts, sim_data, random_state)
bulk_masses = sim_data.internal_state.bulk_molecules.bulk_data["mass"].asNumber(
units.fg / units.mol
) / sim_data.constants.n_avogadro.asNumber(1 / units.mol)
bulk_submasses = []
bulk_submass_dtypes = []
for submass, idx in sim_data.submass_name_to_index.items():
bulk_submasses.append(bulk_masses[:, idx])
bulk_submass_dtypes.append((f"{submass}_submass", np.float64))
bulk_ids = sim_data.internal_state.bulk_molecules.bulk_data.struct_array["id"]
bulk_array = np.array(
[mol_data for mol_data in zip(bulk_ids, bulk_counts, *bulk_submasses)],
dtype=[("id", bulk_ids.dtype), ("count", int)] + bulk_submass_dtypes,
)
return bulk_array
[docs]
def initialize_unique_molecules(
bulk_state,
sim_data,
cell_mass,
random_state,
unique_id_rng,
superhelical_density,
ppgpp_regulation,
trna_attenuation,
mechanistic_replisome,
):
unique_molecules = {}
# Initialize counts of full chromosomes
initialize_full_chromosome(unique_molecules, sim_data, unique_id_rng)
# Initialize unique molecules relevant to replication
initialize_replication(
bulk_state,
unique_molecules,
sim_data,
cell_mass,
mechanistic_replisome,
unique_id_rng,
)
# Initialize bound transcription factors
initialize_transcription_factors(
bulk_state, unique_molecules, sim_data, random_state
)
# Initialize active RNAPs and unique molecule representations of RNAs
initialize_transcription(
bulk_state,
unique_molecules,
sim_data,
random_state,
unique_id_rng,
ppgpp_regulation,
trna_attenuation,
)
# Initialize linking numbers of chromosomal segments
if superhelical_density:
initialize_chromosomal_segments(unique_molecules, sim_data, unique_id_rng)
else:
unique_molecules["chromosomal_segment"] = create_new_unique_molecules(
"chromosomal_segment", 0, sim_data, unique_id_rng
)
# Initialize active ribosomes
initialize_translation(
bulk_state, unique_molecules, sim_data, random_state, unique_id_rng
)
return unique_molecules
[docs]
def create_new_unique_molecules(name, n_mols, sim_data, random_state, **attrs):
"""
Helper function to create a new Numpy structured array with n_mols
instances of the unique molecule called name. Accepts keyword arguments
that become initial values for specified attributes of the new molecules.
"""
dtypes = list(
sim_data.internal_state.unique_molecule.unique_molecule_definitions[
name
].items()
)
submasses = list(sim_data.submass_name_to_index)
dtypes += [(f"massDiff_{submass}", "<f8") for submass in submasses]
dtypes += [("_entryState", "i1"), ("unique_index", "<i8")]
unique_mols = np.zeros(n_mols, dtype=dtypes)
for attr_name, attr_value in attrs.items():
unique_mols[attr_name] = attr_value
unique_mols["unique_index"] = create_unique_indexes(n_mols, random_state)
unique_mols["_entryState"] = 1
return unique_mols
[docs]
def initialize_protein_monomers(
bulk_counts, sim_data, random_state, mass_coeff, ppgpp_regulation, trna_attenuation
):
monomer_mass = (
mass_coeff
* sim_data.mass.get_component_masses(
sim_data.condition_to_doubling_time[sim_data.condition]
)["proteinMass"]
/ sim_data.mass.avg_cell_to_initial_cell_conversion_factor
)
# TODO: unify this logic with the parca so it doesn]t fall out of step
# again (look at teh calProteinCounts function)
transcription = sim_data.process.transcription
if ppgpp_regulation:
rna_expression = sim_data.calculate_ppgpp_expression(sim_data.condition)
else:
rna_expression = transcription.rna_expression[sim_data.condition]
if trna_attenuation:
# Need to adjust expression (calculated without attenuation) by basal_adjustment
# to get the expected expression without any attenuation and then multiply
# by the condition readthrough probability to get the condition specific expression
readthrough = transcription.attenuation_readthrough[sim_data.condition]
basal_adjustment = transcription.attenuation_readthrough["basal"]
rna_expression[transcription.attenuated_rna_indices] *= (
readthrough / basal_adjustment
)
monomer_expression = normalize(
sim_data.process.transcription.cistron_tu_mapping_matrix.dot(rna_expression)[
sim_data.relation.cistron_to_monomer_mapping
]
* sim_data.process.translation.translation_efficiencies_by_monomer
/ (
np.log(2)
/ sim_data.condition_to_doubling_time[sim_data.condition].asNumber(units.s)
+ sim_data.process.translation.monomer_data["deg_rate"].asNumber(
1 / units.s
)
)
)
n_monomers = countsFromMassAndExpression(
monomer_mass.asNumber(units.g),
sim_data.process.translation.monomer_data["mw"].asNumber(units.g / units.mol),
monomer_expression,
sim_data.constants.n_avogadro.asNumber(1 / units.mol),
)
# Get indices for monomers in bulk counts array
monomer_ids = sim_data.process.translation.monomer_data["id"]
bulk_ids = sim_data.internal_state.bulk_molecules.bulk_data["id"]
monomer_idx = bulk_name_to_idx(monomer_ids, bulk_ids)
# Calculate initial counts of each monomer from mutinomial distribution
bulk_counts[monomer_idx] = random_state.multinomial(n_monomers, monomer_expression)
[docs]
def initialize_rna(
bulk_counts, sim_data, random_state, mass_coeff, ppgpp_regulation, trna_attenuation
):
"""
Initializes counts of RNAs in the bulk molecule container using RNA
expression data. mRNA counts are also initialized here, but is later reset
to zero when the representations for mRNAs are moved to the unique molecule
container.
"""
transcription = sim_data.process.transcription
rna_mass = (
mass_coeff
* sim_data.mass.get_component_masses(
sim_data.condition_to_doubling_time[sim_data.condition]
)["rnaMass"]
/ sim_data.mass.avg_cell_to_initial_cell_conversion_factor
)
if ppgpp_regulation:
rna_expression = sim_data.calculate_ppgpp_expression(sim_data.condition)
else:
rna_expression = normalize(transcription.rna_expression[sim_data.condition])
if trna_attenuation:
# Need to adjust expression (calculated without attenuation) by basal_adjustment
# to get the expected expression without any attenuation and then multiply
# by the condition readthrough probability to get the condition specific expression
readthrough = transcription.attenuation_readthrough[sim_data.condition]
basal_adjustment = transcription.attenuation_readthrough["basal"]
rna_expression[transcription.attenuated_rna_indices] *= (
readthrough / basal_adjustment
)
rna_expression /= rna_expression.sum()
n_rnas = countsFromMassAndExpression(
rna_mass.asNumber(units.g),
transcription.rna_data["mw"].asNumber(units.g / units.mol),
rna_expression,
sim_data.constants.n_avogadro.asNumber(1 / units.mol),
)
# Get indices for monomers in bulk counts array
rna_ids = transcription.rna_data["id"]
bulk_ids = sim_data.internal_state.bulk_molecules.bulk_data["id"]
rna_idx = bulk_name_to_idx(rna_ids, bulk_ids)
# Calculate initial counts of each RNA from mutinomial distribution
bulk_counts[rna_idx] = random_state.multinomial(n_rnas, rna_expression)
[docs]
def initialize_mature_RNA(bulk_counts, sim_data):
"""
Initializes counts of mature RNAs in the bulk molecule container using the
counts of unprocessed RNAs. Also consolidates the different variants of each
rRNA molecule into the main type.
"""
transcription = sim_data.process.transcription
rna_data = transcription.rna_data
unprocessed_rna_ids = rna_data["id"][rna_data["is_unprocessed"]]
bulk_ids = sim_data.internal_state.bulk_molecules.bulk_data["id"]
unprocessed_rna_idx = bulk_name_to_idx(unprocessed_rna_ids, bulk_ids)
# Skip if there are no unprocessed RNAs represented
if len(unprocessed_rna_ids) > 0:
mature_rna_ids = transcription.mature_rna_data["id"]
maturation_stoich_matrix = transcription.rna_maturation_stoich_matrix
mature_rna_idx = bulk_name_to_idx(mature_rna_ids, bulk_ids)
# Get counts of unprocessed RNAs
unprocessed_rna_counts = bulk_counts[unprocessed_rna_idx]
# Assume all unprocessed RNAs are converted to mature RNAs
bulk_counts[unprocessed_rna_idx] = 0
bulk_counts[mature_rna_idx] += maturation_stoich_matrix.dot(
unprocessed_rna_counts
)
# Get IDs of rRNAs
main_23s_rRNA_id = sim_data.molecule_groups.s50_23s_rRNA[0]
main_16s_rRNA_id = sim_data.molecule_groups.s30_16s_rRNA[0]
main_5s_rRNA_id = sim_data.molecule_groups.s50_5s_rRNA[0]
variant_23s_rRNA_ids = sim_data.molecule_groups.s50_23s_rRNA[1:]
variant_16s_rRNA_ids = sim_data.molecule_groups.s30_16s_rRNA[1:]
variant_5s_rRNA_ids = sim_data.molecule_groups.s50_5s_rRNA[1:]
# Get indices of main and variant rRNAs
main_23s_rRNA_idx = bulk_name_to_idx(main_23s_rRNA_id, bulk_ids)
main_16s_rRNA_idx = bulk_name_to_idx(main_16s_rRNA_id, bulk_ids)
main_5s_rRNA_idx = bulk_name_to_idx(main_5s_rRNA_id, bulk_ids)
variant_23s_rRNA_idx = bulk_name_to_idx(variant_23s_rRNA_ids, bulk_ids)
variant_16s_rRNA_idx = bulk_name_to_idx(variant_16s_rRNA_ids, bulk_ids)
variant_5s_rRNA_idx = bulk_name_to_idx(variant_5s_rRNA_ids, bulk_ids)
# Evolve states
bulk_counts[main_23s_rRNA_idx] += bulk_counts[variant_23s_rRNA_idx].sum()
bulk_counts[main_16s_rRNA_idx] += bulk_counts[variant_16s_rRNA_idx].sum()
bulk_counts[main_5s_rRNA_idx] += bulk_counts[variant_5s_rRNA_idx].sum()
bulk_counts[variant_23s_rRNA_idx] -= bulk_counts[variant_23s_rRNA_idx]
bulk_counts[variant_16s_rRNA_idx] -= bulk_counts[variant_16s_rRNA_idx]
bulk_counts[variant_5s_rRNA_idx] -= bulk_counts[variant_5s_rRNA_idx]
# TODO: remove checks for zero concentrations (change to assertion)
# TODO: move any rescaling logic to KB/fitting
[docs]
def set_small_molecule_counts(
bulk_counts, sim_data, media_id, import_molecules, mass_coeff, cell_mass=None
):
doubling_time = sim_data.condition_to_doubling_time[sim_data.condition]
conc_dict = sim_data.process.metabolism.concentration_updates.concentrations_based_on_nutrients(
media_id=media_id, imports=import_molecules
)
conc_dict.update(sim_data.mass.getBiomassAsConcentrations(doubling_time))
conc_dict[sim_data.molecule_ids.ppGpp] = (
sim_data.growth_rate_parameters.get_ppGpp_conc(doubling_time)
)
molecule_ids = sorted(conc_dict)
bulk_ids = sim_data.internal_state.bulk_molecules.bulk_data["id"]
molecule_concentrations = (units.mol / units.L) * np.array(
[conc_dict[key].asNumber(units.mol / units.L) for key in molecule_ids]
)
if cell_mass is None:
avg_cell_fraction_mass = sim_data.mass.get_component_masses(doubling_time)
other_dry_mass = (
mass_coeff
* (
avg_cell_fraction_mass["proteinMass"]
+ avg_cell_fraction_mass["rnaMass"]
+ avg_cell_fraction_mass["dnaMass"]
)
/ sim_data.mass.avg_cell_to_initial_cell_conversion_factor
)
else:
small_molecule_mass = 0 * units.fg
for mol in conc_dict:
mol_idx = bulk_name_to_idx(mol, bulk_ids)
small_molecule_mass += (
bulk_counts[mol_idx]
* sim_data.getter.get_mass(mol)
/ sim_data.constants.n_avogadro
)
other_dry_mass = cell_mass - small_molecule_mass
masses_to_add, counts_to_add = masses_and_counts_for_homeostatic_target(
other_dry_mass,
molecule_concentrations,
sim_data.getter.get_masses(molecule_ids),
sim_data.constants.cell_density,
sim_data.constants.n_avogadro,
)
molecule_idx = bulk_name_to_idx(molecule_ids, bulk_ids)
bulk_counts[molecule_idx] = counts_to_add
[docs]
def initialize_complexation(bulk_counts, sim_data, random_state):
molecule_names = sim_data.process.complexation.molecule_names
bulk_ids = sim_data.internal_state.bulk_molecules.bulk_data["id"]
molecule_idx = bulk_name_to_idx(molecule_names, bulk_ids)
stoich_matrix = sim_data.process.complexation.stoich_matrix().astype(
np.int64, order="F"
)
molecule_counts = bulk_counts[molecule_idx]
updated_molecule_counts, complexation_events = mccFormComplexesWithPrebuiltMatrices(
molecule_counts,
random_state.randint(1000),
stoich_matrix,
*sim_data.process.complexation.prebuilt_matrices,
)
bulk_counts[molecule_idx] = updated_molecule_counts
if np.any(updated_molecule_counts < 0):
raise ValueError("Negative counts after complexation")
[docs]
def initialize_full_chromosome(unique_molecules, sim_data, unique_id_rng):
"""
Initializes the counts of full chromosomes to one. The division_time of
this initial chromosome is set to be zero for consistency.
"""
unique_molecules["full_chromosome"] = create_new_unique_molecules(
"full_chromosome",
1,
sim_data,
unique_id_rng,
division_time=0.0,
has_triggered_division=True,
domain_index=0,
)
[docs]
def initialize_replication(
bulk_state,
unique_molecules,
sim_data,
cell_mass,
mechanistic_replisome,
unique_id_rng,
):
"""
Initializes replication by creating an appropriate number of replication
forks given the cell growth rate. This also initializes the gene dosage
bulk counts using the initial locations of the forks.
"""
# Determine the number and location of replication forks at the start of
# the cell cycle
# Get growth rate constants
tau = sim_data.condition_to_doubling_time[sim_data.condition].asUnit(units.min)
critical_mass = sim_data.mass.get_dna_critical_mass(tau)
replication_rate = sim_data.process.replication.basal_elongation_rate
# Calculate length of replichore
genome_length = sim_data.process.replication.genome_length
replichore_length = np.ceil(0.5 * genome_length) * units.nt
# Calculate the maximum number of replisomes that could be formed with
# the existing counts of replisome subunits. If mechanistic_replisome option
# is off, set to an arbitrary high number.
replisome_trimer_idx = bulk_name_to_idx(
sim_data.molecule_groups.replisome_trimer_subunits, bulk_state["id"]
)
replisome_monomer_idx = bulk_name_to_idx(
sim_data.molecule_groups.replisome_monomer_subunits, bulk_state["id"]
)
if mechanistic_replisome:
n_max_replisomes = np.min(
np.concatenate(
(
bulk_state["count"][replisome_trimer_idx] // 3,
bulk_state["count"][replisome_monomer_idx],
)
)
)
else:
n_max_replisomes = 1000
# Generate arrays specifying appropriate initial replication conditions
oric_state, replisome_state, domain_state = determine_chromosome_state(
tau,
replichore_length,
n_max_replisomes,
sim_data.process.replication.no_child_place_holder,
cell_mass,
critical_mass,
replication_rate,
)
n_oric = oric_state["domain_index"].size
n_replisome = replisome_state["domain_index"].size
n_domain = domain_state["domain_index"].size
# Add OriC molecules with the proposed attributes
unique_molecules["oriC"] = create_new_unique_molecules(
"oriC", n_oric, sim_data, unique_id_rng, domain_index=oric_state["domain_index"]
)
# Add chromosome domain molecules with the proposed attributes
unique_molecules["chromosome_domain"] = create_new_unique_molecules(
"chromosome_domain",
n_domain,
sim_data,
unique_id_rng,
domain_index=domain_state["domain_index"],
child_domains=domain_state["child_domains"],
)
if n_replisome != 0:
# Update mass of replisomes if the mechanistic replisome option is set
if mechanistic_replisome:
replisome_trimer_subunit_masses = np.vstack(
[
sim_data.getter.get_submass_array(x).asNumber(
units.fg / units.count
)
for x in sim_data.molecule_groups.replisome_trimer_subunits
]
)
replisome_monomer_subunit_masses = np.vstack(
[
sim_data.getter.get_submass_array(x).asNumber(
units.fg / units.count
)
for x in sim_data.molecule_groups.replisome_monomer_subunits
]
)
replisome_mass_array = 3 * replisome_trimer_subunit_masses.sum(
axis=0
) + replisome_monomer_subunit_masses.sum(axis=0)
replisome_protein_mass = replisome_mass_array.sum()
else:
replisome_protein_mass = 0.0
# Update mass to account for DNA strands that have already been
# elongated.
sequences = sim_data.process.replication.replication_sequences
fork_coordinates = replisome_state["coordinates"]
sequence_elongations = np.abs(np.repeat(fork_coordinates, 2))
mass_increase_dna = computeMassIncrease(
np.tile(sequences, (n_replisome // 2, 1)),
sequence_elongations,
sim_data.process.replication.replication_monomer_weights.asNumber(units.fg),
)
# Add active replisomes as unique molecules and set attributes
unique_molecules["active_replisome"] = create_new_unique_molecules(
"active_replisome",
n_replisome,
sim_data,
unique_id_rng,
domain_index=replisome_state["domain_index"],
coordinates=replisome_state["coordinates"],
right_replichore=replisome_state["right_replichore"],
massDiff_DNA=mass_increase_dna[0::2] + mass_increase_dna[1::2],
massDiff_protein=replisome_protein_mass,
)
if mechanistic_replisome:
# Remove replisome subunits from bulk molecules
bulk_state["count"][replisome_trimer_idx] -= 3 * n_replisome
bulk_state["count"][replisome_monomer_idx] -= n_replisome
else:
# For n_replisome = 0, still create an empty structured array with
# the expected fields
unique_molecules["active_replisome"] = create_new_unique_molecules(
"active_replisome", n_replisome, sim_data, unique_id_rng
)
# Get coordinates of all genes, promoters and DnaA boxes
all_gene_coordinates = sim_data.process.transcription.cistron_data[
"replication_coordinate"
]
all_promoter_coordinates = sim_data.process.transcription.rna_data[
"replication_coordinate"
]
all_DnaA_box_coordinates = sim_data.process.replication.motif_coordinates[
"DnaA_box"
]
# Define function that initializes attributes of sequence motifs given the
# initial state of the chromosome
def get_motif_attributes(all_motif_coordinates):
"""
Using the initial positions of replication forks, calculate attributes
of unique molecules representing DNA motifs, given their positions on
the genome.
Args:
all_motif_coordinates (ndarray): Genomic coordinates of DNA motifs,
represented in a specific order
Returns:
motif_index: Indices of all motif copies, in the case where
different indexes imply a different functional role
motif_coordinates: Genomic coordinates of all motif copies
motif_domain_index: Domain indexes of the chromosome domain that
each motif copy belongs to
"""
motif_index, motif_coordinates, motif_domain_index = [], [], []
def in_bounds(coordinates, lb, ub):
return np.logical_and(coordinates < ub, coordinates > lb)
# Loop through all chromosome domains
for domain_idx in domain_state["domain_index"]:
# If the domain is the mother domain of the initial chromosome,
if domain_idx == 0:
if n_replisome == 0:
# No replisomes - all motifs should fall in this domain
motif_mask = np.ones_like(all_motif_coordinates, dtype=bool)
else:
# Get domain boundaries
domain_boundaries = replisome_state["coordinates"][
replisome_state["domain_index"] == 0
]
# Add motifs outside of this boundary
motif_mask = np.logical_or(
all_motif_coordinates > domain_boundaries.max(),
all_motif_coordinates < domain_boundaries.min(),
)
# If the domain contains the origin,
elif np.isin(domain_idx, oric_state["domain_index"]):
# Get index of the parent domain
parent_domain_idx = domain_state["domain_index"][
np.where(domain_state["child_domains"] == domain_idx)[0]
]
# Get domain boundaries of the parent domain
parent_domain_boundaries = replisome_state["coordinates"][
replisome_state["domain_index"] == parent_domain_idx
]
# Add motifs inside this boundary
motif_mask = in_bounds(
all_motif_coordinates,
parent_domain_boundaries.min(),
parent_domain_boundaries.max(),
)
# If the domain neither contains the origin nor the terminus,
else:
# Get index of the parent domain
parent_domain_idx = domain_state["domain_index"][
np.where(domain_state["child_domains"] == domain_idx)[0]
]
# Get domain boundaries of the parent domain
parent_domain_boundaries = replisome_state["coordinates"][
replisome_state["domain_index"] == parent_domain_idx
]
# Get domain boundaries of this domain
domain_boundaries = replisome_state["coordinates"][
replisome_state["domain_index"] == domain_idx
]
# Add motifs between the boundaries
motif_mask = np.logical_or(
in_bounds(
all_motif_coordinates,
domain_boundaries.max(),
parent_domain_boundaries.max(),
),
in_bounds(
all_motif_coordinates,
parent_domain_boundaries.min(),
domain_boundaries.min(),
),
)
# Append attributes to existing list
motif_index.extend(np.nonzero(motif_mask)[0])
motif_coordinates.extend(all_motif_coordinates[motif_mask])
motif_domain_index.extend(np.full(motif_mask.sum(), domain_idx))
return motif_index, motif_coordinates, motif_domain_index
# Use function to get attributes for promoters and DnaA boxes
TU_index, promoter_coordinates, promoter_domain_index = get_motif_attributes(
all_promoter_coordinates
)
cistron_index, gene_coordinates, gene_domain_index = get_motif_attributes(
all_gene_coordinates
)
_, DnaA_box_coordinates, DnaA_box_domain_index = get_motif_attributes(
all_DnaA_box_coordinates
)
# Add promoters as unique molecules and set attributes
# Note: the bound_TF attribute is properly initialized in the function
# initialize_transcription_factors
n_promoter = len(TU_index)
n_tf = len(sim_data.process.transcription_regulation.tf_ids)
unique_molecules["promoter"] = create_new_unique_molecules(
"promoter",
n_promoter,
sim_data,
unique_id_rng,
domain_index=promoter_domain_index,
coordinates=promoter_coordinates,
TU_index=TU_index,
bound_TF=np.zeros((n_promoter, n_tf), dtype=bool),
)
# Add genes as unique molecules and set attributes
n_gene = len(cistron_index)
unique_molecules["gene"] = create_new_unique_molecules(
"gene",
n_gene,
sim_data,
unique_id_rng,
cistron_index=cistron_index,
coordinates=gene_coordinates,
domain_index=gene_domain_index,
)
# Add DnaA boxes as unique molecules and set attributes
n_DnaA_box = len(DnaA_box_coordinates)
unique_molecules["DnaA_box"] = create_new_unique_molecules(
"DnaA_box",
n_DnaA_box,
sim_data,
unique_id_rng,
domain_index=DnaA_box_domain_index,
coordinates=DnaA_box_coordinates,
DnaA_bound=np.zeros(n_DnaA_box, dtype=bool),
)
[docs]
def initialize_transcription_factors(
bulk_state, unique_molecules, sim_data, random_state
):
"""
Initialize transcription factors that are bound to the chromosome. For each
type of transcription factor, this function calculates the total number of
transcription factors that should be bound to the chromosome using the
binding probabilities of each transcription factor and the number of
available promoter sites. The calculated number of transcription factors
are then distributed randomly to promoters, whose bound_TF attributes and
submasses are updated correspondingly.
"""
# Get transcription factor properties from sim_data
tf_ids = sim_data.process.transcription_regulation.tf_ids
tf_to_tf_type = sim_data.process.transcription_regulation.tf_to_tf_type
p_promoter_bound_TF = sim_data.process.transcription_regulation.p_promoter_bound_tf
# Build dict that maps TFs to transcription units they regulate
delta_prob = sim_data.process.transcription_regulation.delta_prob
TF_to_TU_idx = {}
for i, tf in enumerate(tf_ids):
TF_to_TU_idx[tf] = delta_prob["deltaI"][delta_prob["deltaJ"] == i]
# Get views into bulk molecule representations of transcription factors
active_tf_view = {}
inactive_tf_view = {}
active_tf_view_idx = {}
inactive_tf_view_idx = {}
for tf in tf_ids:
tf_idx = bulk_name_to_idx(tf + "[c]", bulk_state["id"])
active_tf_view[tf] = bulk_state["count"][tf_idx]
active_tf_view_idx[tf] = tf_idx
if tf_to_tf_type[tf] == "1CS":
if tf == sim_data.process.transcription_regulation.active_to_bound[tf]:
inactive_tf_idx = bulk_name_to_idx(
sim_data.process.equilibrium.get_unbound(tf + "[c]"),
bulk_state["id"],
)
inactive_tf_view[tf] = bulk_state["count"][inactive_tf_idx]
else:
inactive_tf_idx = bulk_name_to_idx(
sim_data.process.transcription_regulation.active_to_bound[tf]
+ "[c]",
bulk_state["id"],
)
inactive_tf_view[tf] = bulk_state["count"][inactive_tf_idx]
elif tf_to_tf_type[tf] == "2CS":
inactive_tf_idx = bulk_name_to_idx(
sim_data.process.two_component_system.active_to_inactive_tf[tf + "[c]"],
bulk_state["id"],
)
inactive_tf_view[tf] = bulk_state["count"][inactive_tf_idx]
inactive_tf_view_idx[tf] = inactive_tf_idx
# Get masses of active transcription factors
tf_indexes = [np.where(bulk_state["id"] == tf_id + "[c]")[0][0] for tf_id in tf_ids]
active_tf_masses = (
sim_data.internal_state.bulk_molecules.bulk_data["mass"][tf_indexes]
/ sim_data.constants.n_avogadro
).asNumber(units.fg)
# Get TU indices of promoters
TU_index = unique_molecules["promoter"]["TU_index"]
# Initialize bound_TF array
bound_TF = np.zeros((len(TU_index), len(tf_ids)), dtype=bool)
for tf_idx, tf_id in enumerate(tf_ids):
# Get counts of transcription factors
active_tf_counts = active_tf_view[tf_id]
# If there are no active transcription factors at initialization,
# continue to the next transcription factor
if active_tf_counts == 0:
continue
# Compute probability of binding the promoter
if tf_to_tf_type[tf_id] == "0CS":
p_promoter_bound = 1.0
else:
inactive_tf_counts = inactive_tf_view[tf_id]
p_promoter_bound = p_promoter_bound_TF(active_tf_counts, inactive_tf_counts)
# Determine the number of available promoter sites
available_promoters = np.isin(TU_index, TF_to_TU_idx[tf_id])
n_available_promoters = available_promoters.sum()
# Calculate the number of promoters that should be bound
n_to_bind = int(
stochasticRound(
random_state, np.full(n_available_promoters, p_promoter_bound)
).sum()
)
bound_locs = np.zeros(n_available_promoters, dtype=bool)
if n_to_bind > 0:
# Determine randomly which DNA targets to bind based on which of
# the following is more limiting:
# number of promoter sites to bind, or number of active
# transcription factors
bound_locs[
random_state.choice(
n_available_promoters,
size=min(n_to_bind, active_tf_view[tf_id]),
replace=False,
)
] = True
# Update count of free transcription factors
bulk_state["count"][active_tf_view_idx[tf_id]] -= bound_locs.sum()
# Update bound_TF array
bound_TF[available_promoters, tf_idx] = bound_locs
# Calculate masses of bound TFs
mass_diffs = bound_TF.dot(active_tf_masses)
# Reset bound_TF attribute of promoters
unique_molecules["promoter"]["bound_TF"] = bound_TF
# Add mass_diffs array to promoter submass
for submass, i in sim_data.submass_name_to_index.items():
unique_molecules["promoter"][f"massDiff_{submass}"] = mass_diffs[:, i]
[docs]
def initialize_transcription(
bulk_state,
unique_molecules,
sim_data,
random_state,
unique_id_rng,
ppgpp_regulation,
trna_attenuation,
):
"""
Activate RNA polymerases as unique molecules, and distribute them along
lengths of trancription units, while decreasing counts of unactivated RNA
polymerases (APORNAP-CPLX[c]). Also initialize unique molecule
representations of fully transcribed mRNAs and partially transcribed RNAs,
using counts of mRNAs initialized as bulk molecules, and the attributes of
initialized RNA polymerases. The counts of full mRNAs represented as bulk
molecules are reset to zero.
RNA polymerases are placed randomly across the length of each transcription
unit, with the synthesis probabilities for each TU determining the number of
RNA polymerases placed at each gene.
"""
# Load parameters
rna_lengths = sim_data.process.transcription.rna_data["length"].asNumber()
rna_masses = (
sim_data.process.transcription.rna_data["mw"] / sim_data.constants.n_avogadro
).asNumber(units.fg)
current_media_id = sim_data.conditions[sim_data.condition]["nutrients"]
frac_active_rnap = sim_data.process.transcription.rnapFractionActiveDict[
current_media_id
]
inactive_rnap_idx = bulk_name_to_idx(
sim_data.molecule_ids.full_RNAP, bulk_state["id"]
)
inactive_RNAP_counts = bulk_state["count"][inactive_rnap_idx]
rna_sequences = sim_data.process.transcription.transcription_sequences
nt_weights = sim_data.process.transcription.transcription_monomer_weights
end_weight = sim_data.process.transcription.transcription_end_weight
replichore_lengths = sim_data.process.replication.replichore_lengths
chromosome_length = replichore_lengths.sum()
# Number of rnaPoly to activate
n_RNAPs_to_activate = np.int64(frac_active_rnap * inactive_RNAP_counts)
# Get attributes of promoters
TU_index, bound_TF, domain_index_promoters = attrs(
unique_molecules["promoter"], ["TU_index", "bound_TF", "domain_index"]
)
# Parameters for rnaSynthProb
if ppgpp_regulation:
doubling_time = sim_data.condition_to_doubling_time[sim_data.condition]
ppgpp_conc = sim_data.growth_rate_parameters.get_ppGpp_conc(doubling_time)
basal_prob, _ = sim_data.process.transcription.synth_prob_from_ppgpp(
ppgpp_conc, sim_data.process.replication.get_average_copy_number
)
ppgpp_scale = basal_prob[TU_index]
# Use original delta prob if no ppGpp basal prob
ppgpp_scale[ppgpp_scale == 0] = 1
else:
basal_prob = sim_data.process.transcription_regulation.basal_prob.copy()
ppgpp_scale = 1
if trna_attenuation:
basal_prob[sim_data.process.transcription.attenuated_rna_indices] += (
sim_data.process.transcription.attenuation_basal_prob_adjustments
)
n_TUs = len(basal_prob)
delta_prob_matrix = sim_data.process.transcription_regulation.get_delta_prob_matrix(
dense=True, ppgpp=ppgpp_regulation
)
# Synthesis probabilities for different categories of genes
rna_synth_prob_fractions = sim_data.process.transcription.rnaSynthProbFraction
rna_synth_prob_R_protein = sim_data.process.transcription.rnaSynthProbRProtein
rna_synth_prob_rna_polymerase = (
sim_data.process.transcription.rnaSynthProbRnaPolymerase
)
# Get coordinates and transcription directions of transcription units
replication_coordinate = sim_data.process.transcription.rna_data[
"replication_coordinate"
]
transcription_direction = sim_data.process.transcription.rna_data["is_forward"]
# Determine changes from genetic perturbations
genetic_perturbations = {}
perturbations = getattr(sim_data, "genetic_perturbations", {})
if len(perturbations) > 0:
probability_indexes = [
(index, sim_data.genetic_perturbations[rna_data["id"]])
for index, rna_data in enumerate(sim_data.process.transcription.rna_data)
if rna_data["id"] in sim_data.genetic_perturbations
]
genetic_perturbations = {
"fixedRnaIdxs": [pair[0] for pair in probability_indexes],
"fixedSynthProbs": [pair[1] for pair in probability_indexes],
}
# ID Groups
idx_rRNA = np.where(sim_data.process.transcription.rna_data["is_rRNA"])[0]
idx_mRNA = np.where(sim_data.process.transcription.rna_data["is_mRNA"])[0]
idx_tRNA = np.where(sim_data.process.transcription.rna_data["is_tRNA"])[0]
idx_rprotein = np.where(
sim_data.process.transcription.rna_data["includes_ribosomal_protein"]
)[0]
idx_rnap = np.where(sim_data.process.transcription.rna_data["includes_RNAP"])[0]
# Calculate probabilities of the RNAP binding to the promoters
promoter_init_probs = basal_prob[TU_index] + ppgpp_scale * np.multiply(
delta_prob_matrix[TU_index, :], bound_TF
).sum(axis=1)
if len(genetic_perturbations) > 0:
rescale_initiation_probs(
promoter_init_probs,
TU_index,
genetic_perturbations["fixedSynthProbs"],
genetic_perturbations["fixedRnaIdxs"],
)
# Adjust probabilities to not be negative
promoter_init_probs[promoter_init_probs < 0] = 0.0
promoter_init_probs /= promoter_init_probs.sum()
if np.any(promoter_init_probs < 0):
raise Exception("Have negative RNA synthesis probabilities")
# Adjust synthesis probabilities depending on environment
synth_prob_fractions = rna_synth_prob_fractions[current_media_id]
# Create masks for different types of RNAs
is_mRNA = np.isin(TU_index, idx_mRNA)
is_tRNA = np.isin(TU_index, idx_tRNA)
is_rRNA = np.isin(TU_index, idx_rRNA)
is_rprotein = np.isin(TU_index, idx_rprotein)
is_rnap = np.isin(TU_index, idx_rnap)
is_fixed = is_tRNA | is_rRNA | is_rprotein | is_rnap
# Rescale initiation probabilities based on type of RNA
promoter_init_probs[is_mRNA] *= (
synth_prob_fractions["mRna"] / promoter_init_probs[is_mRNA].sum()
)
promoter_init_probs[is_tRNA] *= (
synth_prob_fractions["tRna"] / promoter_init_probs[is_tRNA].sum()
)
promoter_init_probs[is_rRNA] *= (
synth_prob_fractions["rRna"] / promoter_init_probs[is_rRNA].sum()
)
# Set fixed synthesis probabilities for RProteins and RNAPs
rescale_initiation_probs(
promoter_init_probs,
TU_index,
np.concatenate(
(
rna_synth_prob_R_protein[current_media_id],
rna_synth_prob_rna_polymerase[current_media_id],
)
),
np.concatenate((idx_rprotein, idx_rnap)),
)
assert promoter_init_probs[is_fixed].sum() < 1.0
# Adjust for attenuation that will stop transcription after initiation
if trna_attenuation:
attenuation_readthrough = {
idx: prob
for idx, prob in zip(
sim_data.process.transcription.attenuated_rna_indices,
sim_data.process.transcription.attenuation_readthrough[
sim_data.condition
],
)
}
readthrough_adjustment = np.array(
[attenuation_readthrough.get(idx, 1) for idx in TU_index]
)
promoter_init_probs *= readthrough_adjustment
scale_the_rest_by = (
1.0 - promoter_init_probs[is_fixed].sum()
) / promoter_init_probs[~is_fixed].sum()
promoter_init_probs[~is_fixed] *= scale_the_rest_by
# normalize to length of rna
init_prob_length_adjusted = promoter_init_probs * rna_lengths[TU_index]
init_prob_normalized = init_prob_length_adjusted / init_prob_length_adjusted.sum()
# Sample a multinomial distribution of synthesis probabilities to determine
# what RNA are initialized
n_initiations = random_state.multinomial(n_RNAPs_to_activate, init_prob_normalized)
# Build array of transcription unit indexes for partially transcribed mRNAs
# and domain indexes for RNAPs
TU_index_partial_RNAs = np.repeat(TU_index, n_initiations)
domain_index_rnap = np.repeat(domain_index_promoters, n_initiations)
# Build arrays of starting coordinates and transcription directions
starting_coordinates = replication_coordinate[TU_index_partial_RNAs]
is_forward = transcription_direction[TU_index_partial_RNAs]
# Randomly advance RNAPs along the transcription units
# TODO (Eran): make sure there aren't any RNAPs at same location on same TU
updated_lengths = np.array(
random_state.rand(n_RNAPs_to_activate) * rna_lengths[TU_index_partial_RNAs],
dtype=int,
)
# Rescale boolean array of directions to an array of 1's and -1's.
direction_rescaled = (2 * (is_forward - 0.5)).astype(np.int64)
# Compute the updated coordinates of RNAPs. Coordinates of RNAPs moving in
# the positive direction are increased, whereas coordinates of RNAPs moving
# in the negative direction are decreased.
updated_coordinates = starting_coordinates + np.multiply(
direction_rescaled, updated_lengths
)
# Reset coordinates of RNAPs that cross the boundaries between right and
# left replichores
updated_coordinates[updated_coordinates > replichore_lengths[0]] -= (
chromosome_length
)
updated_coordinates[updated_coordinates < -replichore_lengths[1]] += (
chromosome_length
)
# Update mass
sequences = rna_sequences[TU_index_partial_RNAs]
added_mass = computeMassIncrease(sequences, updated_lengths, nt_weights)
added_mass[updated_lengths != 0] += end_weight # add endWeight to all new Rna
# Masses of partial mRNAs are counted as mRNA mass as they are already
# functional, but the masses of other types of partial RNAs are counted as
# generic RNA mass.
added_RNA_mass = added_mass.copy()
added_mRNA_mass = added_mass.copy()
is_mRNA_partial_RNAs = np.isin(TU_index_partial_RNAs, idx_mRNA)
added_RNA_mass[is_mRNA_partial_RNAs] = 0
added_mRNA_mass[np.logical_not(is_mRNA_partial_RNAs)] = 0
# Add active RNAPs and get their unique indexes
unique_molecules["active_RNAP"] = create_new_unique_molecules(
"active_RNAP",
n_RNAPs_to_activate,
sim_data,
unique_id_rng,
domain_index=domain_index_rnap,
coordinates=updated_coordinates,
is_forward=is_forward,
)
# Decrement counts of bulk inactive RNAPs
rnap_idx = bulk_name_to_idx(sim_data.molecule_ids.full_RNAP, bulk_state["id"])
bulk_state["count"][rnap_idx] = inactive_RNAP_counts - n_RNAPs_to_activate
# Add partially transcribed RNAs
partial_rnas = create_new_unique_molecules(
"RNA",
n_RNAPs_to_activate,
sim_data,
unique_id_rng,
TU_index=TU_index_partial_RNAs,
transcript_length=updated_lengths,
is_mRNA=is_mRNA_partial_RNAs,
is_full_transcript=np.zeros(n_RNAPs_to_activate, dtype=bool),
can_translate=is_mRNA_partial_RNAs,
RNAP_index=unique_molecules["active_RNAP"]["unique_index"],
massDiff_nonspecific_RNA=added_RNA_mass,
massDiff_mRNA=added_mRNA_mass,
)
# Get counts of mRNAs initialized as bulk molecules
mRNA_ids = sim_data.process.transcription.rna_data["id"][
sim_data.process.transcription.rna_data["is_mRNA"]
]
mRNA_idx = bulk_name_to_idx(mRNA_ids, bulk_state["id"])
mRNA_counts = bulk_state["count"][mRNA_idx]
# Subtract number of partially transcribed mRNAs that were initialized.
# Note: some mRNAs with high degradation rates have more partial mRNAs than
# the expected total number of mRNAs - for these mRNAs we simply set the
# initial full mRNA counts to be zero.
partial_mRNA_counts = np.bincount(
TU_index_partial_RNAs[is_mRNA_partial_RNAs], minlength=n_TUs
)[idx_mRNA]
full_mRNA_counts = (mRNA_counts - partial_mRNA_counts).clip(min=0)
# Get array of TU indexes for each full mRNA
TU_index_full_mRNAs = np.repeat(idx_mRNA, full_mRNA_counts)
# Add fully transcribed mRNAs. The RNAP_index attribute of these molecules
# are set to -1.
full_rnas = create_new_unique_molecules(
"RNA",
len(TU_index_full_mRNAs),
sim_data,
unique_id_rng,
TU_index=TU_index_full_mRNAs,
transcript_length=rna_lengths[TU_index_full_mRNAs],
is_mRNA=np.ones_like(TU_index_full_mRNAs, dtype=bool),
is_full_transcript=np.ones_like(TU_index_full_mRNAs, dtype=bool),
can_translate=np.ones_like(TU_index_full_mRNAs, dtype=bool),
RNAP_index=np.full(TU_index_full_mRNAs.shape, -1, dtype=np.int64),
massDiff_mRNA=rna_masses[TU_index_full_mRNAs],
)
unique_molecules["RNA"] = np.concatenate((partial_rnas, full_rnas))
# Reset counts of bulk mRNAs to zero
bulk_state["count"][mRNA_idx] = 0
[docs]
def initialize_chromosomal_segments(unique_molecules, sim_data, unique_id_rng):
"""
Initialize unique molecule representations of chromosomal segments. All
chromosomal segments are assumed to be at their relaxed states upon
initialization.
"""
# Load parameters
relaxed_DNA_base_pairs_per_turn = (
sim_data.process.chromosome_structure.relaxed_DNA_base_pairs_per_turn
)
terC_index = sim_data.process.chromosome_structure.terC_dummy_molecule_index
replichore_lengths = sim_data.process.replication.replichore_lengths
min_coordinates = -replichore_lengths[1]
max_coordinates = replichore_lengths[0]
# Get attributes of replisomes, active RNAPs, chromosome domains, full
# chromosomes, and oriCs
(replisome_coordinates, replisome_domain_indexes, replisome_unique_indexes) = attrs(
unique_molecules["active_replisome"],
["coordinates", "domain_index", "unique_index"],
)
(
active_RNAP_coordinates,
active_RNAP_domain_indexes,
active_RNAP_unique_indexes,
) = attrs(
unique_molecules["active_RNAP"], ["coordinates", "domain_index", "unique_index"]
)
chromosome_domain_domain_indexes, child_domains = attrs(
unique_molecules["chromosome_domain"], ["domain_index", "child_domains"]
)
(full_chromosome_domain_indexes,) = attrs(
unique_molecules["full_chromosome"], ["domain_index"]
)
(origin_domain_indexes,) = attrs(unique_molecules["oriC"], ["domain_index"])
# Initialize chromosomal segment attributes
all_boundary_molecule_indexes = np.empty((0, 2), dtype=np.int64)
all_boundary_coordinates = np.empty((0, 2), dtype=np.int64)
all_segment_domain_indexes = np.array([], dtype=np.int32)
all_linking_numbers = np.array([], dtype=np.float64)
def get_chromosomal_segment_attributes(
coordinates, unique_indexes, spans_oriC, spans_terC
):
"""
Returns the attributes of all chromosomal segments from a continuous
stretch of DNA, given the coordinates and unique indexes of all
boundary molecules.
"""
coordinates_argsort = np.argsort(coordinates)
coordinates_sorted = coordinates[coordinates_argsort]
unique_indexes_sorted = unique_indexes[coordinates_argsort]
# Add dummy molecule at terC if domain spans terC
if spans_terC:
coordinates_sorted = np.insert(
coordinates_sorted,
[0, len(coordinates_sorted)],
[min_coordinates, max_coordinates],
)
unique_indexes_sorted = np.insert(
unique_indexes_sorted, [0, len(unique_indexes_sorted)], terC_index
)
boundary_molecule_indexes = np.hstack(
(
unique_indexes_sorted[:-1][:, np.newaxis],
unique_indexes_sorted[1:][:, np.newaxis],
)
)
boundary_coordinates = np.hstack(
(
coordinates_sorted[:-1][:, np.newaxis],
coordinates_sorted[1:][:, np.newaxis],
)
)
# Remove segment that spans oriC if the domain does not span oriC
if not spans_oriC:
oriC_segment_index = np.where(
np.sign(boundary_coordinates).sum(axis=1) == 0
)[0]
assert len(oriC_segment_index) == 1
boundary_molecule_indexes = np.delete(
boundary_molecule_indexes, oriC_segment_index, 0
)
boundary_coordinates = np.delete(
boundary_coordinates, oriC_segment_index, 0
)
# Assumes all segments are at their relaxed state at initialization
linking_numbers = (
boundary_coordinates[:, 1] - boundary_coordinates[:, 0]
) / relaxed_DNA_base_pairs_per_turn
return boundary_molecule_indexes, boundary_coordinates, linking_numbers
# Loop through each domain index
for domain_index in chromosome_domain_domain_indexes:
domain_spans_oriC = domain_index in origin_domain_indexes
domain_spans_terC = domain_index in full_chromosome_domain_indexes
# Get coordinates and indexes of all RNAPs on this domain
RNAP_domain_mask = active_RNAP_domain_indexes == domain_index
molecule_coordinates_this_domain = active_RNAP_coordinates[RNAP_domain_mask]
molecule_indexes_this_domain = active_RNAP_unique_indexes[RNAP_domain_mask]
# Append coordinates and indexes of replisomes on this domain, if any
if not domain_spans_oriC:
replisome_domain_mask = replisome_domain_indexes == domain_index
molecule_coordinates_this_domain = np.concatenate(
(
molecule_coordinates_this_domain,
replisome_coordinates[replisome_domain_mask],
)
)
molecule_indexes_this_domain = np.concatenate(
(
molecule_indexes_this_domain,
replisome_unique_indexes[replisome_domain_mask],
)
)
# Append coordinates and indexes of parent domain replisomes, if any
if not domain_spans_terC:
parent_domain_index = chromosome_domain_domain_indexes[
np.where(child_domains == domain_index)[0][0]
]
replisome_parent_domain_mask = (
replisome_domain_indexes == parent_domain_index
)
molecule_coordinates_this_domain = np.concatenate(
(
molecule_coordinates_this_domain,
replisome_coordinates[replisome_parent_domain_mask],
)
)
molecule_indexes_this_domain = np.concatenate(
(
molecule_indexes_this_domain,
replisome_unique_indexes[replisome_parent_domain_mask],
)
)
# Get attributes of chromosomal segments on this domain
(
boundary_molecule_indexes_this_domain,
boundary_coordinates_this_domain,
linking_numbers_this_domain,
) = get_chromosomal_segment_attributes(
molecule_coordinates_this_domain,
molecule_indexes_this_domain,
domain_spans_oriC,
domain_spans_terC,
)
# Append to existing array of attributes
all_boundary_molecule_indexes = np.vstack(
[all_boundary_molecule_indexes, boundary_molecule_indexes_this_domain]
)
all_boundary_coordinates = np.vstack(
(all_boundary_coordinates, boundary_coordinates_this_domain)
)
all_segment_domain_indexes = np.concatenate(
(
all_segment_domain_indexes,
np.full(len(linking_numbers_this_domain), domain_index, dtype=np.int32),
)
)
all_linking_numbers = np.concatenate(
(all_linking_numbers, linking_numbers_this_domain)
)
# Confirm total counts of all segments
n_segments = len(all_linking_numbers)
assert (
n_segments
== len(active_RNAP_unique_indexes) + 1.5 * len(replisome_unique_indexes) + 1
)
# Add chromosomal segments
unique_molecules["chromosomal_segment"] = create_new_unique_molecules(
"chromosomal_segment",
n_segments,
sim_data,
unique_id_rng,
boundary_molecule_indexes=all_boundary_molecule_indexes,
boundary_coordinates=all_boundary_coordinates,
domain_index=all_segment_domain_indexes,
linking_number=all_linking_numbers,
)
[docs]
def initialize_translation(
bulk_state, unique_molecules, sim_data, random_state, unique_id_rng
):
"""
Activate ribosomes as unique molecules, and distribute them along lengths
of mRNAs, while decreasing counts of unactivated ribosomal subunits (30S
and 50S).
Ribosomes are placed randomly across the lengths of each mRNA.
"""
# Load translation parameters
current_nutrients = sim_data.conditions[sim_data.condition]["nutrients"]
frac_active_ribosome = sim_data.process.translation.ribosomeFractionActiveDict[
current_nutrients
]
protein_sequences = sim_data.process.translation.translation_sequences
protein_lengths = sim_data.process.translation.monomer_data["length"].asNumber()
translation_efficiencies = normalize(
sim_data.process.translation.translation_efficiencies_by_monomer
)
aa_weights_incorporated = sim_data.process.translation.translation_monomer_weights
end_weight = sim_data.process.translation.translation_end_weight
cistron_lengths = sim_data.process.transcription.cistron_data["length"].asNumber(
units.nt
)
TU_ids = sim_data.process.transcription.rna_data["id"]
monomer_index_to_tu_indexes = sim_data.relation.monomer_index_to_tu_indexes
monomer_index_to_cistron_index = {
i: sim_data.process.transcription._cistron_id_to_index[monomer["cistron_id"]]
for (i, monomer) in enumerate(sim_data.process.translation.monomer_data)
}
# Get attributes of RNAs
(
TU_index_all_RNAs,
length_all_RNAs,
is_mRNA,
is_full_transcript_all_RNAs,
unique_index_all_RNAs,
) = attrs(
unique_molecules["RNA"],
[
"TU_index",
"transcript_length",
"is_mRNA",
"is_full_transcript",
"unique_index",
],
)
TU_index_mRNAs = TU_index_all_RNAs[is_mRNA]
length_mRNAs = length_all_RNAs[is_mRNA]
is_full_transcript_mRNAs = is_full_transcript_all_RNAs[is_mRNA]
unique_index_mRNAs = unique_index_all_RNAs[is_mRNA]
# Calculate available template lengths of each mRNA cistron from fully
# transcribed mRNA transcription units
TU_index_full_mRNAs = TU_index_mRNAs[is_full_transcript_mRNAs]
TU_counts_full_mRNAs = np.bincount(TU_index_full_mRNAs, minlength=len(TU_ids))
cistron_counts_full_mRNAs = (
sim_data.process.transcription.cistron_tu_mapping_matrix.dot(
TU_counts_full_mRNAs
)
)
available_cistron_lengths = np.multiply(cistron_counts_full_mRNAs, cistron_lengths)
# Add available template lengths from each partially transcribed mRNAs
TU_index_incomplete_mRNAs = TU_index_mRNAs[np.logical_not(is_full_transcript_mRNAs)]
length_incomplete_mRNAs = length_mRNAs[np.logical_not(is_full_transcript_mRNAs)]
TU_index_to_mRNA_lengths = {}
for TU_index, length in zip(TU_index_incomplete_mRNAs, length_incomplete_mRNAs):
TU_index_to_mRNA_lengths.setdefault(TU_index, []).append(length)
for TU_index, available_lengths in TU_index_to_mRNA_lengths.items():
cistron_indexes = sim_data.process.transcription.rna_id_to_cistron_indexes(
TU_ids[TU_index]
)
cistron_start_positions = np.array(
[
sim_data.process.transcription.cistron_start_end_pos_in_tu[
(cistron_index, TU_index)
][0]
for cistron_index in cistron_indexes
]
)
for length in available_lengths:
available_cistron_lengths[cistron_indexes] += np.clip(
length - cistron_start_positions, 0, cistron_lengths[cistron_indexes]
)
# Find number of ribosomes to activate
ribosome30S_idx = bulk_name_to_idx(
sim_data.molecule_ids.s30_full_complex, bulk_state["id"]
)
ribosome30S = bulk_state["count"][ribosome30S_idx]
ribosome50S_idx = bulk_name_to_idx(
sim_data.molecule_ids.s50_full_complex, bulk_state["id"]
)
ribosome50S = bulk_state["count"][ribosome50S_idx]
inactive_ribosome_count = np.minimum(ribosome30S, ribosome50S)
n_ribosomes_to_activate = np.int64(frac_active_ribosome * inactive_ribosome_count)
# Add total available template lengths as weights and normalize
protein_init_probs = normalize(
available_cistron_lengths[sim_data.relation.cistron_to_monomer_mapping]
* translation_efficiencies
)
# Sample a multinomial distribution of synthesis probabilities to determine
# which types of mRNAs are initialized
n_new_proteins = random_state.multinomial(
n_ribosomes_to_activate, protein_init_probs
)
# Build attributes for active ribosomes
protein_indexes = np.empty(n_ribosomes_to_activate, np.int64)
cistron_start_positions_on_mRNA = np.empty(n_ribosomes_to_activate, np.int64)
positions_on_mRNA_from_cistron_start_site = np.empty(
n_ribosomes_to_activate, np.int64
)
mRNA_indexes = np.empty(n_ribosomes_to_activate, np.int64)
start_index = 0
nonzero_count = n_new_proteins > 0
for protein_index, protein_counts in zip(
np.arange(n_new_proteins.size)[nonzero_count], n_new_proteins[nonzero_count]
):
# Set protein index
protein_indexes[start_index : start_index + protein_counts] = protein_index
# Get index of cistron corresponding to this protein
cistron_index = monomer_index_to_cistron_index[protein_index]
# Initialize list of available lengths for each transcript and the
# indexes of each transcript in the list of mRNA attributes
available_lengths = []
attribute_indexes = []
cistron_start_positions = []
# Distribute ribosomes among mRNAs that produce this protein, weighted
# by their lengths
for TU_index in monomer_index_to_tu_indexes[protein_index]:
attribute_indexes_this_TU = np.where(TU_index_mRNAs == TU_index)[0]
cistron_start_position = (
sim_data.process.transcription.cistron_start_end_pos_in_tu[
(cistron_index, TU_index)
][0]
)
available_lengths.extend(
np.clip(
length_mRNAs[attribute_indexes_this_TU] - cistron_start_position,
0,
cistron_lengths[cistron_index],
)
)
attribute_indexes.extend(attribute_indexes_this_TU)
cistron_start_positions.extend(
[cistron_start_position] * len(attribute_indexes_this_TU)
)
available_lengths = np.array(available_lengths)
attribute_indexes = np.array(attribute_indexes)
cistron_start_positions = np.array(cistron_start_positions)
n_ribosomes_per_RNA = random_state.multinomial(
protein_counts, normalize(available_lengths)
)
# Get unique indexes of each mRNA
mRNA_indexes[start_index : start_index + protein_counts] = np.repeat(
unique_index_mRNAs[attribute_indexes], n_ribosomes_per_RNA
)
# Get full length of this polypeptide
peptide_full_length = protein_lengths[protein_index]
# Randomly place ribosomes along the length of each mRNA, capped by the
# mRNA length expected from the full polypeptide length to prevent
# ribosomes from overshooting full peptide lengths
cistron_start_positions_on_mRNA[start_index : start_index + protein_counts] = (
np.repeat(cistron_start_positions, n_ribosomes_per_RNA)
)
positions_on_mRNA_from_cistron_start_site[
start_index : start_index + protein_counts
] = np.floor(
random_state.rand(protein_counts)
* np.repeat(
np.minimum(available_lengths, peptide_full_length * 3),
n_ribosomes_per_RNA,
)
)
start_index += protein_counts
# Calculate the lengths of the partial polypeptide, and rescale position on
# mRNA to be a multiple of three using this peptide length
peptide_lengths = np.floor_divide(positions_on_mRNA_from_cistron_start_site, 3)
positions_on_mRNA = cistron_start_positions_on_mRNA + 3 * peptide_lengths
# Update masses of partially translated proteins
sequences = protein_sequences[protein_indexes]
mass_increase_protein = computeMassIncrease(
sequences, peptide_lengths, aa_weights_incorporated
)
# Add end weight
mass_increase_protein[peptide_lengths != 0] += end_weight
# Add active ribosomes
unique_molecules["active_ribosome"] = create_new_unique_molecules(
"active_ribosome",
n_ribosomes_to_activate,
sim_data,
unique_id_rng,
protein_index=protein_indexes,
peptide_length=peptide_lengths,
mRNA_index=mRNA_indexes,
pos_on_mRNA=positions_on_mRNA,
massDiff_protein=mass_increase_protein,
)
# Decrease counts of free 30S and 50S ribosomal subunits
bulk_state["count"][ribosome30S_idx] = ribosome30S - n_ribosomes_to_activate
bulk_state["count"][ribosome50S_idx] = ribosome50S - n_ribosomes_to_activate
[docs]
def determine_chromosome_state(
tau: Unum,
replichore_length: Unum,
n_max_replisomes: int,
place_holder: int,
cell_mass: Unum,
critical_mass: Unum,
replication_rate: float,
) -> tuple[
dict[str, npt.NDArray[np.int32]],
dict[str, npt.NDArray[Any]],
dict[str, npt.NDArray[np.int32]],
]:
"""
Calculates the attributes of oriC's, replisomes, and chromosome domains on
the chromosomes at the beginning of the cell cycle.
Args:
tau: the doubling time of the cell (with Unum time unit)
replichore_length: the amount of DNA to be replicated per fork, usually
half of the genome, in base-pairs (with Unum nucleotide unit)
n_max_replisomes: the maximum number of replisomes that can be formed
given the initial counts of replisome subunits
place_holder: placeholder value for chromosome domains without child
domains
cell_mass: total mass of the cell with mass units (with Unum mass unit)
critical_mass: mass per oriC before replication is initiated
(with Unum mass unit)
replication_rate: rate of nucleotide elongation
(with Unum nucleotides per time unit)
Returns:
Three dictionaries, each containing updates to attributes of a unique molecule type.
- ``oric_state``: dictionary of the following format::
{'domain_index': a vector of integers indicating which chromosome domain the
oriC sequence belongs to.}
- ``replisome_state``: dictionary of the following format::
{'coordinates': a vector of integers that indicates where the replisomes
are located on the chromosome relative to the origin in base pairs,
'right_replichore': a vector of boolean values that indicates whether the
replisome is on the right replichore (True) or the left replichore (False),
'domain_index': a vector of integers indicating which chromosome domain the
replisomes belong to. The index of the "mother" domain of the replication
fork is assigned to the replisome}
- ``domain_state``: dictionary of the following format::
{'domain_index': the indexes of the domains,
'child_domains': the (n_domain X 2) array of the domain indexes of the two
children domains that are connected on the oriC side with the given domain.}
"""
# All inputs must be positive numbers
unitless_tau = tau.asNumber(units.s)
unitless_replichore_length = replichore_length.asNumber(units.nt)
assert unitless_tau >= 0, "tau value can't be negative."
assert unitless_replichore_length > 0, "replichore_length must be positive."
# Convert to unitless
unitless_cell_mass = cell_mass.asNumber(units.fg)
unitless_critical_mass = critical_mass.asNumber(units.fg)
# Calculate the maximum number of replication rounds given the maximum
# count of replisomes
n_max_rounds = int(np.log2(n_max_replisomes / 2 + 1))
# Calculate the number of active replication rounds
n_rounds = min(
n_max_rounds,
max(0, int(np.ceil(np.log2(unitless_cell_mass / unitless_critical_mass)))),
)
# Initialize arrays for replisomes
n_replisomes = 2 * (2**n_rounds - 1)
coordinates = np.zeros(n_replisomes, dtype=np.int64)
right_replichore_replisome = np.zeros(n_replisomes, dtype=bool)
domain_index_replisome = np.zeros(n_replisomes, dtype=np.int32)
# Initialize child domain array for chromosome domains
n_domains = 2 ** (n_rounds + 1) - 1
child_domains = np.full((n_domains, 2), place_holder, dtype=np.int32)
# Set domain_index attribute of oriC's and chromosome domains
domain_index_oric = np.arange(
2**n_rounds - 1, 2 ** (n_rounds + 1) - 1, dtype=np.int32
)
domain_index_domains = np.arange(0, n_domains, dtype=np.int32)
def n_events_before_this_round(round_idx):
"""
Calculates the number of replication events that happen before the
replication round index given as an argument. Since 2**i events happen
at each round i = 0, 1, ..., the sum of the number of events before
round j is 2**j - 1.
"""
return 2**round_idx - 1
# Loop through active replication rounds, starting from the oldest round.
# If n_round = 0 skip loop entirely - no active replication round.
for round_idx in np.arange(n_rounds):
# Determine at which location (base) of the chromosome the replication
# forks should be initialized to
round_critical_mass = 2**round_idx * unitless_critical_mass
growth_rate = np.log(2) / unitless_tau
replication_time = (
np.log(unitless_cell_mass / round_critical_mass) / growth_rate
)
# TODO: this should handle completed replication (instead of taking min)
# for accuracy but will likely never start with multiple chromosomes
fork_location = min(
np.floor(replication_time * replication_rate),
unitless_replichore_length - 1,
)
# Add 2^n initiation events per round. A single initiation event
# generates two replication forks.
n_events_this_round = 2**round_idx
# Set attributes of replisomes for this replication round
coordinates[
2 * n_events_before_this_round(round_idx) : 2
* n_events_before_this_round(round_idx + 1)
] = np.tile(np.array([fork_location, -fork_location]), n_events_this_round)
right_replichore_replisome[
2 * n_events_before_this_round(round_idx) : 2
* n_events_before_this_round(round_idx + 1)
] = np.tile(np.array([True, False]), n_events_this_round)
for i, domain_index in enumerate(
np.arange(
n_events_before_this_round(round_idx),
n_events_before_this_round(round_idx + 1),
)
):
domain_index_replisome[
2 * n_events_before_this_round(round_idx) + 2 * i : 2
* n_events_before_this_round(round_idx)
+ 2 * (i + 1)
] = np.repeat(domain_index, 2)
# Set attributes of chromosome domains for this replication round
for i, domain_index in enumerate(
np.arange(
n_events_before_this_round(round_idx + 1),
n_events_before_this_round(round_idx + 2),
2,
)
):
child_domains[n_events_before_this_round(round_idx) + i, :] = np.array(
[domain_index, domain_index + 1]
)
# Convert to numpy arrays and wrap into dictionaries
oric_state = {"domain_index": domain_index_oric}
replisome_state = {
"coordinates": coordinates,
"right_replichore": right_replichore_replisome,
"domain_index": domain_index_replisome,
}
domain_state = {
"child_domains": child_domains,
"domain_index": domain_index_domains,
}
return oric_state, replisome_state, domain_state
[docs]
def rescale_initiation_probs(init_probs, TU_index, fixed_synth_probs, fixed_TU_indexes):
"""
Rescales the initiation probabilities of each promoter such that the total
synthesis probabilities of certain types of RNAs are fixed to a
predetermined value. For instance, if there are two copies of promoters for
RNA A, whose synthesis probability should be fixed to 0.1, each promoter is
given an initiation probability of 0.05.
"""
for rna_idx, synth_prob in zip(fixed_TU_indexes, fixed_synth_probs):
fixed_rna_mask = TU_index == rna_idx
init_probs[fixed_rna_mask] = synth_prob / fixed_rna_mask.sum()
[docs]
def calculate_cell_mass(bulk_state, unique_molecules, sim_data):
"""
Calculates cell mass in femtograms.
"""
bulk_submass_names = [
f"{submass}_submass" for submass in sim_data.submass_name_to_index.keys()
]
cell_mass = (
bulk_state["count"]
.dot(rfn.structured_to_unstructured(bulk_state[bulk_submass_names]))
.sum()
)
if len(unique_molecules) > 0:
unique_masses = sim_data.internal_state.unique_molecule.unique_molecule_masses[
"mass"
].asNumber(units.fg / units.mol) / sim_data.constants.n_avogadro.asNumber(
1 / units.mol
)
unique_ids = sim_data.internal_state.unique_molecule.unique_molecule_masses[
"id"
]
unique_submass_names = [
f"massDiff_{submass}" for submass in sim_data.submass_name_to_index.keys()
]
for unique_id, unique_submasses in zip(unique_ids, unique_masses):
if unique_id in unique_molecules:
cell_mass += (
unique_molecules[unique_id]["_entryState"].sum() * unique_submasses
).sum()
cell_mass += rfn.structured_to_unstructured(
unique_molecules[unique_id][unique_submass_names]
).sum()
return units.fg * cell_mass
[docs]
def initialize_trna_charging(
bulk_state: np.ndarray,
unique_molecules: dict[str, np.ndarray],
sim_data: Any,
variable_elongation: bool,
):
"""
Initializes charged tRNA from uncharged tRNA and amino acids
Args:
bulk_state: Structured array with IDs and counts of all bulk molecules
unique_molecules: Mapping of unique molecule names to structured
arrays of their current simulation states
sim_data: Simulation data loaded from pickle generated by ParCa
variable_elongation: Sets max elongation higher if True
.. note::
Does not adjust for mass of amino acids on charged tRNA (~0.01% of cell mass)
"""
# Calculate cell volume for concentrations
cell_volume = (
calculate_cell_mass(bulk_state, unique_molecules, sim_data)
/ sim_data.constants.cell_density
)
counts_to_molar = 1 / (sim_data.constants.n_avogadro * cell_volume)
# Get molecule views and concentrations
transcription = sim_data.process.transcription
aa_from_synthetase = transcription.aa_from_synthetase
aa_from_trna = transcription.aa_from_trna
synthetases = counts(
bulk_state, bulk_name_to_idx(transcription.synthetase_names, bulk_state["id"])
)
uncharged_trna_idx = bulk_name_to_idx(
transcription.uncharged_trna_names, bulk_state["id"]
)
uncharged_trna = counts(bulk_state, uncharged_trna_idx)
charged_trna_idx = bulk_name_to_idx(
transcription.charged_trna_names, bulk_state["id"]
)
charged_trna = counts(bulk_state, charged_trna_idx)
aas = counts(
bulk_state,
bulk_name_to_idx(sim_data.molecule_groups.amino_acids, bulk_state["id"]),
)
ribosome_counts = unique_molecules["active_ribosome"]["_entryState"].sum()
synthetase_conc = counts_to_molar * np.dot(aa_from_synthetase, synthetases)
uncharged_trna_conc = counts_to_molar * np.dot(aa_from_trna, uncharged_trna)
charged_trna_conc = counts_to_molar * np.dot(aa_from_trna, charged_trna)
aa_conc = counts_to_molar * aas
ribosome_conc = counts_to_molar * ribosome_counts
# Estimate fraction of amino acids from sequences, excluding first index for padding of -1
_, aas_in_sequences = np.unique(
sim_data.process.translation.translation_sequences, return_counts=True
)
f = aas_in_sequences[1:] / np.sum(aas_in_sequences[1:])
# Estimate initial charging state
constants = sim_data.constants
transcription = sim_data.process.transcription
metabolism = sim_data.process.metabolism
elongation_max = (
constants.ribosome_elongation_rate_max
if variable_elongation
else constants.ribosome_elongation_rate_basal
)
charging_params = {
"kS": constants.synthetase_charging_rate.asNumber(1 / units.s),
"KMaa": transcription.aa_kms.asNumber(MICROMOLAR_UNITS),
"KMtf": transcription.trna_kms.asNumber(MICROMOLAR_UNITS),
"krta": constants.Kdissociation_charged_trna_ribosome.asNumber(
MICROMOLAR_UNITS
),
"krtf": constants.Kdissociation_uncharged_trna_ribosome.asNumber(
MICROMOLAR_UNITS
),
"max_elong_rate": float(elongation_max.asNumber(units.aa / units.s)),
"charging_mask": np.array(
[
aa not in REMOVED_FROM_CHARGING
for aa in sim_data.molecule_groups.amino_acids
]
),
"unit_conversion": metabolism.get_amino_acid_conc_conversion(MICROMOLAR_UNITS),
}
fraction_charged, *_ = calculate_trna_charging(
synthetase_conc,
uncharged_trna_conc,
charged_trna_conc,
aa_conc,
ribosome_conc,
f,
charging_params,
)
# Update counts of tRNA to match charging
total_trna_counts = uncharged_trna + charged_trna
charged_trna_counts = np.round(
total_trna_counts * np.dot(fraction_charged, aa_from_trna)
)
uncharged_trna_counts = total_trna_counts - charged_trna_counts
bulk_state["count"][charged_trna_idx] = charged_trna_counts
bulk_state["count"][uncharged_trna_idx] = uncharged_trna_counts