"""
SimulationData state associated data
"""
from wholecell.utils import units
from reconstruction.ecoli.dataclasses.state.bulkMolecules import BulkMolecules
from reconstruction.ecoli.dataclasses.state.uniqueMolecules import UniqueMolecules
import numpy as np
class InternalState(object):
    """
    Internal state of the simulated cell.

    Builds and holds the bulk-molecule state, the unique-molecule state and
    the table of cell compartments from ``raw_data`` and ``sim_data``.

    Attributes:
        bulk_molecules (BulkMolecules): Bulk molecule state; populated with
            every valid molecule ID (with compartment tag) and its mass.
        unique_molecule (UniqueMolecules): Unique molecule state; populated
            with the name, attribute layout and mass of each unique
            molecule class.
        compartments (np.ndarray): Structured array with fields ``id``
            ("U20") and ``compartmentAbbreviation`` ("U1"), one row per
            entry of ``raw_data.compartments``.
    """

    def __init__(self, raw_data, sim_data):
        """
        Build all internal-state data.

        Besides setting attributes on this object, the build steps append
        molecule names to several division-group lists under
        ``sim_data.molecule_groups``.

        Args:
            raw_data: Raw data object; must expose ``compartments``.
            sim_data: Simulation data object; must expose ``getter``,
                ``molecule_ids``, ``molecule_groups``,
                ``submass_name_to_index`` and
                ``process.transcription_regulation``.
        """
        self.bulk_molecules = BulkMolecules(raw_data, sim_data)
        self.unique_molecule = UniqueMolecules(raw_data, sim_data)

        # Bulk molecules must be built first: the unique molecule masses are
        # looked up from the bulk molecule data.
        self._build_bulk_molecules(sim_data)
        self._build_unique_molecules(sim_data)
        self._build_compartments(raw_data, sim_data)

    def _build_bulk_molecules(self, sim_data):
        """
        Add data (IDs and mass) for all classes of bulk molecules.

        Also registers every added molecule ID in
        ``sim_data.molecule_groups.bulk_molecules_binomial_division``.
        """
        all_bulk_molecule_ids = sim_data.getter.get_all_valid_molecules()
        all_bulk_molecule_ids_with_compartments, all_bulk_molecule_masses = (
            self._build_bulk_molecule_specs(sim_data, all_bulk_molecule_ids)
        )

        self.bulk_molecules.add_to_bulk_state(
            all_bulk_molecule_ids_with_compartments, all_bulk_molecule_masses
        )
        sim_data.molecule_groups.bulk_molecules_binomial_division.extend(
            all_bulk_molecule_ids_with_compartments
        )

    def _build_bulk_molecule_specs(self, sim_data, molecule_ids):
        """
        Builds a list of molecule IDs with compartment tags and a
        corresponding array of molecular masses to add to the bulk state.

        Args:
            sim_data: Simulation data object; its ``getter`` supplies the
                submass array and the compartments of each molecule.
            molecule_ids (List[str]): List of molecule IDs w/o compartment
                tags

        Returns:
            molecule_ids_with_compartments (List[str]): List of molecule IDs
                with compartment tags, formatted as "<id>[<compartment>]"
            masses (np.ndarray): Array of molecular masses divided into
                submasses, in units of g/mol
        """
        molecule_ids_with_compartments = []
        masses = []

        # Loop through each molecule species and associated compartments
        for molecule_id in molecule_ids:
            mw = sim_data.getter.get_submass_array(molecule_id).asNumber(
                units.g / units.mol
            )

            # The same submass array is used for every compartment the
            # molecule can be found in.
            for loc in sim_data.getter.get_compartment(molecule_id):
                molecule_ids_with_compartments.append(
                    "{}[{}]".format(molecule_id, loc)
                )
                masses.append(mw)

        masses = (units.g / units.mol) * np.array(masses)

        return molecule_ids_with_compartments, masses

    @staticmethod
    def _zero_mass_like(reference_mass):
        """
        Return a new all-zero mass array (g/mol) shaped like
        ``reference_mass``.

        A fresh array is created on every call so that a caller can fill in
        entries without affecting the mass of any other molecule.
        """
        return (units.g / units.mol) * np.zeros_like(reference_mass)

    def _build_unique_molecules(self, sim_data):
        """
        Add data (name, mass, and attribute data structure) for all classes
        of unique molecules.

        Each molecule class is also appended to the list under
        ``sim_data.molecule_groups`` that names how it is split between
        daughter cells at division.
        """
        # Set up dictionary for quick indexing of bulk molecule masses
        bulk_molecule_id_to_mass = dict(
            zip(
                self.bulk_molecules.bulk_data["id"],
                self.bulk_molecules.bulk_data["mass"],
            )
        )
        molecule_groups = sim_data.molecule_groups

        # Add active RNA polymerase
        # The attributes of active RNA polymerases are given as:
        # - domain_index (32-bit int): Domain index of the chromosome domain
        # that the RNAP is bound to. This value is used to split the RNAPs at
        # cell division.
        # - coordinates (64-bit int): Location of the RNAP on the chromosome,
        # in base pairs from origin.
        # - is_forward (bool): True if RNAP is moving in the positive direction
        # of the genomic coordinates, False if RNAP is moving in the negative
        # direction. This is determined by the orientation of the TU that the
        # RNAP is transcribing.
        RNAP_mass = bulk_molecule_id_to_mass[sim_data.molecule_ids.full_RNAP]
        RNAP_attributes = {
            "domain_index": "i4",
            "coordinates": "i8",
            "is_forward": "?",
        }

        self.unique_molecule.add_to_unique_state(
            "active_RNAP", RNAP_attributes, RNAP_mass
        )

        # RNAPs are divided based on the index of the chromosome domain they
        # are bound to
        molecule_groups.unique_molecules_domain_index_division.append(
            "active_RNAP"
        )

        # Add RNAs
        # This molecule represents all RNAs that should be represented as
        # unique molecules. These RNAs include all partially transcribed RNAs
        # and fully transcribed mRNAs. The attributes of RNAs are given as:
        # - TU_index (64-bit int): Index of the transcription unit that the
        # RNA molecule is representing. Determines the sequence and the length
        # of the fully elongated RNA.
        # - transcript_length (64-bit int): Current length of the RNA.
        # - is_mRNA (bool): True if RNA represents an mRNA molecule.
        # - is_full_transcript (bool): True if RNA is fully transcribed and
        # released from the RNA polymerase, False if RNA is being actively
        # transcribed. This cannot be True if 'is_mRNA' is False, since all
        # non-mRNAs are represented as bulk molecules when they are fully
        # transcribed.
        # - can_translate (bool): True if the 5' end of the mRNA molecule is
        # available for translation. This flag is used only for mRNA molecules.
        # - RNAP_index (64-bit int): Unique index of the RNA polymerase that is
        # synthesizing the RNA or has synthesized the mRNA. For fully
        # transcribed mRNAs that are added at initialization, this attribute
        # is set to -1.
        RNA_mass = self._zero_mass_like(RNAP_mass)
        RNA_attributes = {
            "TU_index": "i8",
            "transcript_length": "i8",
            "is_mRNA": "?",
            "is_full_transcript": "?",
            "can_translate": "?",
            "RNAP_index": "i8",
        }

        self.unique_molecule.add_to_unique_state("RNA", RNA_attributes, RNA_mass)

        # Fully transcribed mRNAs are divided binomially, partial transcripts
        # are divided based on which chromosome domains their associated RNAPs
        # are bound to
        molecule_groups.unique_molecules_RNA_division.append("RNA")

        # Add active ribosomes
        # The attributes of active ribosomes are given as:
        # - protein_index (64-bit int): Index of the protein monomer that the
        # ribosome is translating
        # - peptide_length (64-bit int): Current length of the polypeptide that
        # the ribosome is translating, in number of amino acids
        # - mRNA_index (64-bit int): Unique index of the mRNA that the ribosome
        # is bound to
        # - pos_on_mRNA (64-bit int): Location of the ribosome on the bound
        # mRNA, in number of bases from the transcription start site
        ribosome_30S_mass = bulk_molecule_id_to_mass[
            sim_data.molecule_ids.s30_full_complex
        ]
        ribosome_50S_mass = bulk_molecule_id_to_mass[
            sim_data.molecule_ids.s50_full_complex
        ]
        # The active ribosome carries the mass of both of its subunits
        ribosome_mass = ribosome_30S_mass + ribosome_50S_mass
        ribosome_attributes = {
            "protein_index": "i8",
            "peptide_length": "i8",
            "mRNA_index": "i8",
            "pos_on_mRNA": "i8",
        }

        self.unique_molecule.add_to_unique_state(
            "active_ribosome", ribosome_attributes, ribosome_mass
        )

        # Active ribosomes are divided such that they always follow the mRNA
        # molecule they are bound to
        molecule_groups.unique_molecules_active_ribosome_division.append(
            "active_ribosome"
        )

        # Add full chromosomes
        # One full chromosome molecule is added when chromosome replication is
        # complete, and sets cell division to happen after a length of time
        # specified by the D period (if D_PERIOD_DIVISION is set to True).
        # The 'has_triggered_division' attribute is initially set to False, and
        # is reset to True when division_time was reached and the cell has
        # divided. The 'domain_index' keeps track of the index of the oldest
        # chromosome domain that is part of the full chromosome.
        full_chromosome_mass = self._zero_mass_like(RNAP_mass)
        # All of the chromosome's mass is assigned to the "DNA" submass
        full_chromosome_mass[sim_data.submass_name_to_index["DNA"]] = (
            sim_data.getter.get_mass(sim_data.molecule_ids.full_chromosome)
        )
        full_chromosome_attributes = {
            "division_time": "f8",
            "has_triggered_division": "?",
            "domain_index": "i4",
        }

        self.unique_molecule.add_to_unique_state(
            "full_chromosome", full_chromosome_attributes, full_chromosome_mass
        )

        # Full chromosomes are divided based on their domain index
        molecule_groups.unique_molecules_domain_index_division.append(
            "full_chromosome"
        )

        # Add chromosome domains
        # Chromosome domains are zero-mass molecules that account for the
        # structures of replicating chromosomes. Each replication initiation
        # event creates two new chromosome domains that are given a unique
        # integer 'domain_index'. These two new domains are child domains of
        # the original domain that the origin belonged to.
        chromosome_domain_mass = self._zero_mass_like(RNAP_mass)
        chromosome_domain_attributes = {
            "domain_index": "i4",
            "child_domains": ("i4", 2),
        }

        self.unique_molecule.add_to_unique_state(
            "chromosome_domain", chromosome_domain_attributes, chromosome_domain_mass
        )

        # Chromosome domains are divided based on their domain index
        molecule_groups.unique_molecules_domain_index_division.append(
            "chromosome_domain"
        )

        # Add active replisomes
        # Note that the replisome does not functionally replicate the
        # chromosome, but instead keeps track of the mass associated with
        # essential subunits of the replisome complex (if the mechanistic
        # replisome option is turned on). The list of essential subunits and
        # their stoichiometry were taken from Reyes-Lamothe et al., 2010.
        # The mass registered here is all zeros.
        replisome_mass = self._zero_mass_like(RNAP_mass)
        replisome_attributes = {
            "domain_index": "i4",
            "right_replichore": "?",
            "coordinates": "i8",
        }

        self.unique_molecule.add_to_unique_state(
            "active_replisome", replisome_attributes, replisome_mass
        )

        # Active replisomes are divided based on their domain index
        molecule_groups.unique_molecules_domain_index_division.append(
            "active_replisome"
        )

        # Add origins of replication
        # Note that origins are conceptual molecules and have zero mass. The
        # chromosomeIndexes of oriC's determine the chromosomeIndexes of the
        # new partial chromosomes and replisomes initiated on the same oriC.
        # NOTE(review): "chromosomeIndexes" presumably refers to the
        # 'domain_index' attribute registered below - confirm.
        origin_mass = self._zero_mass_like(RNAP_mass)
        origin_attributes = {
            "domain_index": "i4",
        }

        self.unique_molecule.add_to_unique_state(
            "oriC", origin_attributes, origin_mass
        )

        # oriC's are divided based on their domain index
        molecule_groups.unique_molecules_domain_index_division.append("oriC")

        # Add promoters
        # Promoters are sequences on the DNA where RNA polymerases bind to and
        # initiate transcription. They can also be bound to transcription
        # factors (TFs), if the transcription unit associated with the promoter
        # is regulated by TFs. The promoter itself has zero mass but can hold
        # the mass of the transcription factor that it is bound to. Its
        # attributes are given as:
        # - TU_index (64-bit int): Index of the transcription unit that
        # the promoter is associated with. This determines which TFs the
        # promoter can bind to, and how the transcription probability is
        # calculated.
        # - coordinates (64-bit int): Location of the promoter on the
        # chromosome, in base pairs from origin. This value does not change
        # after the molecule is initialized.
        # - domain_index (32-bit int): Domain index of the chromosome domain
        # that the promoter belongs to. This value is used to split the
        # promoters at cell division.
        # - bound_TF (boolean array of length n_tf): A boolean array that
        # shows which TFs the promoter is bound to. Note that one promoter can
        # bind to multiple TFs.
        n_tf = len(sim_data.process.transcription_regulation.tf_ids)

        promoter_mass = self._zero_mass_like(RNAP_mass)
        promoter_attributes = {
            "TU_index": "i8",
            "coordinates": "i8",
            "domain_index": "i4",
            "bound_TF": ("?", n_tf),
        }

        self.unique_molecule.add_to_unique_state(
            "promoter", promoter_attributes, promoter_mass
        )

        # Promoters are divided based on their domain index
        molecule_groups.unique_molecules_domain_index_division.append(
            "promoter"
        )

        # Add genes
        # Genes are sequences on the DNA that encode for particular cistrons.
        # Its attributes are given as:
        # - cistron_index (64-bit int): Index of the cistron that the gene
        # encodes for.
        # - coordinates (64-bit int): Location of the gene on the chromosome, in
        # base pairs from origin. This value does not change after the molecule
        # is initialized.
        # - domain_index (32-bit int): Domain index of the chromosome domain
        # that the gene belongs to. This value is used to split the genes at
        # cell division.
        gene_mass = self._zero_mass_like(RNAP_mass)
        gene_attributes = {
            "cistron_index": "i8",
            "coordinates": "i8",
            "domain_index": "i4",
        }

        self.unique_molecule.add_to_unique_state("gene", gene_attributes, gene_mass)

        # Genes are divided based on their domain index
        molecule_groups.unique_molecules_domain_index_division.append("gene")

        # Add chromosomal segments
        # Chromosomal segments are segments of DNA that are topologically
        # constrained, such that the changes in the linking number of the
        # segment do not spread beyond its boundaries. For a relaxed DNA double
        # helix, the linking number is proportional to the length of the
        # chromosomal segment. Currently only active RNA polymerases and
        # replisomes serve as segment boundaries. Its attributes are given as:
        # - boundary_molecule_indexes (pair of 64-bit ints): Unique indexes
        # of molecules at the boundaries of the chromosomal segment
        # - boundary_coordinates (pair of 64-bit ints): Coordinates of the
        # boundaries of the chromosomal segment
        # - domain_index (32-bit int): Domain index of the chromosome domain
        # that the segment belongs to.
        # - linking_number (64-bit float): Linking number of the chromosomal
        # segment.
        chromosomal_segment_mass = self._zero_mass_like(RNAP_mass)
        chromosomal_segment_attributes = {
            "boundary_molecule_indexes": ("i8", 2),
            "boundary_coordinates": ("i8", 2),
            "domain_index": "i4",
            "linking_number": "f8",
        }

        self.unique_molecule.add_to_unique_state(
            "chromosomal_segment",
            chromosomal_segment_attributes,
            chromosomal_segment_mass,
        )

        # Chromosomal segments are divided based on their domain indexes, but
        # division occurs after all chromosome-bound molecules are divided to
        # properly reset the boundary_molecule_indexes attribute
        molecule_groups.unique_molecules_chromosomal_segment_division.append(
            "chromosomal_segment"
        )

        # Add DnaA boxes
        # DnaA boxes are 9-base sequence motifs on the DNA that bind to the
        # protein DnaA. Except for DnaA boxes close to the origin, these boxes
        # serve no functional role in replication initiation, but can
        # effectively titrate away free DnaA molecules and control its
        # concentration. The molecule itself has zero mass but it can hold the
        # mass of the DnaA protein that it is bound to. Its attributes are
        # given as:
        # - coordinates (64-bit int): Location of the middle base (5th base) of
        # the DnaA box, in base pairs from origin. This value does not change
        # after the molecule is initialized.
        # - domain_index (32-bit int): Domain index of the chromosome domain
        # that the DnaA box belongs to. This value is used to allocate DnaA
        # boxes to the two daughter cells at cell division.
        # - DnaA_bound (boolean): True if bound to a DnaA protein, False if not
        DnaA_box_mass = self._zero_mass_like(RNAP_mass)
        DnaA_box_attributes = {
            "coordinates": "i8",
            "domain_index": "i4",
            "DnaA_bound": "?",
        }

        self.unique_molecule.add_to_unique_state(
            "DnaA_box", DnaA_box_attributes, DnaA_box_mass
        )

        # DnaA boxes are divided based on their domain index
        molecule_groups.unique_molecules_domain_index_division.append(
            "DnaA_box"
        )

    def _build_compartments(self, raw_data, sim_data):
        """
        Build the structured array of cell compartments.

        Args:
            raw_data: Object whose ``compartments`` is a sized iterable of
                mappings with "id" and "abbrev" keys.
            sim_data: Unused; kept so that the signature matches the other
                build methods.

        Sets:
            self.compartments (np.ndarray): One row per compartment with
                fields ``id`` and ``compartmentAbbreviation``.
        """
        # Fixed-width unicode fields: numpy silently truncates an ID longer
        # than 20 characters or an abbreviation longer than 1 character.
        compartment_data = np.empty(
            len(raw_data.compartments),
            dtype=[("id", "U20"), ("compartmentAbbreviation", "U1")],
        )

        compartment_data["id"] = [x["id"] for x in raw_data.compartments]
        compartment_data["compartmentAbbreviation"] = [
            x["abbrev"] for x in raw_data.compartments
        ]

        self.compartments = compartment_data