Source code for reconstruction.ecoli.dataclasses.process.translation

"""
SimulationData for translation process
"""

import numpy as np

from ecoli.library.sim_data import MAX_TIME_STEP
from wholecell.utils import data, units
from wholecell.utils.unit_struct_array import UnitStructArray
from wholecell.utils.polymerize import polymerize
from wholecell.utils.random import make_elongation_rates


PROCESS_MAX_TIME_STEP = 2.0


class Translation(object):
    """Translation"""

    def __init__(self, raw_data, sim_data):
        self.max_time_step = min(MAX_TIME_STEP, PROCESS_MAX_TIME_STEP)
        self.next_aa_pad = (
            1  # Need one extra amino acid in sequence lengths to find the next one
        )

        self._build_monomer_data(raw_data, sim_data)
        self._build_translation(raw_data, sim_data)
        self._build_translation_efficiency(raw_data, sim_data)
        self._build_elongation_rates(raw_data, sim_data)

    def __getstate__(self):
        """Return the state to pickle with translationSequences removed and
        only storing data from translationSequences with pad values stripped.
        """
        state = data.dissoc_strict(self.__dict__, ("translation_sequences",))
        state["sequences"] = np.array(
            [seq[seq != polymerize.PAD_VALUE] for seq in self.translation_sequences],
            dtype=object,
        )
        state["sequence_shape"] = self.translation_sequences.shape
        return state

    def __setstate__(self, state):
        """Restore translationSequences and remove processed versions of the data."""
        sequences = state.pop("sequences")
        sequence_shape = state.pop("sequence_shape")
        self.__dict__.update(state)

        self.translation_sequences = np.full(
            sequence_shape, polymerize.PAD_VALUE, dtype=np.int8
        )
        for i, seq in enumerate(sequences):
            self.translation_sequences[i, : len(seq)] = seq
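
    # Illustrative note (not part of the original module): __getstate__ strips
    # the PAD_VALUE padding from each row of translation_sequences before
    # pickling, and __setstate__ rebuilds the padded matrix. Assuming a toy
    # set of ragged sequences, the same strip/restore idiom looks like:
    #
    #     padded = np.full((2, 5), polymerize.PAD_VALUE, dtype=np.int8)
    #     padded[0, :3] = [4, 2, 7]
    #     padded[1, :2] = [1, 1]
    #     stripped = [row[row != polymerize.PAD_VALUE] for row in padded]
    #     restored = np.full(padded.shape, polymerize.PAD_VALUE, dtype=np.int8)
    #     for i, seq in enumerate(stripped):
    #         restored[i, : len(seq)] = seq
    #     # restored now equals padded
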
    def _build_monomer_data(self, raw_data, sim_data):
        # Get set of all cistron IDs with an associated gene and known left
        # and right end positions
        rna_id_to_gene_id = {gene["rna_ids"][0]: gene["id"] for gene in raw_data.genes}
        gene_id_to_left_end_pos = {
            gene["id"]: gene["left_end_pos"] for gene in raw_data.genes
        }
        gene_id_to_right_end_pos = {
            gene["id"]: gene["right_end_pos"] for gene in raw_data.genes
        }

        all_mRNA_cistrons = {
            rna["id"]
            for rna in raw_data.rnas
            if rna["id"] in rna_id_to_gene_id
            and gene_id_to_left_end_pos[rna_id_to_gene_id[rna["id"]]] is not None
            and gene_id_to_right_end_pos[rna_id_to_gene_id[rna["id"]]] is not None
            and rna["type"] == "mRNA"
        }

        # Get mappings from monomer IDs to cistron IDs
        # TODO (ggsun): Handle cistrons with two or more associated proteins
        monomer_id_to_cistron_id = {
            rna["monomer_ids"][0]: rna["id"]
            for rna in raw_data.rnas
            if len(rna["monomer_ids"]) > 0
        }

        # Select proteins with valid sequences and mappings to valid mRNAs
        all_proteins = []
        for protein in raw_data.proteins:
            if sim_data.getter.is_valid_molecule(protein["id"]):
                try:
                    rna_id = monomer_id_to_cistron_id[protein["id"]]
                except KeyError:
                    continue
                if rna_id in all_mRNA_cistrons:
                    all_proteins.append(protein)

        # Get protein IDs with compartments
        protein_ids = [protein["id"] for protein in all_proteins]
        protein_compartments = sim_data.getter.get_compartments(protein_ids)
        assert all([len(loc) == 1 for loc in protein_compartments])

        protein_ids_with_compartments = [
            f"{protein_id}[{loc[0]}]"
            for (protein_id, loc) in zip(protein_ids, protein_compartments)
        ]
        n_proteins = len(protein_ids)

        # Get cistron IDs associated with each monomer
        cistron_ids = [
            monomer_id_to_cistron_id[protein["id"]] for protein in all_proteins
        ]

        # Get lengths and amino acid counts of each protein
        protein_seqs = sim_data.getter.get_sequences(protein_ids)
        lengths = [len(seq) for seq in protein_seqs]
        aa_counts = [
            [seq.count(aa) for aa in sim_data.amino_acid_code_to_id_ordered.keys()]
            for seq in protein_seqs
        ]
        n_amino_acids = len(sim_data.amino_acid_code_to_id_ordered)

        # Get molecular weights
        mws = sim_data.getter.get_masses(protein_ids).asNumber(units.g / units.mol)

        # Calculate degradation rates based on the N-end rule
        deg_rate_units = 1 / units.s
        n_end_rule_deg_rates = {
            row["aa_code"]: (np.log(2) / (row["half life"])).asNumber(deg_rate_units)
            for row in raw_data.protein_half_lives_n_end_rule
        }

        # Get degradation rates from measured protein half-lives
        measured_deg_rates = {
            p["id"]: (np.log(2) / p["half life"]).asNumber(deg_rate_units)
            for p in raw_data.protein_half_lives_measured
        }

        # Get degradation rates from Nagar (2022) pulsed-SILAC half-lives
        pulsed_silac_deg_rates = {
            p["id"]: (np.log(2) / p["half_life"]).asNumber(deg_rate_units)
            for p in raw_data.protein_half_lives_pulsed_silac
        }

        deg_rate = np.zeros(len(all_proteins))
        for i, protein in enumerate(all_proteins):
            # Use measured degradation rates if available
            if protein["id"] in measured_deg_rates:
                deg_rate[i] = measured_deg_rates[protein["id"]]
            elif protein["id"] in pulsed_silac_deg_rates:
                deg_rate[i] = pulsed_silac_deg_rates[protein["id"]]
            # If measured rates are unavailable, use the N-end rule
            else:
                seq = protein["seq"]
                assert (
                    seq[0] == "M"
                )  # All protein sequences should start with methionine

                # Set N-end residue as second amino acid if initial methionine
                # is cleaved
                n_end_residue = seq[protein["cleavage_of_initial_methionine"]]
                deg_rate[i] = n_end_rule_deg_rates[n_end_residue]

        max_protein_id_length = max(
            len(protein_id) for protein_id in protein_ids_with_compartments
        )
        max_cistron_id_length = max(len(cistron_id) for cistron_id in cistron_ids)

        monomer_data = np.zeros(
            n_proteins,
            dtype=[
                ("id", "U{}".format(max_protein_id_length)),
                ("cistron_id", "U{}".format(max_cistron_id_length)),
                ("deg_rate", "f8"),
                ("length", "i8"),
                ("aa_counts", "{}i8".format(n_amino_acids)),
                ("mw", "f8"),
            ],
        )
        monomer_data["id"] = protein_ids_with_compartments
        monomer_data["cistron_id"] = cistron_ids
        monomer_data["deg_rate"] = deg_rate
        monomer_data["length"] = lengths
        monomer_data["aa_counts"] = aa_counts
        monomer_data["mw"] = mws

        field_units = {
            "id": None,
            "cistron_id": None,
            "deg_rate": deg_rate_units,
            "length": units.aa,
            "aa_counts": units.aa,
            "mw": units.g / units.mol,
        }

        self.monomer_data = UnitStructArray(monomer_data, field_units)
        self.n_monomers = len(self.monomer_data)
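
    # Illustrative note (not part of the original module): degradation rates
    # are first-order rate constants derived from half-lives as
    # deg_rate = ln(2) / t_half, with half-lives carrying time units in the
    # raw data. For example, assuming a half-life of 2 minutes:
    #
    #     t_half = 2 * units.min
    #     (np.log(2) / t_half).asNumber(1 / units.s)  # ~0.0058 1/s
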
    def _build_translation(self, raw_data, sim_data):
        sequences = sim_data.getter.get_sequences(
            [protein_id[:-3] for protein_id in self.monomer_data["id"]]
        )

        max_len = (
            np.int64(
                self.monomer_data["length"].asNumber().max()
                + self.max_time_step
                * sim_data.constants.ribosome_elongation_rate_max.asNumber(
                    units.aa / units.s
                )
            )
            + self.next_aa_pad
        )

        self.translation_sequences = np.full(
            (len(sequences), max_len), polymerize.PAD_VALUE, dtype=np.int8
        )
        aa_ids_single_letter = sim_data.amino_acid_code_to_id_ordered.keys()
        aaMapping = {aa: i for i, aa in enumerate(aa_ids_single_letter)}
        for i, sequence in enumerate(sequences):
            for j, letter in enumerate(sequence):
                self.translation_sequences[i, j] = aaMapping[letter]

        aaIDs = list(sim_data.amino_acid_code_to_id_ordered.values())

        self.translation_monomer_weights = (
            (
                sim_data.getter.get_masses(aaIDs)
                - sim_data.getter.get_masses([sim_data.molecule_ids.water])
            )
            / sim_data.constants.n_avogadro
        ).asNumber(units.fg)
        self.translation_end_weight = (
            sim_data.getter.get_masses([sim_data.molecule_ids.water])
            / sim_data.constants.n_avogadro
        ).asNumber(units.fg)

        # Load active ribosome footprint on RNA
        molecule_id_to_footprint_sizes = {
            row["molecule_id"]: row["footprint_size"]
            for row in raw_data.footprint_sizes
        }
        try:
            self.active_ribosome_footprint_size = molecule_id_to_footprint_sizes[
                "active_ribosome"
            ]
        except KeyError:
            raise ValueError("RNA footprint size for ribosomes not found.")
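
    # Illustrative note (not part of the original module): each row of
    # translation_sequences encodes one protein as amino-acid indices (in the
    # order of sim_data.amino_acid_code_to_id_ordered), padded out to max_len
    # with polymerize.PAD_VALUE so the matrix can be consumed by the
    # polymerize utility. Assuming a toy two-letter mapping {"M": 0, "A": 1}
    # and max_len = 5, the sequence "MAA" would be stored as:
    #
    #     [0, 1, 1, PAD_VALUE, PAD_VALUE]
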
    def _build_translation_efficiency(self, raw_data, sim_data):
        monomer_ids = [protein_id[:-3] for protein_id in self.monomer_data["id"]]

        # Get mappings from monomer IDs to gene IDs
        monomer_id_to_rna_id = {
            rna["monomer_ids"][0]: rna["id"]
            for rna in raw_data.rnas
            if len(rna["monomer_ids"]) > 0
        }
        rna_id_to_gene_id = {gene["rna_ids"][0]: gene["id"] for gene in raw_data.genes}
        monomer_id_to_gene_id = {
            monomer_id: rna_id_to_gene_id[monomer_id_to_rna_id[monomer_id]]
            for monomer_id in monomer_ids
        }

        # Get mappings from gene IDs to translation efficiencies
        gene_id_to_trl_eff = {
            x["geneId"]: x["translationEfficiency"]
            for x in raw_data.translation_efficiency
            if isinstance(x["translationEfficiency"], float)
        }

        trl_effs = []
        for monomer_id in monomer_ids:
            gene_id = monomer_id_to_gene_id[monomer_id]

            if gene_id in gene_id_to_trl_eff:
                trl_effs.append(gene_id_to_trl_eff[gene_id])
            else:
                trl_effs.append(np.nan)

        # If efficiency is unavailable, the average of existing efficiencies
        # is used
        self.translation_efficiencies_by_monomer = np.array(trl_effs)
        self.translation_efficiencies_by_monomer[
            np.isnan(self.translation_efficiencies_by_monomer)
        ] = np.nanmean(self.translation_efficiencies_by_monomer)
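
    # Illustrative note (not part of the original module): monomers without a
    # reported translation efficiency are assigned the mean of the reported
    # values via the NaN-fill idiom above. With a toy array:
    #
    #     effs = np.array([0.5, np.nan, 1.5])
    #     effs[np.isnan(effs)] = np.nanmean(effs)  # -> [0.5, 1.0, 1.5]
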
    def _build_elongation_rates(self, raw_data, sim_data):
        protein_ids = self.monomer_data["id"]
        ribosomal_protein_ids = sim_data.molecule_groups.ribosomal_proteins

        protein_indexes = {protein: index for index, protein in enumerate(protein_ids)}

        ribosomal_proteins = {
            rprotein: protein_indexes.get(rprotein, -1)
            for rprotein in ribosomal_protein_ids
        }

        self.ribosomal_protein_indexes = np.array(
            [index for index in sorted(ribosomal_proteins.values()) if index >= 0],
            dtype=np.int64,
        )

        self.basal_elongation_rate = (
            sim_data.constants.ribosome_elongation_rate_basal.asNumber(
                units.aa / units.s
            )
        )
        self.max_elongation_rate = (
            sim_data.constants.ribosome_elongation_rate_max.asNumber(
                units.aa / units.s
            )
        )
        self.elongation_rates = np.full(
            self.n_monomers, self.basal_elongation_rate, dtype=np.int64
        )

        self.elongation_rates[self.ribosomal_protein_indexes] = (
            self.max_elongation_rate
        )
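
    # Illustrative note (not part of the original module): every monomer starts
    # at the basal elongation rate and only ribosomal proteins are bumped to
    # the maximum rate. With toy values (basal=17, max=22, ribosomal protein
    # indexes [1, 3] out of 5 monomers):
    #
    #     rates = np.full(5, 17, dtype=np.int64)
    #     rates[np.array([1, 3])] = 22  # -> [17, 22, 17, 22, 17]
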
    def make_elongation_rates(
        self, random, base, time_step, variable_elongation=False
    ):
        return make_elongation_rates(
            random,
            self.n_monomers,
            base,
            self.ribosomal_protein_indexes,
            self.max_elongation_rate,
            time_step,
            variable_elongation,
        )
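
# Illustrative usage sketch (not part of the original module): a process would
# typically request per-monomer elongation rates for the current time step,
# passing a numpy random state so the underlying
# wholecell.utils.random.make_elongation_rates helper can round rates
# stochastically. The `translation` and `random_state` objects below are
# assumed to exist in the caller's context, and the basal rate is used as the
# base rate purely as an example:
#
#     rates = translation.make_elongation_rates(
#         random_state,
#         translation.basal_elongation_rate,
#         time_step=2.0,
#         variable_elongation=False,
#     )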