Source code for reconstruction.ecoli.dataclasses.process.replication

"""
SimulationData for replication process
"""

import numpy as np
import collections

from ecoli.library.sim_data import MAX_TIME_STEP
from wholecell.utils import units
from wholecell.utils.polymerize import polymerize
from wholecell.utils.random import stochasticRound


PROCESS_MAX_TIME_STEP = 2.0


[docs] class SiteNotFoundError(Exception): pass
[docs] class Replication(object): """ SimulationData for the replication process """ def __init__(self, raw_data, sim_data): self.max_time_step = min(MAX_TIME_STEP, PROCESS_MAX_TIME_STEP) self._n_nt_types = len(sim_data.dntp_code_to_id_ordered) self._build_sequence(raw_data, sim_data) self._build_gene_data(raw_data, sim_data) self._build_sites(raw_data, sim_data) self._build_replication(raw_data, sim_data) self._build_motifs(raw_data, sim_data) self._build_elongation_rates(raw_data, sim_data) self.c_period = sim_data.constants.c_period self.d_period = sim_data.constants.d_period self.c_period_in_mins = self.c_period.asNumber(units.min) self.d_period_in_mins = self.d_period.asNumber(units.min)
[docs] def _build_sequence(self, raw_data, sim_data): self.genome_sequence = raw_data.genome_sequence self.genome_sequence_rc = self.genome_sequence.reverse_complement() self.genome_length = len(self.genome_sequence) self.genome_A_count = self.genome_sequence.count("A") self.genome_T_count = self.genome_sequence.count("T") self.genome_G_count = self.genome_sequence.count("G") self.genome_C_count = self.genome_sequence.count("C")
[docs] def _build_gene_data(self, raw_data, sim_data): """ Build gene-associated simulation data from raw data. """ def extract_data(raw, key, use_first_from_list=False): if use_first_from_list: data = [row[key][0] for row in raw] else: data = [row[key] for row in raw] dtype = "U{}".format(max(len(d) for d in data if d is not None)) return data, dtype names, name_dtype = extract_data(raw_data.genes, "id") symbols, symbol_dtype = extract_data(raw_data.genes, "symbol") cistron_ids, cistron_dtype = extract_data( raw_data.genes, "rna_ids", use_first_from_list=True ) self.gene_data = np.zeros( len(raw_data.genes), dtype=[ ("name", name_dtype), ("symbol", symbol_dtype), ("cistron_id", cistron_dtype), ], ) self.gene_data["name"] = names self.gene_data["symbol"] = symbols self.gene_data["cistron_id"] = cistron_ids
[docs] def _build_sites(self, raw_data, sim_data): """ Build simulation data associated with DNA sites from raw_data. """ def get_site_center_coordinates(site_id): """ Calculate the center coordinates (rounded average of left and right end coordinates) of a given DNA site. """ try: left, right = sim_data.getter.get_genomic_coordinates(site_id) center_coordinate = round((left + right) / 2) except (KeyError, TypeError): raise SiteNotFoundError( f"Coordinates of DNA site with ID {site_id} were not found in raw_data." ) return center_coordinate # Get coordinates of oriC and terC self.oric_coordinate = get_site_center_coordinates( sim_data.molecule_ids.oriC_site ) self.terc_coordinate = get_site_center_coordinates( sim_data.molecule_ids.terC_site )
[docs] def _build_replication(self, raw_data, sim_data): """ Build replication-associated simulation data from raw data. """ # Map ATGC to 8 bit integers numerical_sequence = np.empty(self.genome_length, np.int8) ntMapping = collections.OrderedDict( [ (ntp_code, i) for i, ntp_code in enumerate(sim_data.dntp_code_to_id_ordered.keys()) ] ) for i, letter in enumerate(raw_data.genome_sequence): numerical_sequence[i] = ntMapping[ letter ] # Build genome sequence as small integers # Create 4 possible polymerization sequences # Forward sequence includes oriC self.forward_sequence = numerical_sequence[ np.hstack( ( np.arange(self.oric_coordinate, self.genome_length), np.arange(0, self.terc_coordinate), ) ) ] # Reverse sequence includes terC self.reverse_sequence = numerical_sequence[ np.arange(self.oric_coordinate - 1, self.terc_coordinate - 1, -1) ] self.forward_complement_sequence = self._get_complement_sequence( self.forward_sequence ) self.reverse_complement_sequence = self._get_complement_sequence( self.reverse_sequence ) assert ( self.forward_sequence.size + self.reverse_sequence.size == self.genome_length ) # Log array of lengths of each replichore self.replichore_lengths = np.array( [ self.forward_sequence.size, self.reverse_sequence.size, ] ) # Determine size of the matrix used by polymerize function maxLen = np.int64( self.replichore_lengths.max() + self.max_time_step * sim_data.constants.replisome_elongation_rate.asNumber(units.nt / units.s) ) self.replication_sequences = np.empty((4, maxLen), np.int8) self.replication_sequences.fill(polymerize.PAD_VALUE) self.replication_sequences[0, : self.forward_sequence.size] = ( self.forward_sequence ) self.replication_sequences[1, : self.forward_complement_sequence.size] = ( self.forward_complement_sequence ) self.replication_sequences[2, : self.reverse_sequence.size] = ( self.reverse_sequence ) self.replication_sequences[3, : self.reverse_complement_sequence.size] = ( self.reverse_complement_sequence ) # Get polymerized nucleotide weights self.replication_monomer_weights = ( sim_data.getter.get_masses(sim_data.molecule_groups.dntps) - sim_data.getter.get_masses([sim_data.molecule_ids.ppi]) ) / sim_data.constants.n_avogadro # Placeholder value for "child_domains" attribute of domains without # children domains self.no_child_place_holder = -1
[docs] def _build_motifs(self, raw_data, sim_data): """ Build simulation data associated with sequence motifs from raw_data. Coordinates of all motifs are calculated based on the given sequences of the genome and the motifs. """ # Initialize dictionary of motif coordinates self.motif_coordinates = dict() for motif in raw_data.sequence_motifs: # Keys are the IDs of the motifs; Values are arrays of the motif's # coordinates. self.motif_coordinates[motif["id"]] = self._get_motif_coordinates( motif["length"], motif["sequences"] )
[docs] def _get_complement_sequence(self, sequenceVector): """ Calculates the vector for a complement sequence of a DNA sequence given in vector form. """ return (self._n_nt_types - 1) - sequenceVector
[docs] def _get_motif_coordinates(self, motif_length, motif_sequences): """ Finds the coordinates of all sequence motifs of a specific type. The coordinates are given as the positions of the midpoint of the motif relative to the oriC in base pairs. """ # Append the first n bases to the end of the sequences to account for # motifs that span the two endpoints extended_sequence = ( self.genome_sequence + self.genome_sequence[: (motif_length - 1)] ) extended_rc_sequence = ( self.genome_sequence_rc + self.genome_sequence_rc[: (motif_length - 1)] ) # Initialize list for coordinates of motifs coordinates = [] # Loop through all possible motif sequences for sequence in motif_sequences: # Find occurrences of the motif in the original sequence loc = extended_sequence.find(sequence) # Returns -1 if not found while loc != -1: coordinates.append(loc + motif_length // 2) loc = extended_sequence.find(sequence, loc + 1) # Find occurrences of the motif in the reverse complement loc = extended_rc_sequence.find(sequence) while loc != -1: coordinates.append( self.genome_length - loc - motif_length + motif_length // 2 ) loc = extended_rc_sequence.find(sequence, loc + 1) # Compute coordinates relative to oriC motif_coordinates = self._get_relative_coordinates(np.array(coordinates)) motif_coordinates.sort() return motif_coordinates
[docs] def _get_relative_coordinates(self, coordinates): """ Converts an array of genomic coordinates into coordinates relative to the origin of replication. """ relative_coordinates = ( (coordinates - self.terc_coordinate) % self.genome_length + self.terc_coordinate - self.oric_coordinate ) relative_coordinates[relative_coordinates < 0] += 1 return relative_coordinates
[docs] def _build_elongation_rates(self, raw_data, sim_data): self.basal_elongation_rate = int( round( sim_data.constants.replisome_elongation_rate.asNumber( units.nt / units.s ) ) )
[docs] def make_elongation_rates(self, random, replisomes, base, time_step): rates = np.full( replisomes, stochasticRound(random, base * time_step), dtype=np.int64 ) return rates
[docs] def get_average_copy_number(self, tau, coords): """ Calculates the average copy number of a gene throughout the cell cycle given the location of the gene in coordinates. Args: tau (float): expected doubling time in minutes coords (int or ndarray[int]): chromosome coordinates of genes Returns: float or ndarray[float] (matches length of coords): average copy number of each gene expected at a doubling time, tau """ right_replichore_length = self.replichore_lengths[0] left_replichore_length = self.replichore_lengths[1] # Calculate the relative position of the gene along the chromosome # from its coordinate relative_pos = np.array(coords, float) relative_pos[coords > 0] = relative_pos[coords > 0] / right_replichore_length relative_pos[coords < 0] = -relative_pos[coords < 0] / left_replichore_length # Return the predicted average copy number n_avg_copy = 2 ** ( ((1 - relative_pos) * self.c_period_in_mins + self.d_period_in_mins) / tau ) return n_avg_copy