Source code for ecoli.analysis.multivariant.new_gene_translation_efficiency_heatmaps

"""
Plot one value per index via heatmap for
new_gene_expression_and_translation_efficiency variant.

Possible Plots:

- Percent of sims that successfully reached a given generation number
- Average doubling time
- Average cell volume, mass, dry cell mass, mRNA mass, protein mass
- Average translation efficiency, weighted by cistron count
- Average mRNA count, monomer count, mRNA mass fraction, protein mass fraction,
  RNAP portion, and ribosome portion for a capacity gene to measure burden on
  overall host expression
- Average new gene copy number
- Average new gene mRNA count
- Average new gene mRNA mass fraction
- Average new gene mRNA counts fraction
- Average new gene NTP mass fraction
- Average new gene protein count
- Average new gene protein mass fraction
- Average new gene protein counts fraction
- Average new gene initialization rate for RNAP and ribosomes
- Average new gene initialization probabilities for RNAP and ribosomes
- Average count and portion of new gene ribosome initialization events per time
  step
- Average number and proportion of RNAP on new genes at a given time step
- Average number and proportion of ribosomes on new gene mRNAs at a given time
  step
- Average number and proportion of RNAP making rRNAs at a given time step
- Average number and proportion of RNAP and ribosomes making RNAP subunits at
  a given time step
- Average number and proportion of RNAP and ribosomes making ribosomal proteins
  at a given time step
- Average fraction of time new gene is overcrowded by RNAP and Ribosomes
- Average overcrowding probability ratio for new gene RNA synthesis and
  polypeptide initiation
- Average max_p probabilities for RNA synthesis and polypeptide initiation
- Average number of overcrowded genes for RNAP and Ribosomes
- Average number of total, active, and free ribosomes
- Average number of ribosomes initialized at each time step
- Average number of total active, and free RNA polymerases
- Average ppGpp concentration
- Average rate of glucose consumption
- Average new gene monomer yields - per hour and per fg of glucose
"""

import itertools

from duckdb import DuckDBPyConnection
import numpy as np
import numpy.typing as npt
from matplotlib import pyplot as plt
import math
import pyarrow as pa
from unum.units import fg
from typing import Any, Callable, cast, Optional, TYPE_CHECKING
from tqdm import tqdm

from ecoli.variants.new_gene_internal_shift import get_new_gene_ids_and_indices
from ecoli.library.parquet_emitter import (
    get_field_metadata,
    ndarray_to_ndlist,
    ndlist_to_ndarray,
    open_arbitrary_sim_data,
    read_stacked_columns,
)
from ecoli.library.schema import bulk_name_to_idx
from wholecell.utils.plotting_tools import export_figure, heatmap
from wholecell.utils import units

import pickle

if TYPE_CHECKING:
    from reconstruction.ecoli.fit_sim_data_1 import SimulationDataEcoli

FONT_SIZE = 9

"""
Dashboard Flag

- 0: Separate Only (Each plot is its own file)
- 1: Dashboard Only (One file with all plots)
- 2: Both Dashboard and Separate
"""
DASHBOARD_FLAG = 2

"""
Standard Deviations Flag

- True: Plot an additional copy of all plots with standard deviation displayed
  insted of the average
- False: Plot no additional plots
"""
STD_DEV_FLAG = True

"""
Count number of sims that reach this generation (remember index 7 
corresponds to generation 8)
"""
COUNT_INDEX = 23
# COUNT_INDEX = 2 ### TODO: revert back after developing plot locally

"""
Plot data from generations [MIN_CELL_INDEX, MAX_CELL_INDEX)
Note that early generations may not be representative of dynamics 
due to how they are initialized
"""
MIN_CELL_INDEX = 16
# # MIN_CELL_INDEX = 1 ### TODO: revert back after developing plot locally
# MIN_CELL_INDEX = 0
MAX_CELL_INDEX = 24

"""
Specify which subset of heatmaps should be made
Completed_gens heatmap is always made, because it is used to
create the other heatmaps, and should not be included here.
The order listed here will be the order of the heatmaps in the
dashboard plot.
"""
HEATMAPS_TO_MAKE_LIST = [
    "completed_gens_heatmap",
    "doubling_times_heatmap",
    "cell_mass_heatmap",
    "cell_dry_mass_heatmap",
    "cell_volume_heatmap",
    "ppgpp_concentration_heatmap",
    "rnap_crowding_heatmap",
    "ribosome_crowding_heatmap",
    "cell_mRNA_mass_heatmap",
    "cell_protein_mass_heatmap",
    "rnap_counts_heatmap",
    "ribosome_counts_heatmap",
    "new_gene_mRNA_counts_heatmap",
    "new_gene_monomer_counts_heatmap",
    "new_gene_copy_number_heatmap",
    "new_gene_rnap_init_rate_heatmap",
    "new_gene_ribosome_init_rate_heatmap",
    "new_gene_mRNA_mass_fraction_heatmap",
    "new_gene_monomer_mass_fraction_heatmap",
    "new_gene_rnap_time_overcrowded_heatmap",
    "new_gene_ribosome_time_overcrowded_heatmap",
    "new_gene_mRNA_counts_fraction_heatmap",
    "new_gene_monomer_counts_fraction_heatmap",
    "active_rnap_counts_heatmap",
    "active_ribosome_counts_heatmap",
    "new_gene_rnap_counts_heatmap",
    "new_gene_rnap_portion_heatmap",
    "rrna_rnap_counts_heatmap",
    "rrna_rnap_portion_heatmap",
    "rnap_subunit_rnap_counts_heatmap",
    "rnap_subunit_rnap_portion_heatmap",
    "rnap_subunit_ribosome_counts_heatmap",
    "rnap_subunit_ribosome_portion_heatmap",
    "ribosomal_protein_rnap_counts_heatmap",
    "ribosomal_protein_rnap_portion_heatmap",
    "ribosomal_protein_ribosome_counts_heatmap",
    "ribosomal_protein_ribosome_portion_heatmap",
    "new_gene_ribosome_counts_heatmap",
    "new_gene_ribosome_portion_heatmap",
    "weighted_avg_translation_efficiency_heatmap",
    "protein_init_prob_max_p_heatmap",
    "new_gene_protein_init_prob_max_p_target_ratio_heatmap",
    "new_gene_target_protein_init_prob_heatmap",
    "new_gene_actual_protein_init_prob_heatmap",
    "rna_synth_prob_max_p_heatmap",
    "new_gene_rna_synth_prob_max_p_target_ratio_heatmap",
    "new_gene_target_rna_synth_prob_heatmap",
    "new_gene_actual_rna_synth_prob_heatmap",
    "capacity_gene_mRNA_counts_heatmap",
    "capacity_gene_monomer_counts_heatmap",
    "capacity_gene_rnap_portion_heatmap",
    "capacity_gene_ribosome_portion_heatmap",
    "capacity_gene_mRNA_mass_fraction_heatmap",
    "capacity_gene_monomer_mass_fraction_heatmap",
    "capacity_gene_mRNA_counts_fraction_heatmap",
    "capacity_gene_monomer_counts_fraction_heatmap",
    "free_rnap_counts_heatmap",
    "free_ribosome_counts_heatmap",
    "rnap_ribosome_counts_ratio_heatmap",
    "ribosome_init_events_heatmap",
    "new_gene_ribosome_init_events_heatmap",
    "new_gene_ribosome_init_events_portion_heatmap",
    "new_gene_yield_per_glucose",
    "new_gene_yield_per_hour",
    "glucose_consumption_rate",
    "new_gene_mRNA_NTP_fraction_heatmap",
]

capacity_gene_monomer_ids = ["EG10544-MONOMER[m]"]
# capacity_gene_monomer_id = ["EG11036-MONOMER[c]"]


[docs] def get_mean_and_std_matrices( conn: DuckDBPyConnection, variant_mapping: dict[int, tuple[int, int]], variant_matrix_shape: tuple[int, int], history_sql: str, columns: list[str], projections: Optional[list[str]] = None, remove_first: bool = False, func: Optional[Callable] = None, order_results: bool = False, custom_sql: Optional[str] = None, post_func: Optional[Callable] = None, num_digits_rounding: Optional[int] = None, default_value: Optional[Any] = None, ) -> tuple[np.ndarray, np.ndarray]: """ Reads one or more columns and calculates mean and std. dev. for each variant. If no custom SQL query is provided, this defaults to averaging per cell, then calculating the averages and standard deviations of all cells per variant. Args: conn: DuckDB connection variant_mapping: Mapping of variant IDs to row and column in matrix of new gene translation efficiency and expression factor variants variant_matrix_shape: Number of rows and columns in variant matrix history_sql: SQL subquery from :py:func:`ecoli.library.parquet_emitter.get_dataset_sql` columns, projections, remove_first, func, order_results: See :py:func:`ecoli.library.parquet_emitter.read_stacked_columns` custom_sql: SQL string containing a placeholder with name ``subquery`` where the result of read_stacked_columns will be placed. Final query result must only have two columns in order: ``variant`` and a value for each variant. If not provided, defaults to average of averages post_func: Function that is called on PyArrow table resulting from query. Should return a PyArrow table with exactly three columns: ``variant`` for the variant IDs, ``mean`` for some mean aggregate value (can be N-D list column), and ``std`` for some standard deviation aggregate. num_digits_rounding: Number of decimal places to round to default_value: Default value to put in output variant matrices if variant ID not included in query result (e.g. if variant failed in first generation and had no completed sims) new_gene_NTP_fraction: Set to True for NTP fraction heatmap so query output is properly handled Returns: Tuple of Numpy matrices with first two dimensions ``variant_matrix_shape``. Each cell in first matrix has the mean for that variant. Each cell in the second matrix has the std. dev. for that variant. These values can be Numpy arrays instead of scalar values (e.g. when calculating aggregates for many genes at once), in which case the matrices have shapes ``variant_matrix_shape + (num_genes,)`` """ subquery = read_stacked_columns( history_sql=history_sql, columns=columns, projections=projections, remove_first=remove_first, func=func, order_results=order_results, ) if custom_sql is None: if len(columns) > 1: raise RuntimeError( "Must provide custom SQL expression to handle " "multiple columns at once." ) custom_sql = f""" WITH avg_per_cell AS ( SELECT avg({columns[0]}) AS avg_col, experiment_id, variant FROM ({{subquery}}) GROUP BY experiment_id, variant, lineage_seed, generation, agent_id ) SELECT variant, avg(avg_col) AS mean, stddev(avg_col) AS std FROM avg_per_cell GROUP BY experiment_id, variant """ if post_func is None: data = conn.sql(custom_sql.format(subquery=subquery)).arrow() else: data = conn.sql(custom_sql.format(subquery=subquery)).arrow() data = post_func(data) if set(data.column_names) != {"variant", "mean", "std"}: raise RuntimeError( "post_func should return a PyArrow table with " "exactly three columns named `variant`, `mean`, and `std`" ) data = [(i["variant"], i["mean"], i["std"]) for i in data.to_pylist()] mean_matrix = [ [default_value for _ in range(variant_matrix_shape[1])] for _ in range(variant_matrix_shape[0]) ] std_matrix = [ [default_value for _ in range(variant_matrix_shape[1])] for _ in range(variant_matrix_shape[0]) ] for variant, mean, std in data: variant_row, variant_col = variant_mapping[variant] if mean is not None: if num_digits_rounding is not None: mean = np.round(mean, num_digits_rounding) mean_matrix[variant_row][variant_col] = mean if std is not None: if num_digits_rounding is not None: std = np.round(std, num_digits_rounding) std_matrix[variant_row][variant_col] = std return np.array(mean_matrix), np.array(std_matrix)
[docs] def get_mRNA_ids_from_monomer_ids( sim_data: "SimulationDataEcoli", target_monomer_ids: list[str] ) -> list[list[str]]: """ Map monomer IDs back to the mRNA IDs that they were translated from. Args: target_monomer_ids: IDs of the monomers to map to mRNA IDs Returns: List of mRNA ID lists, one for each monomer ID """ # Map protein ids to cistron ids monomer_ids = sim_data.process.translation.monomer_data["id"] cistron_ids = sim_data.process.translation.monomer_data["cistron_id"] monomer_to_cistron_id_dict = { monomer_id: cistron_id for monomer_id, cistron_id in zip(monomer_ids, cistron_ids) } target_cistron_ids = [ monomer_to_cistron_id_dict.get(monomer_id) for monomer_id in target_monomer_ids ] RNA_ids = sim_data.process.transcription.rna_data["id"] target_RNA_ids = [] # Map cistron ids to RNA ids for RNAP_cistron_id in target_cistron_ids: target_RNA_idx = sim_data.process.transcription.cistron_id_to_rna_indexes( RNAP_cistron_id ) target_RNA_ids.append(RNA_ids[target_RNA_idx].tolist()) return target_RNA_ids
[docs] def get_indexes( conn: DuckDBPyConnection, config_sql: str, index_type: str, ids: list[str] | list[list[str]], ) -> list[int | None] | list[list[int | None]]: """ Retrieve DuckDB indices of a given type for a set of IDs. Note that DuckDB lists are 1-indexed. Args: conn: DuckDB database connection config_sql: DuckDB SQL query for sim config data (see :py:func:`~ecoli.library.parquet_emitter.get_dataset_sql`) index_type: Type of indices to return (one of ``cistron``, ``RNA``, ``mRNA``, or ``monomer``) ids: List of IDs to get indices for (must be monomer IDs if ``index_type`` is ``monomer``, else mRNA IDs) Returns: List of requested indexes """ if index_type == "cistron": # Extract cistron indexes for each new gene cistron_idx_dict = { cis: i + 1 for i, cis in enumerate( get_field_metadata( conn, config_sql, "listeners__rnap_data__rna_init_event_per_cistron" ) ) } return [cistron_idx_dict.get(cistron) for cistron in ids] elif index_type == "RNA": # Extract RNA indexes for each new gene RNA_idx_dict = { rna: i + 1 for i, rna in enumerate( get_field_metadata( conn, config_sql, "listeners__rna_synth_prob__target_rna_synth_prob" ) ) } return [[RNA_idx_dict.get(rna_id) for rna_id in rna_ids] for rna_ids in ids] elif index_type == "mRNA": # Extract mRNA indexes for each new gene mRNA_idx_dict = { rna: i + 1 for i, rna in enumerate( get_field_metadata( conn, config_sql, "listeners__rna_counts__mRNA_counts" ) ) } return [[mRNA_idx_dict.get(rna_id) for rna_id in rna_ids] for rna_ids in ids] elif index_type == "monomer": # Extract protein indexes for each new gene monomer_idx_dict = { monomer: i + 1 for i, monomer in enumerate( get_field_metadata(conn, config_sql, "listeners__monomer_counts") ) } return [monomer_idx_dict.get(monomer_id) for monomer_id in ids] else: raise Exception( "Index type " + index_type + " has no instructions for data extraction." )
GENE_COUNTS_SQL = """ WITH unnested_counts AS ( SELECT unnest(gene_counts) AS gene_counts, generate_subscripts(gene_counts, 1) AS gene_idx, experiment_id, variant, lineage_seed, generation, agent_id FROM ({subquery}) ), avg_per_cell AS ( SELECT avg(gene_counts) AS avg_count, experiment_id, variant, gene_idx FROM unnested_counts GROUP BY experiment_id, variant, lineage_seed, generation, agent_id, gene_idx ), avg_per_variant AS ( SELECT log10(avg(avg_count) + 1) AS avg_count, log10(stddev(avg_count) + 1) AS std_count, experiment_id, variant, gene_idx FROM avg_per_cell GROUP BY experiment_id, variant, gene_idx ) SELECT variant, list(avg_count ORDER BY gene_idx) AS mean, list(std_count ORDER BY gene_idx) AS std, FROM avg_per_variant GROUP BY experiment_id, variant """ """ Generic SQL query for calculating average of a 1D-array column per cell, aggregates that per variant into ``log10(mean + 1)`` and ``log10(std + 1)`` columns. """
[docs] def get_gene_mass_prod_func( sim_data: "SimulationDataEcoli", index_type: str, gene_ids: list[str] | list[list[str]], ) -> Callable[[pa.Table], pa.Table]: """ Create a function to be passed as the ``post_func`` argument to :py:func:`~.get_mean_and_std_matrices` which multiplies the average and standard deviation 1D array columns by the mass of the gene ID for each element. Args: sim_data: Simulation data index_type: Either ``mRNA`` or ``monomer``. If ``mRNA``, ``gene_ids`` is list of lists of mRNA IDs, where inner lists correspond to mRNAs for each gene. Therefore, we sum the masses for the mRNAs of each inner list and multiply the input mean and std by this sum per gene. gene_ids: IDs of genes in the order they appear in the 1D arrays of the query result """ # Get mass for gene if index_type == "monomer": gene_masses = np.array( [ ( sim_data.getter.get_mass(gene_id) / sim_data.constants.n_avogadro ).asNumber(fg) for gene_id in gene_ids ] ) elif index_type == "mRNA": gene_masses = np.array( [ np.sum( [ ( sim_data.getter.get_mass(gene_id) / sim_data.constants.n_avogadro ).asNumber(fg) for gene_id in one_gene_ids ] ) for one_gene_ids in gene_ids ] ) else: raise RuntimeError("index_type must be monomer or mRNA.") def gene_mass_prod(variant_agg): avg_arr = ndlist_to_ndarray(variant_agg["mean"]) std_arr = ndlist_to_ndarray(variant_agg["std"]) return pa.table( { "variant": variant_agg["variant"], "mean": ndarray_to_ndlist(avg_arr * gene_masses), "std": ndarray_to_ndlist(std_arr * gene_masses), } ) return gene_mass_prod
[docs] def get_gene_count_fraction_sql( gene_indices: list[int] | list[list[int]], column: str, index_type: str ) -> str: """ Construct generic SQL query that gets the average per cell of a select set of indices from a 1D list column divided by the total of all elements per row of that list column, and aggregates those ratios per variant into mean and std columns. Args: gene_indices: Indices to extract from 1D list column to get ratios for column: Name of 1D list column index_type: Can either be ``monomer`` or ``mRNA``. For ``monomer``, function works exactly as described above. For ``mRNA``, ``gene_indices`` will be a list of lists of mRNA indices. This is because one gene can have to multiple mRNAs (transcription units). Therefore, we sum the elements corresponding to each gene before proceeding (see :py:func:`~.get_rnas_combined_as_genes_projection`). """ if index_type == "monomer": list_to_unnest = f"list_select({column}, {gene_indices})" else: list_to_unnest = ( "[" + ", ".join( [ f"list_sum(list_select({column}, {idx_for_one_gene}))" for idx_for_one_gene in gene_indices ] ) + "]" ) return f""" WITH list_counts AS ( SELECT {list_to_unnest} AS selected_counts, list_sum({column}) AS total_counts, experiment_id, variant, lineage_seed, generation, agent_id FROM ({{subquery}}) ), unnested_fracs AS ( SELECT unnest(selected_counts) / total_counts AS gene_fracs, generate_subscripts(selected_counts, 1) AS gene_idx, experiment_id, variant, lineage_seed, generation, agent_id FROM list_counts ), avg_per_cell AS ( SELECT avg(gene_fracs) AS avg_frac, experiment_id, variant, gene_idx FROM unnested_fracs GROUP BY experiment_id, variant, lineage_seed, generation, agent_id, gene_idx ), avg_per_variant AS ( SELECT experiment_id, variant, avg(avg_frac) AS avg_frac, stddev(avg_frac) AS std_frac, gene_idx FROM avg_per_cell GROUP BY experiment_id, variant, gene_idx ) SELECT variant, list(avg_frac ORDER BY gene_idx) AS mean, list(std_frac ORDER BY gene_idx) AS std, FROM avg_per_variant GROUP BY experiment_id, variant """
[docs] def get_new_gene_mRNA_NTP_fraction_sql( sim_data: "SimulationDataEcoli", new_gene_mRNA_idx: list[list[int]], ntp_ids: list[str], ) -> str: """ Construct SQL query that gets, for each NTP, the fraction used by the mRNAs of each new gene, averages that per cell, and aggregate those fractions per variant into mean and std columns where each row is a 2D list with shape ``(# NTPs, # new genes)``. Args: sim_data: Simulation data new_gene_mRNA_idx: List of lists of mRNA indices for each new gene ntp_ids: IDs for NTPs in same order that they appear in ``sim_data.process.transcription.rna_data["counts_ACGU"]`` """ # Determine number of NTPs per new gene mRNA and for all mRNAs all_gene_mRNA_ACGU = sim_data.process.transcription.rna_data[ "counts_ACGU" ].asNumber()[sim_data.process.transcription.rna_data["is_mRNA"]] # Strip location tags from NTP IDs so they can be used in SQL # identifiers without quoting ntp_ids = [ntp[:-3] for ntp in ntp_ids] # DuckDB list comprehension to calculate # of each NTP used by each mRNA ntp_projections = ", ".join( f""" [count_ntp[1] * count_ntp[2] for count_ntp in list_zip(listeners__rna_counts__mRNA_counts, {all_gene_mRNA_ACGU[:, ntp_idx].tolist()})] AS count_{ntp_id} """ for ntp_idx, ntp_id in enumerate(ntp_ids) ) # For each NTP, calculate fraction used in mRNAs for each new gene ntp_frac_projections = ", ".join( "[" + ", ".join( f"list_sum(list_select(count_{ntp_id}, {one_gene_idx})) / " f"list_sum(count_{ntp_id})" for one_gene_idx in new_gene_mRNA_idx ) + f"] AS frac_{ntp_id}" for ntp_id in ntp_ids ) # Unnest average NTP fraction per mRNA unnested_frac_projections = ( ", ".join( [f"unnest(frac_{ntp_id}) AS unnested_{ntp_id}_frac" for ntp_id in ntp_ids] ) + f", generate_subscripts(frac_{ntp_ids[0]}, 1) AS gene_idx" ) # Average NTP fraction per new gene first per cell, then per variant cell_avg_frac_projections = ", ".join( [f"avg(unnested_{ntp_id}_frac) AS avg_{ntp_id}_frac" for ntp_id in ntp_ids] ) variant_avg_frac_projections = ", ".join( [ f"avg(avg_{ntp_id}_frac) AS avg_{ntp_id}_frac, " f"stddev(avg_{ntp_id}_frac) AS std_{ntp_id}_frac" for ntp_id in ntp_ids ] ) # Compile new gene average NTP fractions into 2D list columns # with shape (# of NTPs, # of new genes) list_avg_frac_projections = ( "[" + ", ".join( [f"list(avg_{ntp_id}_frac ORDER BY gene_idx)" for ntp_id in ntp_ids] ) + "] AS mean" ) list_std_frac_projections = ( "[" + ", ".join( [f"list(std_{ntp_id}_frac ORDER BY gene_idx)" for ntp_id in ntp_ids] ) + "] AS std" ) return f""" WITH ntp_counts AS ( SELECT {ntp_projections}, experiment_id, variant, lineage_seed, generation, agent_id FROM ({{subquery}}) ), ntp_frac AS ( SELECT {ntp_frac_projections}, experiment_id, variant, lineage_seed, generation, agent_id FROM ntp_counts ), unnested_frac AS ( SELECT {unnested_frac_projections}, experiment_id, variant, lineage_seed, generation, agent_id FROM ntp_frac ), avg_per_cell AS ( SELECT {cell_avg_frac_projections}, experiment_id, variant, lineage_seed, generation, agent_id, gene_idx FROM unnested_frac GROUP BY experiment_id, variant, lineage_seed, generation, agent_id, gene_idx ), avg_per_variant AS ( SELECT experiment_id, variant, gene_idx, {variant_avg_frac_projections} FROM avg_per_cell GROUP BY experiment_id, variant, gene_idx ) SELECT variant, {list_avg_frac_projections}, {list_std_frac_projections} FROM avg_per_variant GROUP BY experiment_id, variant """
[docs] def avg_ratio_of_1d_arrays_sql(numerator: str, denominator: str) -> str: """ Create generic SQL query that calculates the average per cell of each element in two 1D list columns divided elementwise and aggregates those] ratios per variant into mean and std columns. Args: numerator: Name of 1D list column that will be numerator in ratio denominator: Name of 1D list column that will be denominator in ratio """ return f""" WITH unnested_data AS ( SELECT unnest({denominator}) AS denominator, unnest({numerator}) AS numerator, generate_subscripts({numerator}, 1) AS list_idx, experiment_id, variant, lineage_seed, generation, agent_id FROM ({{subquery}}) ), ratio_avg_per_cell AS ( SELECT avg(numerator / denominator) AS ratio_avg, experiment_id, variant, list_idx FROM unnested_data GROUP BY experiment_id, variant, lineage_seed, generation, agent_id, list_idx ), ratio_avg_per_variant AS ( SELECT avg(ratio_avg) AS ratio_avg, stddev(ratio_avg) AS ratio_std, list_idx, experiment_id, variant FROM ratio_avg_per_cell GROUP BY experiment_id, variant, list_idx ) SELECT variant, list(ratio_avg ORDER BY list_idx) AS mean, list(ratio_std ORDER BY list_idx) AS std, FROM ratio_avg_per_variant GROUP BY experiment_id, variant """
[docs] def avg_1d_array_sql(column: str) -> str: """ Create generic SQL query that calculates the average per cell of each element in a 1D array column and aggregates that per variant into mean and std columns. Args: column: Name of 1D list column to aggregate """ return f""" WITH unnested_counts AS ( SELECT unnest({column}) AS array_col, generate_subscripts({column}, 1) AS array_idx, experiment_id, variant, lineage_seed, generation, agent_id FROM ({{subquery}}) ), avg_per_cell AS ( SELECT avg(array_col) AS avg_array_col, experiment_id, variant, lineage_seed, generation, agent_id, array_idx FROM unnested_counts GROUP BY experiment_id, variant, lineage_seed, generation, agent_id, array_idx ), avg_per_variant AS ( SELECT avg(avg_array_col) AS avg_array_col, stddev(avg_array_col) AS std_array_col, experiment_id, variant, array_idx FROM avg_per_cell GROUP BY experiment_id, variant, array_idx ) SELECT variant, list(avg_array_col ORDER BY array_idx) AS mean, list(std_array_col ORDER BY array_idx) AS std FROM avg_per_variant GROUP BY experiment_id, variant """
[docs] def avg_sum_1d_array_sql(column: str) -> str: """ Create generic SQL query that calculates the average per cell of the sum of elements in a 1D array column and aggregates that per variant into mean and std columns. Args: column: Name of 1D list column to aggregate """ return f""" WITH avg_per_cell AS ( SELECT avg(list_sum({column})) AS avg_sum, experiment_id, variant, lineage_seed, generation, agent_id FROM ({{subquery}}) GROUP BY experiment_id, variant, lineage_seed, generation, agent_id ) SELECT variant, avg(avg_sum) AS mean, stddev(avg_sum) AS std FROM avg_per_cell GROUP BY experiment_id, variant """
[docs] def avg_1d_array_over_scalar_sql(array_column: str, scalar_column: str) -> str: """ Create generic SQL query that calculates the average per cell of each element in a 1D array column divided by a scalar column, and aggregates those ratios per variant into mean and std columns. Args: array_column: Name of 1D list column to aggregate scalar_column: Name of scalar column to divide ``array_column`` cell averages by """ return f""" WITH unnested_counts AS ( SELECT unnest({array_column}) AS array_col, generate_subscripts({array_column}, 1) AS array_idx, {scalar_column} AS scalar_col, experiment_id, variant, lineage_seed, generation, agent_id FROM ({{subquery}}) ), avg_per_cell AS ( SELECT avg(array_col / scalar_col) AS avg_ratio, experiment_id, variant, lineage_seed, generation, agent_id, array_idx FROM unnested_counts GROUP BY experiment_id, variant, lineage_seed, generation, agent_id, array_idx ), avg_per_variant AS ( SELECT avg(avg_ratio) AS avg_ratio, stddev(avg_ratio) AS std_ratio, experiment_id, variant, array_idx FROM avg_per_cell GROUP BY experiment_id, variant, array_idx ) SELECT variant, list(avg_ratio ORDER BY array_idx) AS mean, list(std_ratio ORDER BY array_idx) AS std FROM avg_per_variant GROUP BY experiment_id, variant """
[docs] def avg_sum_1d_array_over_scalar_sql(array_column: str, scalar_column: str) -> str: """ Create generic SQL query that calculates the average per cell of the sum of elements in a 1D array column divided by a scalar column, and aggregates those ratios per variant as mean and std columns. Args: array_column: Name of 1D list column to aggregate scalar_column: Name of scalar column to divide ``array_column`` cell averages by """ return f""" WITH avg_per_cell AS ( SELECT avg(list_sum({array_column}) / {scalar_column}) AS avg_ratio, experiment_id, variant, lineage_seed, generation, agent_id FROM ({{subquery}}) GROUP BY experiment_id, variant, lineage_seed, generation, agent_id ) SELECT variant, avg(avg_ratio) AS mean, stddev(avg_ratio) AS std FROM avg_per_cell GROUP BY experiment_id, variant """
[docs] def get_rnap_counts_projection( sim_data: "SimulationDataEcoli", bulk_ids: list[str] ) -> str: """ Return SQL projection to selectively read bulk inactive RNAP count. Args: sim_data: Simulation data bulk_ids: List of all bulk IDs in order """ rnap_idx = bulk_name_to_idx(sim_data.molecule_ids.full_RNAP, bulk_ids) return f"bulk[{rnap_idx + 1}] AS bulk"
[docs] def get_ribosome_counts_projection( sim_data: "SimulationDataEcoli", bulk_ids: list[str] ) -> str: """ Return SQL projection to selectively read bulk inactive ribosome count (defined as minimum of free 30S and 50S subunits at any given moment) Args: sim_data: Simulation data bulk_ids: List of all bulk IDs in order """ ribosome_idx = cast( np.ndarray, bulk_name_to_idx( [ sim_data.molecule_ids.s30_full_complex, sim_data.molecule_ids.s50_full_complex, ], bulk_ids, ), ) return f"least(bulk[{ribosome_idx[0] + 1}], bulk[{ribosome_idx[1] + 1}]) AS bulk"
[docs] def get_overcrowding_sql(target_col: str, actual_col: str) -> str: """ Create generic SQL query that calculates for average number of genes for each cell that are overcrowded, then aggregate that per variant into mean and std columns. Args: target_col: Name of 1D list column with target values actual_col: Name of 1D list column with actual values. If the per-cell average of an element in ``target_col`` is greater than the per-cell average of the corresponding element in ``actual_col``, we say that the gene for that element is overcrowded. """ return f""" WITH avg_per_cell AS ( SELECT avg(list_sum([(i[1] > i[2])::UTINYINT for i in list_zip({target_col}, {actual_col})])) AS overcrowded, experiment_id, variant, lineage_seed, generation, agent_id FROM ({{subquery}}) GROUP BY experiment_id, variant, lineage_seed, generation, agent_id ) SELECT variant, avg(overcrowded) AS mean, stddev(overcrowded) AS std, FROM avg_per_cell GROUP BY experiment_id, variant """
[docs] def get_rnas_combined_as_genes_projection( column: str, rna_idx: list[list[int]], name: str, cast_type: Optional[str] = None ): """ Create generic SQL projection that evaluates to a list column where each element is the sum of a subset of elements from the original list column. This is mainly used to sum up all RNA data that corresponds to a single gene / cistron / monomer. """ if cast_type is not None: cast_type = f"::{cast_type}" else: cast_type = "" sum_projections = ", ".join( [ f"list_sum(list_select({column}, {rna_idx_for_one_gene}){cast_type})" for rna_idx_for_one_gene in rna_idx ] ) return f"[{sum_projections}] AS {name}"
[docs] def get_variant_mask( conn: DuckDBPyConnection, config_sql: str, variant_to_row_col: dict[int, tuple[int, int]], variant_matrix_shape: tuple[int, int], ) -> npt.NDArray[np.bool_]: """ Get a boolean matrix where the rows represent the different translation efficiencies and the columns represent the different expression factors that were used to create variants. The matrix is True for each combination that was actually simulated and False otherwise. """ variants = conn.sql( f"SELECT DISTINCT variant AS variant FROM ({config_sql})" ).fetchnumpy()["variant"] variant_mask = np.zeros(variant_matrix_shape, np.bool_) for variant in variants: variant_mask[*variant_to_row_col[variant]] = True return variant_mask
[docs] def plot_heatmaps( heatmap_data, heatmap_details, new_gene_cistron_ids, ntp_ids, capacity_gene_common_names, total_heatmaps_to_make, is_dashboard, variant_mask, heatmap_x_label, heatmap_y_label, new_gene_expression_factors, new_gene_translation_efficiency_values, summary_statistic, figsize_x, figsize_y, plotOutDir, plot_suffix, ): """ Plots all heatmaps in order given by HEATMAPS_TO_MAKE_LIST. Args: is_dashboard: Boolean flag for whether we are creating a dashboard of heatmaps or a number of individual heatmaps variant_mask: np.array of dimension (len(new_gene_translation_efficiency_values), len(new_gene_expression_factors)) with entries set to True if variant was run, False otherwise. heatmap_x_label: Label for x axis of heatmap heatmap_y_label: Label for y axis of heatmap new_gene_expression_factors: New gene expression factors used in these variants new_gene_translation_efficiency_values: New gene translation efficiency values used in these variants summary_statistic: Specifies whether average (mean) or standard deviation (std_dev) should be displayed on the heatmaps figsize_x: Horizontal size of each heatmap figsize_y: Vertical size of each heatmap plotOutDir: Output directory for plots plot_suffix: Suffix to add to plot file names, usually specifying which generations were plotted """ if summary_statistic == "std_dev": plot_suffix = plot_suffix + "_std_dev" elif summary_statistic != "mean": raise Exception( "mean and std_dev are the only currently supported" " summary statistics" ) if is_dashboard: # Determine dashboard layout if total_heatmaps_to_make > 3: dashboard_ncols = 4 dashboard_nrows = math.ceil((total_heatmaps_to_make + 1) / dashboard_ncols) else: dashboard_ncols = total_heatmaps_to_make + 1 dashboard_nrows = 1 fig, axs = plt.subplots( nrows=dashboard_nrows, ncols=dashboard_ncols, figsize=(figsize_y * dashboard_ncols, figsize_x * dashboard_nrows), layout="constrained", ) if dashboard_nrows == 1: axs = np.reshape(axs, (1, dashboard_ncols)) row_ax = 0 col_ax = 0 for h in HEATMAPS_TO_MAKE_LIST: if not heatmap_details[h]["is_nonstandard_plot"]: stop_index = 1 title_addition = "" filename_addition = "" if heatmap_details[h]["is_new_gene_heatmap"]: stop_index = len(new_gene_cistron_ids) elif heatmap_details[h]["is_capacity_gene_heatmap"]: stop_index = len(capacity_gene_common_names) for i in range(stop_index): if heatmap_details[h]["is_new_gene_heatmap"]: title_addition = f": {new_gene_cistron_ids[i][:-4]}" filename_addition = f"_{new_gene_cistron_ids[i][:-4]}" curr_heatmap_data = heatmap_data[h][summary_statistic][:, :, i] elif heatmap_details[h]["is_capacity_gene_heatmap"]: title_addition = f": {capacity_gene_common_names[i]}" filename_addition = f"_{capacity_gene_common_names[i]}" curr_heatmap_data = heatmap_data[h][summary_statistic][:, :, i] else: curr_heatmap_data = heatmap_data[h][summary_statistic] title = heatmap_details[h]["plot_title"] + title_addition if summary_statistic == "std_dev": title = f"Std Dev: {title}" if is_dashboard: curr_ax = axs[row_ax][col_ax] else: fig, curr_ax = plt.subplots(1, 1, figsize=(figsize_x, figsize_y)) heatmap( curr_ax, variant_mask, curr_heatmap_data, heatmap_data["completed_gens_heatmap"]["mean"], new_gene_expression_factors, new_gene_translation_efficiency_values, heatmap_x_label, heatmap_y_label, title, heatmap_details[h]["box_text_size"], ) if not is_dashboard: fig.tight_layout() export_figure(plt, plotOutDir, h + filename_addition + plot_suffix) plt.close(fig) else: col_ax += 1 if col_ax == dashboard_ncols: col_ax = 0 row_ax += 1 elif h == "new_gene_mRNA_NTP_fraction_heatmap": for cistron_idx, cistron_id in enumerate(new_gene_cistron_ids): for ntp_idx, ntp_id in enumerate(ntp_ids): title = ( f"{heatmap_details[h]['plot_title']} {ntp_id[:-3]}" f" Fraction: {cistron_id[:-4]}" ) if summary_statistic == "std_dev": title = f"Std Dev: {title}" if is_dashboard: curr_ax = axs[row_ax][col_ax] else: fig, curr_ax = plt.subplots( 1, 1, figsize=(figsize_x, figsize_y) ) heatmap( curr_ax, variant_mask, heatmap_data[h][summary_statistic][:, :, ntp_idx, cistron_idx], heatmap_data["completed_gens_heatmap"]["mean"], new_gene_expression_factors, new_gene_translation_efficiency_values, heatmap_x_label, heatmap_y_label, title, heatmap_details[h]["box_text_size"], ) if not is_dashboard: fig.tight_layout() export_figure( plt, plotOutDir, f"new_gene_mRNA_{ntp_id[:-3]}_fraction_heatmap" + filename_addition + plot_suffix, ) plt.close(fig) else: col_ax += 1 if col_ax == dashboard_ncols: col_ax = 0 row_ax += 1 else: raise Exception( f"Heatmap {h} is neither a standard plot nor a" f" nonstandard plot that has specific instructions for" f" plotting." ) if is_dashboard: fig.tight_layout() export_figure( plt, plotOutDir, f"00_new_gene_exp_trl_eff_dashboard{plot_suffix}" ) ## TODO: Revert back after running new plots on Sherlock sims plt.close("all")
[docs] def plot( params: dict[str, Any], conn: DuckDBPyConnection, history_sql: str, config_sql: str, sim_data_dict: dict[str, dict[int, str]], validation_data_paths: list[str], outdir: str, variant_metadata: dict[str, dict[int, Any]], variant_names: dict[str, str], ): """ Create either a single multi-heatmap plot or 1+ separate heatmaps of data for a grid of new gene variant simulations with varying expression and translation efficiencies. """ # Filter to specified generation range history_sql = ( f"FROM ({history_sql}) WHERE generation >= {MIN_CELL_INDEX}" f" AND generation < {MAX_CELL_INDEX}" ) config_sql = ( f"FROM ({config_sql}) WHERE generation >= {MIN_CELL_INDEX}" f" AND generation < {MAX_CELL_INDEX}" ) # Define baseline variant (ID = 0) as 0 new gene expr. and trans. eff. experiment_id = next(iter(variant_metadata.keys())) variant_metadata = variant_metadata[experiment_id] variant_metadata[0] = {"exp_trl_eff": {"exp": 0, "trl_eff": 0}} with open_arbitrary_sim_data(sim_data_dict) as f: sim_data = pickle.load(f) # Determine new gene cistron and monomer ids (new_gene_cistron_ids, _, new_gene_monomer_ids, _) = get_new_gene_ids_and_indices( sim_data ) # Assuming we ran a workflow with `n` new gene expression factors # and `m` new gene translation efficiency values, create an `n * m` # grid sorted in ascending order along both axes and calculate # mapping from variant numbers to row and column indices in grid assert "exp_trl_eff" in next(iter(variant_metadata.values())), ( "This plot is intended to be run on simulations where the" " new gene expression-translation efficiency variant was " "enabled, but no parameters for this variant were found." ) new_gene_expression_factors = sorted( set( variant_params["exp_trl_eff"]["exp"] for variant_params in variant_metadata.values() ) ) new_gene_translation_efficiency_values = sorted( set( variant_params["exp_trl_eff"]["trl_eff"] for variant_params in variant_metadata.values() ) ) variant_matrix_shape = ( len(new_gene_translation_efficiency_values), len(new_gene_expression_factors), ) variant_to_row_col = { variant: ( new_gene_translation_efficiency_values.index( variant_params["exp_trl_eff"]["trl_eff"] ), new_gene_expression_factors.index(variant_params["exp_trl_eff"]["exp"]), ) for variant, variant_params in variant_metadata.items() } variant_mask = get_variant_mask( conn, config_sql, variant_to_row_col, variant_matrix_shape ) bulk_ids = get_field_metadata(conn, config_sql, "bulk") ntp_ids = list(sim_data.ntp_code_to_id_ordered.values()) # Get indices for data extraction rnap_subunit_mRNA_ids = get_mRNA_ids_from_monomer_ids( sim_data, sim_data.molecule_groups.RNAP_subunits ) rnap_subunit_mRNA_indexes = list( set( itertools.chain.from_iterable( cast( list[list[int]], get_indexes(conn, config_sql, "mRNA", rnap_subunit_mRNA_ids), ) ) ) ) rnap_subunit_monomer_indexes = get_indexes( conn, config_sql, "monomer", sim_data.molecule_groups.RNAP_subunits ) ribosomal_mRNA_ids = get_mRNA_ids_from_monomer_ids( sim_data, sim_data.molecule_groups.ribosomal_proteins ) ribosomal_mRNA_indexes = list( set( itertools.chain.from_iterable( cast( list[list[int]], get_indexes(conn, config_sql, "mRNA", ribosomal_mRNA_ids), ) ) ) ) ribosomal_monomer_indexes = get_indexes( conn, config_sql, "monomer", sim_data.molecule_groups.ribosomal_proteins ) cistron_ids = get_field_metadata( conn, config_sql, "listeners__rna_counts__full_mRNA_cistron_counts" ) capacity_gene_mRNA_ids = get_mRNA_ids_from_monomer_ids( sim_data, capacity_gene_monomer_ids ) capacity_gene_mRNA_indexes = cast( list[list[int]], get_indexes(conn, config_sql, "mRNA", capacity_gene_mRNA_ids) ) capacity_gene_monomer_indexes = cast( list[int], get_indexes(conn, config_sql, "monomer", capacity_gene_monomer_ids) ) capacity_gene_common_names = [ sim_data.common_names.get_common_name(monomer_id[:-3]) for monomer_id in capacity_gene_monomer_ids ] # Convert cistron IDs to RNA IDs RNA_ids = sim_data.process.transcription.rna_data["id"] new_gene_mRNA_ids = [] for cistron_id in new_gene_cistron_ids: target_RNA_idx = sim_data.process.transcription.cistron_id_to_rna_indexes( cistron_id ) new_gene_mRNA_ids.append(RNA_ids[target_RNA_idx]) new_gene_mRNA_indexes = cast( list[list[int]], get_indexes(conn, config_sql, "mRNA", new_gene_mRNA_ids) ) new_gene_monomer_indexes = cast( list[int], get_indexes(conn, config_sql, "monomer", new_gene_monomer_ids) ) new_gene_RNA_indexes = cast( list[list[int]], get_indexes(conn, config_sql, "RNA", new_gene_mRNA_ids) ) new_gene_cistron_indexes = cast( list[int], get_indexes(conn, config_sql, "cistron", new_gene_cistron_ids) ) # Determine glucose index in exchange fluxes external_molecule_ids = np.array( get_field_metadata( conn, config_sql, "listeners__fba_results__external_exchange_fluxes" ) ) if "GLC[p]" not in external_molecule_ids: print("This plot only runs when glucose is the carbon source.") return glucose_idx = np.where(external_molecule_ids == "GLC[p]")[0][0] + 1 flux_scaling_factor = ( sim_data.getter.get_mass("GLC[p]") * (units.mmol / units.g / units.h) * units.fg # Flux * dry mass units ).asNumber() # Get normalized translation efficiency for all mRNAs trl_effs = sim_data.process.translation.translation_efficiencies_by_monomer trl_eff_ids = sim_data.process.translation.monomer_data["cistron_id"] mRNA_cistron_idx_dict = {rna: i for i, rna in enumerate(cistron_ids)} trl_eff_id_mapping = np.array([mRNA_cistron_idx_dict[id] for id in trl_eff_ids]) ordered_trl_effs = trl_effs[np.argsort(trl_eff_id_mapping)] """ Details needed to create all possible heatmaps key (string): Heatmap identifier, will also be used in file name if plots are saved separately is_new_gene_heatmap (bool): If True, one heatmap will be made for each new gene is_nonstandard_plotting (bool): False if only one plot (or one plot per new gene) needs to be made. True in all other cases. data_table (string): Table to get data from. data_column (string): Column in table to get data from. default_value (int): Value to use in heatmap if no data is extracted for this parameter combination. remove_first (bool): If True, removes the first column of data from each cell (which might be set to a default value in some cases) num_digits_rounding (int): Specifies how to round the number displayed in each sequare of the heatmap box_text_size (string): Specifies font size of number displayed in each square of the heatmap plot_title (string): Title of heatmap to display """ # Specify unique fields and non-default values here heatmap_details: dict[str, dict] = { "completed_gens_heatmap": { "columns": ["time"], "custom_sql": f""" WITH max_gen_per_seed AS ( SELECT max(generation) AS max_generation, experiment_id, variant FROM ({{subquery}}) GROUP BY experiment_id, variant, lineage_seed ) -- Boolean values must be explicitly cast to numeric for aggregation SELECT variant, avg((max_generation = {COUNT_INDEX})::BIGINT) AS mean, stddev((max_generation = {COUNT_INDEX})::BIGINT) AS std FROM max_gen_per_seed GROUP BY experiment_id, variant """, "plot_title": f"Percentage of Sims That Reached Generation {COUNT_INDEX}", }, "doubling_times_heatmap": { "columns": ["time"], "custom_sql": """ WITH doubling_times AS ( SELECT (max(time) - min(time)) / 60 AS doubling_time, experiment_id, variant, lineage_seed, generation, agent_id FROM ({subquery}) GROUP BY experiment_id, variant, lineage_seed, generation, agent_id ) SELECT variant, avg(doubling_time) AS mean, stddev(doubling_time) AS std FROM doubling_times GROUP BY experiment_id, variant """, "plot_title": "Doubling Time (minutes)", }, "cell_volume_heatmap": { "columns": ["listeners__mass__volume"], "plot_title": "Cell Volume (fL)", }, "cell_mass_heatmap": { "columns": ["listeners__mass__cell_mass"], "plot_title": "Cell Mass (fg)", }, "cell_dry_mass_heatmap": { "columns": ["listeners__mass__dry_mass"], "plot_title": "Dry Cell Mass (fg)", }, "cell_mRNA_mass_heatmap": { "columns": ["listeners__mass__mRna_mass"], "plot_title": "Total mRNA Mass (fg)", }, "cell_protein_mass_heatmap": { "columns": ["listeners__mass__protein_mass"], "plot_title": "Total Protein Mass (fg)", "box_text_size": "x-small", "num_digits_rounding": 0, }, "ppgpp_concentration_heatmap": { "columns": ["listeners__growth_limits__ppgpp_conc"], "plot_title": "ppGpp Concentration (uM)", "remove_first": True, "num_digits_rounding": 1, }, "ribosome_init_events_heatmap": { "columns": ["listeners__ribosome_data__did_initialize"], "plot_title": "Ribosome Init Events Per Time Step", "num_digits_rounding": 0, "box_text_size": "x-small", }, "rnap_counts_heatmap": { "columns": ["bulk", "listeners__unique_molecule_counts__active_RNAP"], "plot_title": "RNA Polymerase (RNAP) Counts", "num_digits_rounding": 0, "box_text_size": "x-small", "projections": [ get_rnap_counts_projection(sim_data, bulk_ids), "listeners__unique_molecule_counts__active_RNAP", ], "custom_sql": """ WITH total_counts AS ( SELECT avg(bulk + listeners__unique_molecule_counts__active_RNAP) AS rnap_counts, experiment_id, variant FROM ({subquery}) GROUP BY experiment_id, variant, lineage_seed, generation, agent_id ) SELECT variant, avg(rnap_counts) AS mean, stddev(rnap_counts) AS std FROM total_counts GROUP BY experiment_id, variant """, }, "ribosome_counts_heatmap": { "columns": ["bulk", "listeners__unique_molecule_counts__active_ribosome"], "plot_title": "Active Ribosome Counts", "num_digits_rounding": 0, "box_text_size": "x-small", "projections": [ get_ribosome_counts_projection(sim_data, bulk_ids), "listeners__unique_molecule_counts__active_ribosome", ], "custom_sql": """ WITH total_counts AS ( SELECT avg(bulk + listeners__unique_molecule_counts__active_ribosome) AS ribosome_counts, experiment_id, variant FROM ({subquery}) GROUP BY experiment_id, variant, lineage_seed, generation, agent_id ) SELECT variant, avg(ribosome_counts) AS mean, stddev(ribosome_counts) AS std FROM total_counts GROUP BY experiment_id, variant """, }, "active_ribosome_counts_heatmap": { "columns": ["listeners__unique_molecule_counts__active_ribosome"], "plot_title": "Active Ribosome Counts", "num_digits_rounding": 0, "box_text_size": "x-small", }, "active_rnap_counts_heatmap": { "columns": ["listeners__unique_molecule_counts__active_RNAP"], "plot_title": "Active RNA Polymerase (RNAP) Counts", "num_digits_rounding": 0, "box_text_size": "x-small", }, "free_ribosome_counts_heatmap": { "columns": ["bulk"], "plot_title": "Free Ribosome Counts", "num_digits_rounding": 0, "box_text_size": "x-small", "projections": [get_ribosome_counts_projection(sim_data, bulk_ids)], }, "free_rnap_counts_heatmap": { "columns": ["bulk"], "plot_title": "Free RNA Polymerase (RNAP) Counts", "num_digits_rounding": 0, "box_text_size": "x-small", "projections": [get_rnap_counts_projection(sim_data, bulk_ids)], }, "rnap_ribosome_counts_ratio_heatmap": { "columns": [ "bulk", "listeners__unique_molecule_counts__active_RNAP", "listeners__unique_molecule_counts__active_ribosome", ], "plot_title": "RNAP Counts / Ribosome Counts", "num_digits_rounding": 4, "box_text_size": "x-small", "projections": [ get_rnap_counts_projection(sim_data, bulk_ids) + "_rnap, " + get_ribosome_counts_projection(sim_data, bulk_ids) + "_ribosome", "listeners__unique_molecule_counts__active_RNAP", "listeners__unique_molecule_counts__active_ribosome", ], "custom_sql": """ WITH total_counts AS ( SELECT avg(bulk_ribosome + listeners__unique_molecule_counts__active_ribosome) AS ribosome_counts, avg(bulk_rnap + listeners__unique_molecule_counts__active_RNAP) AS rnap_counts, experiment_id, variant FROM ({subquery}) GROUP BY experiment_id, variant, lineage_seed, generation, agent_id ) SELECT variant, avg(rnap_counts / ribosome_counts) AS mean, stddev(rnap_counts / ribosome_counts) AS std, FROM total_counts GROUP BY experiment_id, variant """, }, "rnap_crowding_heatmap": { "columns": [ "listeners__rna_synth_prob__target_rna_synth_prob", "listeners__rna_synth_prob__actual_rna_synth_prob", ], "plot_title": "RNAP Crowding: # of TUs", "num_digits_rounding": 0, "box_text_size": "x-small", "custom_sql": get_overcrowding_sql( "listeners__rna_synth_prob__target_rna_synth_prob", "listeners__rna_synth_prob__actual_rna_synth_prob", ), }, "ribosome_crowding_heatmap": { "columns": [ "listeners__ribosome_data__target_prob_translation_per_transcript", "listeners__ribosome_data__actual_prob_translation_per_transcript", ], "plot_title": "Ribosome Crowding: # of Monomers", "num_digits_rounding": 0, "box_text_size": "x-small", "custom_sql": get_overcrowding_sql( "listeners__ribosome_data__target_prob_translation_per_transcript", "listeners__ribosome_data__actual_prob_translation_per_transcript", ), }, "rna_synth_prob_max_p_heatmap": { "columns": ["listeners__rna_synth_prob__max_p"], "plot_title": "RNA Synth Max Prob", "num_digits_rounding": 4, }, "protein_init_prob_max_p_heatmap": { "columns": ["listeners__ribosome_data__max_p"], "plot_title": "Protein Init Max Prob", "num_digits_rounding": 4, }, "weighted_avg_translation_efficiency_heatmap": { "columns": ["listeners__rna_counts__full_mRNA_cistron_counts"], "plot_title": "Translation Efficiency (Weighted Average)", "num_digits_rounding": 3, "custom_sql": f""" WITH unnested_counts AS ( SELECT unnest(listeners__rna_counts__full_mRNA_cistron_counts) AS array_col, generate_subscripts(listeners__rna_counts__full_mRNA_cistron_counts, 1) AS array_idx, unnest({ordered_trl_effs.tolist()}) AS trl_eff, experiment_id, variant, lineage_seed, generation, agent_id FROM ({{subquery}}) ), -- Materialize in memory so we can left join per-gene averages with -- sum of all gene averages without re-reading data avg_per_cell AS MATERIALIZED ( SELECT avg(array_col) AS avg_array_col, experiment_id, variant, lineage_seed, generation, agent_id, array_idx, any_value(trl_eff) AS trl_eff FROM unnested_counts GROUP BY experiment_id, variant, lineage_seed, generation, agent_id, array_idx ), total_avg_per_cell AS ( SELECT sum(avg_array_col) AS sum_avg_array_col, experiment_id, variant, lineage_seed, generation, agent_id FROM avg_per_cell GROUP BY experiment_id, variant, lineage_seed, generation, agent_id ), weighted_avg_per_cell AS ( SELECT sum(avg_array_col / sum_avg_array_col * trl_eff) AS weighted_avg_array_col, experiment_id, variant FROM avg_per_cell LEFT JOIN total_avg_per_cell USING (experiment_id, variant, lineage_seed, generation, agent_id) GROUP BY experiment_id, variant, lineage_seed, generation, agent_id ) SELECT variant, avg(weighted_avg_array_col) AS mean, stddev(weighted_avg_array_col) AS std FROM weighted_avg_per_cell GROUP BY experiment_id, variant """, }, "capacity_gene_mRNA_counts_heatmap": { "columns": ["listeners__rna_counts__mRNA_counts"], "plot_title": "Log(Capacity Gene mRNA Counts+1): ", "projections": [ get_rnas_combined_as_genes_projection( "listeners__rna_counts__mRNA_counts", capacity_gene_mRNA_indexes, "gene_counts", ) ], "custom_sql": GENE_COUNTS_SQL, }, "capacity_gene_monomer_counts_heatmap": { "columns": ["listeners__monomer_counts"], "plot_title": "Log(Capacity Gene Protein Counts+1): ", "projections": [ "list_select(listeners__monomer_counts, " f"{capacity_gene_monomer_indexes}) AS gene_counts" ], "custom_sql": GENE_COUNTS_SQL, }, "capacity_gene_mRNA_mass_fraction_heatmap": { "columns": [ "listeners__rna_counts__mRNA_counts", "listeners__mass__mRna_mass", ], "plot_title": "Capacity Gene mRNA Mass Fraction: ", "num_digits_rounding": 3, "projections": [ get_rnas_combined_as_genes_projection( "listeners__rna_counts__mRNA_counts", capacity_gene_mRNA_indexes, "gene_counts", ), "listeners__mass__mRna_mass AS mass", ], "custom_sql": avg_1d_array_over_scalar_sql("gene_counts", "mass"), "post_func": get_gene_mass_prod_func( sim_data, "mRNA", capacity_gene_mRNA_ids ), }, "capacity_gene_monomer_mass_fraction_heatmap": { "columns": ["listeners__monomer_counts", "listeners__mass__protein_mass"], "plot_title": "Capacity Gene Protein Mass Fraction: ", "num_digits_rounding": 3, "projections": [ "list_select(listeners__monomer_counts, " f"{capacity_gene_monomer_indexes}) AS gene_counts", "listeners__mass__protein_mass AS mass", ], "custom_sql": avg_1d_array_over_scalar_sql("gene_counts", "mass"), "post_func": get_gene_mass_prod_func( sim_data, "monomer", capacity_gene_monomer_ids ), }, "capacity_gene_mRNA_counts_fraction_heatmap": { "columns": ["listeners__rna_counts__mRNA_counts"], "plot_title": "Capacity Gene mRNA Counts Fraction: ", "num_digits_rounding": 3, "custom_sql": get_gene_count_fraction_sql( capacity_gene_mRNA_indexes, "listeners__rna_counts__mRNA_counts", "mRNA", ), }, "capacity_gene_monomer_counts_fraction_heatmap": { "columns": ["listeners__monomer_counts"], "plot_title": "Capacity Gene Protein Counts Fraction: ", "num_digits_rounding": 3, "custom_sql": get_gene_count_fraction_sql( capacity_gene_monomer_indexes, "listeners__monomer_counts", "monomer" ), }, "new_gene_copy_number_heatmap": { "columns": ["listeners__rna_synth_prob__gene_copy_number"], "plot_title": "New Gene Copy Number", "projections": [ "list_select(listeners__rna_synth_prob__gene_copy_number, " f"{new_gene_cistron_indexes}) AS gene_counts" ], "num_digits_rounding": 3, "custom_sql": avg_1d_array_sql("gene_counts"), }, "new_gene_mRNA_counts_heatmap": { "columns": ["listeners__rna_counts__mRNA_counts"], "plot_title": "Log(New Gene mRNA Counts+1)", "projections": [ get_rnas_combined_as_genes_projection( "listeners__rna_counts__mRNA_counts", new_gene_mRNA_indexes, "gene_counts", ) ], "custom_sql": GENE_COUNTS_SQL, }, "new_gene_monomer_counts_heatmap": { "columns": ["listeners__monomer_counts"], "plot_title": "Log(New Gene Protein Counts+1)", "projections": [ "list_select(listeners__monomer_counts, " f"{new_gene_monomer_indexes}) AS gene_counts" ], "custom_sql": GENE_COUNTS_SQL, }, "new_gene_mRNA_mass_fraction_heatmap": { "columns": [ "listeners__rna_counts__mRNA_counts", "listeners__mass__mRna_mass", ], "plot_title": "New Gene mRNA Mass Fraction", "projections": [ get_rnas_combined_as_genes_projection( "listeners__rna_counts__mRNA_counts", new_gene_mRNA_indexes, "gene_counts", ), "listeners__mass__mRna_mass AS mass", ], "custom_sql": avg_1d_array_over_scalar_sql("gene_counts", "mass"), "post_func": get_gene_mass_prod_func(sim_data, "mRNA", new_gene_mRNA_ids), }, "new_gene_mRNA_counts_fraction_heatmap": { "columns": ["listeners__rna_counts__mRNA_counts"], "plot_title": "New Gene mRNA Counts Fraction", "custom_sql": get_gene_count_fraction_sql( new_gene_mRNA_indexes, "listeners__rna_counts__mRNA_counts", "mRNA" ), }, "new_gene_mRNA_NTP_fraction_heatmap": { "columns": ["listeners__rna_counts__mRNA_counts"], "plot_title": "New Gene", "num_digits_rounding": 4, "box_text_size": "x-small", "custom_sql": get_new_gene_mRNA_NTP_fraction_sql( sim_data, new_gene_mRNA_indexes, ntp_ids ), "is_nonstandard_plot": True, }, "new_gene_monomer_mass_fraction_heatmap": { "columns": ["listeners__monomer_counts", "listeners__mass__protein_mass"], "plot_title": "New Gene Protein Mass Fraction", "projections": [ "list_select(listeners__monomer_counts, " f"{new_gene_monomer_indexes}) AS gene_counts", "listeners__mass__protein_mass AS mass", ], "custom_sql": avg_1d_array_over_scalar_sql("gene_counts", "mass"), "post_func": get_gene_mass_prod_func( sim_data, "monomer", new_gene_monomer_ids ), }, "new_gene_monomer_counts_fraction_heatmap": { "columns": ["listeners__monomer_counts"], "plot_title": "New Gene Protein Counts Fraction", "custom_sql": get_gene_count_fraction_sql( new_gene_monomer_indexes, "listeners__monomer_counts", "monomer" ), }, "new_gene_rnap_init_rate_heatmap": { "columns": [ "listeners__rna_synth_prob__gene_copy_number", "listeners__rnap_data__rna_init_event_per_cistron", ], "plot_title": "New Gene RNAP Initialization Rate", "projections": [ "list_select(listeners__rna_synth_prob__gene_copy_number, " f"{new_gene_cistron_indexes}) AS gene_copy_number", "list_select(listeners__rnap_data__rna_init_event_per_cistron, " f"{new_gene_cistron_indexes}) AS rna_init_event_per_cistron", ], "custom_sql": avg_ratio_of_1d_arrays_sql( "rna_init_event_per_cistron", "gene_copy_number" ), }, "new_gene_ribosome_init_rate_heatmap": { "columns": [ "listeners__rna_counts__mRNA_counts", "listeners__ribosome_data__ribosome_init_event_per_monomer", ], "plot_title": "New Gene Ribosome Initalization Rate", "projections": [ get_rnas_combined_as_genes_projection( "listeners__rna_counts__mRNA_counts", new_gene_mRNA_indexes, "mRNA_counts", ), "list_select(listeners__ribosome_data__ribosome_init_event_per_monomer, " f"{new_gene_monomer_indexes}) AS ribosome_init_event_per_monomer", ], "custom_sql": avg_ratio_of_1d_arrays_sql( "ribosome_init_event_per_monomer", "mRNA_counts" ), }, "new_gene_rnap_time_overcrowded_heatmap": { "columns": ["listeners__rna_synth_prob__tu_is_overcrowded"], "plot_title": "Fraction of Time RNAP Overcrowded New Gene", # Need to explicitly cast boolean list to numeric list for aggregation "projections": [ get_rnas_combined_as_genes_projection( "listeners__rna_synth_prob__tu_is_overcrowded", new_gene_RNA_indexes, "overcrowded", "UTINYINT[]", ) ], "custom_sql": avg_1d_array_sql("overcrowded"), }, "new_gene_ribosome_time_overcrowded_heatmap": { "columns": ["listeners__ribosome_data__mRNA_is_overcrowded"], "plot_title": "Fraction of Time Ribosome Overcrowded New Gene", # Need to explicitly cast boolean list to numeric list for aggregation "projections": [ "list_select(listeners__ribosome_data__mRNA_is_overcrowded, " f"{new_gene_monomer_indexes})::UTINYINT[] AS overcrowded" ], "custom_sql": avg_1d_array_sql("overcrowded"), }, "new_gene_actual_protein_init_prob_heatmap": { "columns": [ "listeners__ribosome_data__actual_prob_translation_per_transcript" ], "plot_title": "New Gene Actual Protein Init Prob", "num_digits_rounding": 4, "projections": [ "list_select(listeners__ribosome_data__actual_prob_translation_per_transcript, " f"{new_gene_monomer_indexes}) AS init_prob" ], "custom_sql": avg_1d_array_sql("init_prob"), }, "new_gene_target_protein_init_prob_heatmap": { "columns": [ "listeners__ribosome_data__target_prob_translation_per_transcript" ], "plot_title": "New Gene Target Protein Init Prob", "num_digits_rounding": 4, "projections": [ "list_select(listeners__ribosome_data__target_prob_translation_per_transcript, " f"{new_gene_monomer_indexes}) AS init_prob" ], "custom_sql": avg_1d_array_sql("init_prob"), }, "new_gene_protein_init_prob_max_p_target_ratio_heatmap": { "columns": [ "listeners__ribosome_data__target_prob_translation_per_transcript", "listeners__ribosome_data__max_p_per_protein", ], "plot_title": "New Gene Protein Max Prob / Target Prob Ratio", "num_digits_rounding": 4, "projections": [ "list_select(listeners__ribosome_data__target_prob_translation_per_transcript, " f"{new_gene_monomer_indexes}) AS target_prob", "list_select(listeners__ribosome_data__max_p_per_protein, " f"{new_gene_monomer_indexes}) AS max_p", ], "custom_sql": avg_ratio_of_1d_arrays_sql("target_prob", "max_p"), }, "new_gene_rna_synth_prob_max_p_target_ratio_heatmap": { "columns": [ "listeners__rna_synth_prob__target_rna_synth_prob", "listeners__rna_synth_prob__max_p", ], "plot_title": "New Gene Protein Max Prob / Target Prob Ratio", "num_digits_rounding": 4, "projections": [ get_rnas_combined_as_genes_projection( "listeners__rna_synth_prob__target_rna_synth_prob", new_gene_RNA_indexes, "target_prob", ), "listeners__rna_synth_prob__max_p AS max_p", ], "custom_sql": avg_1d_array_over_scalar_sql("target_prob", "max_p"), }, "new_gene_ribosome_init_events_heatmap": { "columns": ["listeners__ribosome_data__ribosome_init_event_per_monomer"], "plot_title": "New Gene Ribosome Init Events Per Time Step", "num_digits_rounding": 0, "box_test_size": "x-small", "projections": [ "list_select(listeners__ribosome_data__ribosome_init_event_per_monomer, " f"{new_gene_monomer_indexes}) AS init_events" ], "custom_sql": avg_1d_array_sql("init_events"), }, "new_gene_ribosome_init_events_portion_heatmap": { "columns": [ "listeners__ribosome_data__ribosome_init_event_per_monomer", "listeners__ribosome_data__did_initialize", ], "plot_title": "New Gene Portion of Initiated Ribosomes", "num_digits_rounding": 4, "projections": [ "list_select(listeners__ribosome_data__ribosome_init_event_per_monomer, " f"{new_gene_monomer_indexes}) AS init_events", "listeners__ribosome_data__did_initialize AS did_initialize", ], "custom_sql": avg_1d_array_over_scalar_sql("init_events", "did_initialize"), }, "new_gene_actual_rna_synth_prob_heatmap": { "columns": ["listeners__rna_synth_prob__actual_rna_synth_prob"], "plot_title": "New Gene Actual RNA Synth Prob", "num_digits_rounding": 4, "projections": [ get_rnas_combined_as_genes_projection( "listeners__rna_synth_prob__actual_rna_synth_prob", new_gene_RNA_indexes, "synth_probs", ) ], "custom_sql": avg_1d_array_sql("synth_probs"), }, "new_gene_target_rna_synth_prob_heatmap": { "columns": ["listeners__rna_synth_prob__target_rna_synth_prob"], "plot_title": "New Gene Target RNA Synth Prob", "num_digits_rounding": 4, "projections": [ get_rnas_combined_as_genes_projection( "listeners__rna_synth_prob__target_rna_synth_prob", new_gene_RNA_indexes, "synth_probs", ) ], "custom_sql": avg_1d_array_sql("synth_probs"), }, "new_gene_rnap_counts_heatmap": { "columns": ["listeners__rna_counts__partial_mRNA_counts"], "plot_title": "New Gene RNAP Counts", "num_digits_rounding": 0, "box_text_size": "x-small", "projections": [ get_rnas_combined_as_genes_projection( "listeners__rna_counts__partial_mRNA_counts", new_gene_mRNA_indexes, "rnap_counts", ) ], "custom_sql": avg_1d_array_sql("rnap_counts"), }, "new_gene_rnap_portion_heatmap": { "columns": [ "listeners__rna_counts__partial_mRNA_counts", "listeners__unique_molecule_counts__active_RNAP", ], "plot_title": "New Gene RNAP Portion", "num_digits_rounding": 3, "projections": [ get_rnas_combined_as_genes_projection( "listeners__rna_counts__partial_mRNA_counts", new_gene_mRNA_indexes, "rnap_counts", ), "listeners__unique_molecule_counts__active_RNAP AS active_counts", ], "custom_sql": avg_1d_array_over_scalar_sql("rnap_counts", "active_counts"), }, "rrna_rnap_counts_heatmap": { "columns": ["listeners__rna_counts__partial_rRNA_counts"], "plot_title": "rRNA RNAP Counts", "num_digits_rounding": 0, "projections": [ "listeners__rna_counts__partial_rRNA_counts AS rnap_counts" ], "custom_sql": avg_sum_1d_array_sql("rnap_counts"), }, "rrna_rnap_portion_heatmap": { "columns": [ "listeners__rna_counts__partial_rRNA_counts", "listeners__unique_molecule_counts__active_RNAP", ], "plot_title": "rRNA RNAP Portion", "num_digits_rounding": 3, "projections": [ "listeners__rna_counts__partial_rRNA_counts AS rnap_counts", "listeners__unique_molecule_counts__active_RNAP AS active_counts", ], "custom_sql": avg_sum_1d_array_over_scalar_sql( "rnap_counts", "active_counts" ), }, "rnap_subunit_rnap_counts_heatmap": { "columns": ["listeners__rna_counts__partial_mRNA_counts"], "plot_title": "RNAP Subunit RNAP Counts", "num_digits_rounding": 0, "projections": [ "list_select(listeners__rna_counts__partial_mRNA_counts, " f"{rnap_subunit_mRNA_indexes}) AS rnap_counts" ], "custom_sql": avg_sum_1d_array_sql("rnap_counts"), }, "rnap_subunit_rnap_portion_heatmap": { "columns": [ "listeners__rna_counts__partial_mRNA_counts", "listeners__unique_molecule_counts__active_RNAP", ], "plot_title": "RNAP Subunit RNAP Portion", "num_digits_rounding": 3, "projections": [ "list_select(listeners__rna_counts__partial_mRNA_counts, " f"{rnap_subunit_mRNA_indexes}) AS rnap_counts", "listeners__unique_molecule_counts__active_RNAP AS active_counts", ], "custom_sql": avg_sum_1d_array_over_scalar_sql( "rnap_counts", "active_counts" ), }, "rnap_subunit_ribosome_counts_heatmap": { "columns": ["listeners__ribosome_data__n_ribosomes_per_transcript"], "plot_title": "RNAP Subunit Ribosome Counts", "num_digits_rounding": 0, "box_text_size": "x-small", "projections": [ "list_select(listeners__ribosome_data__n_ribosomes_per_transcript, " f"{rnap_subunit_monomer_indexes}) AS ribosome_counts" ], "custom_sql": avg_sum_1d_array_sql("ribosome_counts"), }, "rnap_subunit_ribosome_portion_heatmap": { "columns": [ "listeners__ribosome_data__n_ribosomes_per_transcript", "listeners__unique_molecule_counts__active_ribosome", ], "plot_title": "RNAP Subunit Ribosome Portion", "num_digits_rounding": 3, "projections": [ "list_select(listeners__ribosome_data__n_ribosomes_per_transcript, " f"{rnap_subunit_monomer_indexes}) AS ribosome_counts", "listeners__unique_molecule_counts__active_ribosome AS active_counts", ], "custom_sql": avg_sum_1d_array_over_scalar_sql( "ribosome_counts", "active_counts" ), }, "ribosomal_protein_rnap_counts_heatmap": { "columns": ["listeners__rna_counts__partial_mRNA_counts"], "plot_title": "Ribosomal Protein RNAP Counts", "num_digits_rounding": 0, "box_text_size": "x-small", "projections": [ "list_select(listeners__rna_counts__partial_mRNA_counts, " f"{ribosomal_mRNA_indexes}) AS rnap_counts" ], "custom_sql": avg_sum_1d_array_sql("rnap_counts"), }, "ribosomal_protein_rnap_portion_heatmap": { "columns": [ "listeners__rna_counts__partial_mRNA_counts", "listeners__unique_molecule_counts__active_RNAP", ], "plot_title": "Ribosomal Protein RNAP Portion", "num_digits_rounding": 3, "projections": [ "list_select(listeners__rna_counts__partial_mRNA_counts, " f"{ribosomal_mRNA_indexes}) AS rnap_counts", "listeners__unique_molecule_counts__active_RNAP AS active_counts", ], "custom_sql": avg_sum_1d_array_over_scalar_sql( "rnap_counts", "active_counts" ), }, "ribosomal_protein_ribosome_counts_heatmap": { "columns": ["listeners__ribosome_data__n_ribosomes_per_transcript"], "plot_title": "Ribosomal Protein Ribosome Counts", "num_digits_rounding": 0, "box_text_size": "x-small", "projections": [ "list_select(listeners__ribosome_data__n_ribosomes_per_transcript, " f"{ribosomal_monomer_indexes}) AS ribosome_counts" ], "custom_sql": avg_sum_1d_array_sql("ribosome_counts"), }, "ribosomal_protein_ribosome_portion_heatmap": { "columns": [ "listeners__ribosome_data__n_ribosomes_per_transcript", "listeners__unique_molecule_counts__active_ribosome", ], "plot_title": "Ribosomal Protein Ribosome Portion", "num_digits_rounding": 3, "projections": [ "list_select(listeners__ribosome_data__n_ribosomes_per_transcript, " f"{ribosomal_monomer_indexes}) AS ribosome_counts", "listeners__unique_molecule_counts__active_ribosome AS active_counts", ], "custom_sql": avg_sum_1d_array_over_scalar_sql( "ribosome_counts", "active_counts" ), }, "new_gene_ribosome_counts_heatmap": { "columns": ["listeners__ribosome_data__n_ribosomes_per_transcript"], "plot_title": "New Gene Ribosome Counts", "num_digits_rounding": 0, "box_text_size": "x-small", "projections": [ "list_select(listeners__ribosome_data__n_ribosomes_per_transcript, " f"{new_gene_monomer_indexes}) AS ribosome_counts" ], "custom_sql": avg_1d_array_sql("ribosome_counts"), }, "new_gene_ribosome_portion_heatmap": { "columns": [ "listeners__ribosome_data__n_ribosomes_per_transcript", "listeners__unique_molecule_counts__active_ribosome", ], "plot_title": "New Gene Ribosome Portion", "num_digits_rounding": 3, "projections": [ "list_select(listeners__ribosome_data__n_ribosomes_per_transcript, " f"{new_gene_monomer_indexes}) AS ribosome_counts", "listeners__unique_molecule_counts__active_ribosome AS active_counts", ], "custom_sql": avg_1d_array_over_scalar_sql( "ribosome_counts", "active_counts" ), }, "capacity_gene_rnap_portion_heatmap": { "columns": [ "listeners__rna_counts__partial_mRNA_counts", "listeners__unique_molecule_counts__active_RNAP", ], "plot_title": "Capacity Gene RNAP Portion: ", "num_digits_rounding": 4, "projections": [ get_rnas_combined_as_genes_projection( "listeners__rna_counts__partial_mRNA_counts", capacity_gene_mRNA_indexes, "rnap_counts", ), "listeners__unique_molecule_counts__active_RNAP AS active_counts", ], "custom_sql": avg_1d_array_over_scalar_sql("rnap_counts", "active_counts"), }, "capacity_gene_ribosome_portion_heatmap": { "columns": [ "listeners__ribosome_data__n_ribosomes_per_transcript", "listeners__unique_molecule_counts__active_ribosome", ], "plot_title": "Capacity Gene Ribosome Portion: ", "num_digits_rounding": 4, "projections": [ "list_select(listeners__ribosome_data__n_ribosomes_per_transcript, " f"{capacity_gene_monomer_indexes}) AS ribosome_counts", "listeners__unique_molecule_counts__active_ribosome AS active_counts", ], "custom_sql": avg_1d_array_over_scalar_sql( "ribosome_counts", "active_counts" ), }, "glucose_consumption_rate": { "columns": [ "listeners__fba_results__external_exchange_fluxes", "listeners__mass__dry_mass", ], "plot_title": "Average Glucose Consumption Rate (fg/hr)", "num_digits_rounding": 1, "projections": [ "-listeners__fba_results__external_exchange_fluxes[" f"{glucose_idx}] AS glucose_flux", "listeners__mass__dry_mass AS dry_mass", ], "remove_first": True, "custom_sql": f""" WITH avg_per_cell AS ( SELECT avg(glucose_flux * dry_mass) * {flux_scaling_factor} AS avg_fg_glc_per_hr, experiment_id, variant FROM ({{subquery}}) GROUP BY experiment_id, variant, lineage_seed, generation, agent_id ) SELECT variant, avg(avg_fg_glc_per_hr) AS mean, stddev(avg_fg_glc_per_hr) AS std FROM avg_per_cell GROUP BY experiment_id, variant """, }, "new_gene_yield_per_glucose": { "columns": [ "listeners__fba_results__external_exchange_fluxes", "listeners__mass__dry_mass", "time", "listeners__monomer_counts", ], "plot_title": "New Gene fg Protein Yield per fg Glucose", "num_digits_rounding": 3, "projections": [ "-listeners__fba_results__external_exchange_fluxes[" f"{glucose_idx}] AS glucose_flux", "listeners__mass__dry_mass AS dry_mass", "time", "list_select(listeners__monomer_counts, " f"{new_gene_monomer_indexes}) AS monomer_counts", ], "remove_first": True, "custom_sql": f""" WITH unnested_counts AS ( SELECT unnest(monomer_counts) AS monomer_counts, generate_subscripts(monomer_counts, 1) AS monomer_idx, glucose_flux, dry_mass, time, experiment_id, variant, lineage_seed, generation, agent_id FROM ({{subquery}}) ), avg_per_cell AS ( SELECT avg(glucose_flux * dry_mass) * {flux_scaling_factor} AS avg_fg_glc_per_hr, (max(time) - min(time)) / 60 AS doubling_time, last(monomer_counts::BIGINT ORDER BY time) - first(monomer_counts::BIGINT ORDER BY time) AS monomer_delta, experiment_id, variant, monomer_idx FROM unnested_counts GROUP BY experiment_id, variant, lineage_seed, generation, agent_id, monomer_idx ), avg_per_variant AS ( SELECT experiment_id, variant, avg(monomer_delta / ( avg_fg_glc_per_hr * doubling_time)) AS avg_monomer_per_fg_glc, monomer_idx, stddev(monomer_delta / (avg_fg_glc_per_hr * doubling_time)) AS std_monomer_per_fg_glc FROM avg_per_cell GROUP BY experiment_id, variant, monomer_idx ) SELECT variant, list(avg_monomer_per_fg_glc ORDER BY monomer_idx) AS mean, list(std_monomer_per_fg_glc ORDER BY monomer_idx) AS std FROM avg_per_variant GROUP BY experiment_id, variant """, "post_func": get_gene_mass_prod_func( sim_data, "monomer", new_gene_monomer_ids ), }, "new_gene_yield_per_hour": { "columns": ["time", "listeners__monomer_counts"], "plot_title": "New Gene fg Protein Yield per Hour", "num_digits_rounding": 2, "projections": [ "time", "list_select(listeners__monomer_counts, " f"{new_gene_monomer_indexes}) AS monomer_counts", ], "remove_first": True, "custom_sql": """ WITH unnested_counts AS ( SELECT unnest(monomer_counts) AS monomer_counts, time, generate_subscripts(monomer_counts, 1) AS monomer_idx, experiment_id, variant, lineage_seed, generation, agent_id FROM ({subquery}) ), avg_per_cell AS ( SELECT (max(time) - min(time)) / 60 AS doubling_time, last(monomer_counts::BIGINT ORDER BY time) - first( monomer_counts::BIGINT ORDER BY time) AS monomer_delta, experiment_id, variant, monomer_idx FROM unnested_counts GROUP BY experiment_id, variant, lineage_seed, generation, agent_id, monomer_idx ), avg_per_variant AS ( SELECT experiment_id, variant, avg(monomer_delta / doubling_time) AS avg_monomer_per_hr, monomer_idx, stddev(monomer_delta / doubling_time) AS std_monomer_per_hr FROM avg_per_cell GROUP BY experiment_id, variant, monomer_idx ) SELECT variant, list(avg_monomer_per_hr ORDER BY monomer_idx) AS mean, list(std_monomer_per_hr ORDER BY monomer_idx) AS std FROM avg_per_variant GROUP BY experiment_id, variant """, "post_func": get_gene_mass_prod_func( sim_data, "monomer", new_gene_monomer_ids ), }, } # Check validity of requested heatmaps and fill in default values where needed heatmaps_to_make = HEATMAPS_TO_MAKE_LIST total_heatmaps_to_make = 0 for h in heatmaps_to_make: assert h in heatmap_details, "Heatmap " + h + " is not an option" heatmap_details[h].setdefault("is_nonstandard_plot", False) heatmap_details[h].setdefault("remove_first", False) heatmap_details[h].setdefault("num_digits_rounding", 2) heatmap_details[h].setdefault("box_text_size", "medium") heatmap_details[h].setdefault("is_new_gene_heatmap", False) heatmap_details[h].setdefault("is_capacity_gene_heatmap", False) heatmap_details[h].setdefault("default_value", -1) heatmap_details[h].setdefault("projections", None) heatmap_details[h].setdefault("order_results", False) heatmap_details[h].setdefault("custom_sql", None) heatmap_details[h].setdefault("post_func", None) if h == "new_gene_mRNA_NTP_fraction_heatmap": total_heatmaps_to_make += len(ntp_ids) * len(new_gene_cistron_ids) heatmap_details[h]["default_value"] = np.full( (len(ntp_ids), len(new_gene_cistron_ids)), -1 ) elif h.startswith("new_gene_"): total_heatmaps_to_make += len(new_gene_cistron_ids) heatmap_details[h]["is_new_gene_heatmap"] = True heatmap_details[h]["default_value"] = np.full(len(new_gene_mRNA_ids), -1) elif h.startswith("capacity_gene_"): total_heatmaps_to_make += len(capacity_gene_monomer_ids) heatmap_details[h]["is_capacity_gene_heatmap"] = True heatmap_details[h]["default_value"] = np.full( len(capacity_gene_monomer_ids), -1 ) else: total_heatmaps_to_make += 1 # Data extraction print("---Data Extraction---") heatmap_data = {} for h in tqdm(heatmaps_to_make): h_details = heatmap_details[h] mean_matrix, std_matrix = get_mean_and_std_matrices( conn, variant_to_row_col, variant_matrix_shape, history_sql, h_details["columns"], h_details["projections"], h_details["remove_first"], None, h_details["order_results"], h_details["custom_sql"], h_details["post_func"], h_details["num_digits_rounding"], h_details["default_value"], ) heatmap_data[h] = {"mean": mean_matrix, "std_dev": std_matrix} # Plotting print("---Plotting---") plot_suffix = "_gens_" + str(MIN_CELL_INDEX) + "_through_" + str(MAX_CELL_INDEX) heatmap_x_label = "Expression Variant" heatmap_y_label = "Translation Efficiency Value" figsize_x = 2 + 2 * len(new_gene_expression_factors) / 3 figsize_y = 2 * len(new_gene_translation_efficiency_values) / 2 # Figure out whether to create dashboard / individual plots and # whether to make std. dev. plots in addition to mean plots summary_statistics = ["mean"] if DASHBOARD_FLAG == 0: is_dashboards = [False] elif DASHBOARD_FLAG == 1: is_dashboards = [True] elif DASHBOARD_FLAG == 2: is_dashboards = [True, False] if STD_DEV_FLAG: summary_statistics.append("std_dev") for is_dashboard in is_dashboards: for summary_statistic in summary_statistics: plot_heatmaps( heatmap_data, heatmap_details, new_gene_cistron_ids, ntp_ids, capacity_gene_common_names, total_heatmaps_to_make, is_dashboard, variant_mask, heatmap_x_label, heatmap_y_label, new_gene_expression_factors, new_gene_translation_efficiency_values, summary_statistic, figsize_x, figsize_y, outdir, plot_suffix, )