Source code for ecoli.analysis.multiseed.ecocyc_table

"""
Generates tables of data to share with EcoCyc for display on the "modeling" tab.

TODO:
    other values
        weighted average for counts (time step weighted and cell cycle progress weighted)
        max/min
"""

import csv
import json
import os
import pickle
import tempfile
from typing import TYPE_CHECKING, Any

from duckdb import DuckDBPyConnection
import numpy as np
import polars as pl
from scipy.stats import pearsonr

from ecoli.library.parquet_emitter import (
    get_config_value,
    get_field_metadata,
    open_arbitrary_sim_data,
    open_output_file,
    num_cells,
    read_stacked_columns,
    skip_n_gens,
)
from ecoli.processes.metabolism import (
    COUNTS_UNITS,
    MASS_UNITS,
    TIME_UNITS,
    VOLUME_UNITS,
)
from wholecell.utils import units

if TYPE_CHECKING:
    from reconstruction.ecoli.simulation_data import SimulationDataEcoli

IGNORE_FIRST_N_GENS = 2

MEDIA_NAME_TO_ID = {
    "minimal": "MIX0-57",
    "minimal_minus_oxygen": "MIX0-57-ANAEROBIC",
    "minimal_plus_amino_acids": "MIX0-850",
    "minimal_acetate": "MIX0-58",
    "minimal_succinate": "MIX0-844",
}


[docs] def save_file(outdir, filename, columns, values: list[pl.Series]): outfile = os.path.join(outdir, filename) print(f"Saving data to {outfile}") # Data rows out_df = pl.DataFrame({k: s for k, s in zip(columns, values)}) with open(outfile, "w") as f, tempfile.NamedTemporaryFile("w+") as temp_out: out_df.write_csv(temp_out.name, separator="\t") # Header for columns writer = csv.writer(f, delimiter="\t") writer.writerow(["# Column descriptions:"]) for col, desc in columns.items(): writer.writerow([f"# {col}", desc]) for line in temp_out: f.write(line)
[docs] def plot( params: dict[str, Any], conn: DuckDBPyConnection, history_sql: str, config_sql: str, sim_data_dict: dict[str, dict[int, str]], validation_data_paths: list[str], outdir: str, variant_metadata: dict[str, dict[int, Any]], variant_names: dict[str, str], ): with open_arbitrary_sim_data(sim_data_dict) as f: sim_data: "SimulationDataEcoli" = pickle.load(f) with open_output_file(validation_data_paths[0]) as f: validation_data = pickle.load(f) media_name = sim_data.conditions[sim_data.condition]["nutrients"] media_id = MEDIA_NAME_TO_ID.get(media_name, media_name) # Ignore first N generations history_sql = skip_n_gens(history_sql, IGNORE_FIRST_N_GENS) config_sql = skip_n_gens(config_sql, IGNORE_FIRST_N_GENS) if num_cells(conn, config_sql) == 0: print("Skipping analysis -- not enough simulations run.") return # Load tables and attributes for mRNAs mRNA_ids = get_field_metadata( conn, config_sql, "listeners__rna_counts__mRNA_cistron_counts" ) # mass_unit = get_field_metadata(config_lf, 'listeners__mass__dry_mass') # assert mass_unit == 'fg' # Load tables and attributes for tRNAs and rRNAs bulk_ids = get_field_metadata(conn, config_sql, "bulk") bulk_id_to_idx = {bulk_id: i + 1 for i, bulk_id in enumerate(bulk_ids)} uncharged_tRNA_ids = sim_data.process.transcription.uncharged_trna_names uncharged_tRNA_idx = [bulk_id_to_idx[trna] for trna in uncharged_tRNA_ids] charged_tRNA_ids = sim_data.process.transcription.charged_trna_names charged_tRNA_idx = [bulk_id_to_idx[trna] for trna in charged_tRNA_ids] tRNA_cistron_ids = [tRNA_id[:-3] for tRNA_id in uncharged_tRNA_ids] rRNA_ids = [ sim_data.molecule_groups.s30_16s_rRNA[0], sim_data.molecule_groups.s50_23s_rRNA[0], sim_data.molecule_groups.s50_5s_rRNA[0], ] rRNA_idx = [bulk_id_to_idx[trna] for trna in rRNA_ids] rRNA_cistron_ids = [rRNA_id[:-3] for rRNA_id in rRNA_ids] ribosomal_subunit_ids = [ sim_data.molecule_ids.s30_full_complex, sim_data.molecule_ids.s50_full_complex, ] ribo_subunit_idx = [bulk_id_to_idx[ribo] for ribo in ribosomal_subunit_ids] # Filter out first timestep for each cell because counts_to_molar is 0 rna_subquery = read_stacked_columns( history_sql, [ "bulk", "listeners__unique_molecule_counts__active_ribosome", "listeners__enzyme_kinetics__counts_to_molar", "listeners__mass__dry_mass", "listeners__rna_counts__mRNA_cistron_counts", ], [ # Extract only necessary bulk counts to reduce RAM usage f"list_select(bulk, {charged_tRNA_idx}) AS charged_tRNAs, " f"list_select(bulk, {uncharged_tRNA_idx}) AS uncharged_tRNAs, " f"list_select(bulk, {rRNA_idx}) AS rRNAs, " f"list_select(bulk, {ribo_subunit_idx}) AS ribo_subunits", "listeners__unique_molecule_counts__active_ribosome", "listeners__enzyme_kinetics__counts_to_molar", "listeners__mass__dry_mass", "listeners__rna_counts__mRNA_cistron_counts", ], remove_first=True, order_results=False, ) rna_data = conn.sql( f""" WITH rna_list AS ( SELECT -- Create RNA counts list of mRNAs, tRNAs, and rRNAs (in order) ( -- mRNA listeners__rna_counts__mRNA_cistron_counts + -- tRNA = charged + uncharged [ trna[1] + trna[2] FOR trna IN list_zip(charged_tRNAs, uncharged_tRNAs) ] + -- First rRNA = bulk + active ribosome + small subunit [ rRNAs[1] + listeners__unique_molecule_counts__active_ribosome + ribo_subunits[1] ] + -- Remaining rRNAs = bulk + active ribosome + large subunit [ rrna_count + listeners__unique_molecule_counts__active_ribosome + ribo_subunits[2] FOR rrna_count IN rRNAs[2:] ] ) AS rna_counts, listeners__enzyme_kinetics__counts_to_molar AS counts_to_molar, listeners__mass__dry_mass AS dry_masses FROM ({rna_subquery}) ), unnested_counts AS ( SELECT -- Unnest RNA counts to perform aggregations (e.g. mean, stddev) unnest(rna_counts) AS rna_counts, -- Track list index for each unnested row for grouped aggregations generate_subscripts(rna_counts, 1) AS rna_idx, counts_to_molar, dry_masses FROM rna_list ) SELECT avg(rna_counts) AS "rna-count-avg", stddev(rna_counts) AS "rna-count-std", avg(rna_counts * counts_to_molar) AS "rna-concentration-avg", stddev(rna_counts * counts_to_molar) AS "rna-concentration-std", avg(dry_masses) AS "dry-masses-avg" FROM unnested_counts GROUP BY rna_idx ORDER BY rna_idx """ ).pl() # Filter out first timestep for each cell because counts_to_molar is 0 gene_copy_num_subquery = read_stacked_columns( history_sql, ["listeners__rna_synth_prob__gene_copy_number"], remove_first=True, order_results=False, ) gene_copy_data = conn.sql( f""" WITH unnested_counts AS ( SELECT -- Unnest gene copy number to perform aggregations (e.g. mean, stddev) unnest(listeners__rna_synth_prob__gene_copy_number) AS gene_copy_numbers, -- Track list index for each unnested row for grouped aggregations generate_subscripts(listeners__rna_synth_prob__gene_copy_number, 1) AS gene_idx FROM ({gene_copy_num_subquery}) ) SELECT avg(gene_copy_numbers) AS "gene-copy-number-avg", stddev(gene_copy_numbers) AS "gene-copy-number-std" FROM unnested_counts GROUP BY gene_idx ORDER BY gene_idx """ ).pl() # Retrieve gene copy numbers in order of RNA counts cistron_id_to_gene_id = { cistron["id"]: cistron["gene_id"] for cistron in sim_data.process.transcription.cistron_data } gene_ids = [ cistron_id_to_gene_id[rna_id] for rna_id in mRNA_ids + tRNA_cistron_ids + rRNA_cistron_ids ] gene_ids_rna_synth_prob = get_field_metadata( conn, config_sql, "listeners__rna_synth_prob__gene_copy_number" ) gene_id_to_idx = {gene: i for i, gene in enumerate(gene_ids_rna_synth_prob)} gene_to_rna_order_idx = [gene_id_to_idx[gene] for gene in gene_ids] # Get RNA molecular weights gene_id_to_mw = { gene_id: cistron_mw for (gene_id, cistron_mw) in zip( sim_data.process.transcription.cistron_data["gene_id"], sim_data.process.transcription.cistron_data["mw"].asNumber( units.fg / units.count ), ) } rna_mw = np.array([gene_id_to_mw[gene] for gene in gene_ids]) # Calculate relative statistics rel_to_type_count = np.zeros(len(rna_data)) rel_to_type_mass = np.zeros(len(rna_data)) start_idx = 0 for add_idx in [len(mRNA_ids), len(tRNA_cistron_ids), len(rRNA_cistron_ids)]: rna_slice_mw = rna_mw[start_idx : start_idx + add_idx] rel_to_type_count[start_idx : start_idx + add_idx] = rna_data.select( pl.col("rna-count-avg").slice(start_idx, add_idx) / pl.col("rna-count-avg").slice(start_idx, add_idx).sum() )["rna-count-avg"] rel_to_type_mass[start_idx : start_idx + add_idx] = rna_data.select( pl.col("rna-count-avg").slice(start_idx, add_idx) * rna_slice_mw / (pl.col("rna-count-avg").slice(start_idx, add_idx) * rna_slice_mw).sum() )["rna-count-avg"] start_idx += add_idx rna_data = rna_data.with_columns( **{ # type: ignore[arg-type] "relative-rna-count-to-total-rna-type-counts": rel_to_type_count, "relative-rna-mass-to-total-rna-type-mass": rel_to_type_mass, "relative-rna-count-to-total-rna-counts": rna_data.select( pl.col("rna-count-avg") / pl.col("rna-count-avg").sum() )["rna-count-avg"], "relative-rna-mass-to-total-rna-mass": rna_data.select( pl.col("rna-count-avg") * rna_mw / (pl.col("rna-count-avg") * rna_mw).sum() )["rna-count-avg"], "relative-rna-mass-to-total-cell-dry-mass": rna_data.select( pl.col("rna-count-avg") * rna_mw / pl.col("dry-masses-avg").sum() )["rna-count-avg"], "id": pl.Series(gene_ids), "gene-copy-number-avg": gene_copy_data["gene-copy-number-avg"][ gene_to_rna_order_idx ], "gene-copy-number-std": gene_copy_data["gene-copy-number-std"][ gene_to_rna_order_idx ], } ) columns = { "id": "Object ID, according to EcoCyc", "gene-copy-number-avg": "A floating point number", "gene-copy-number-std": "A floating point number", "rna-count-avg": "A floating point number", "rna-count-std": "A floating point number", "rna-concentration-avg": "A floating point number in mM units", "rna-concentration-std": "A floating point number in mM units", "relative-rna-count-to-total-rna-counts": "A floating point number", "relative-rna-count-to-total-rna-type-counts": "A floating point number", "relative-rna-mass-to-total-rna-mass": "A floating point number", "relative-rna-mass-to-total-rna-type-mass": "A floating point number", "relative-rna-mass-to-total-cell-dry-mass": "A floating point number", } values = [rna_data[k] for k in columns] save_file(outdir, f"wcm_rnas_{media_id}.tsv", columns, values) # Build dictionary for metadata ecocyc_metadata = { "git_hash": get_config_value(conn, config_sql, "git_hash"), "n_ignored_generations": IGNORE_FIRST_N_GENS, "n_total_generations": get_config_value(conn, config_sql, "generations"), "n_seeds": get_config_value(conn, config_sql, "n_init_sims"), "n_cells": num_cells(conn, config_sql), "n_timesteps": len(rna_data), } # Filter out first timestep for each cell because counts_to_molar is 0 monomer_subquery = read_stacked_columns( history_sql, [ "listeners__enzyme_kinetics__counts_to_molar", "listeners__mass__dry_mass", "listeners__monomer_counts", ], remove_first=True, order_results=False, ) # Load tables and attributes for proteins monomer_data = conn.sql( f""" WITH unnested_counts AS ( SELECT listeners__mass__dry_mass AS dry_masses, listeners__enzyme_kinetics__counts_to_molar AS counts_to_molar, -- Unnest monomer counts to calculate aggregations (e.g. mean) unnest(listeners__monomer_counts) AS monomer_counts, -- Track list index for each unnested row for grouped aggregations generate_subscripts(listeners__monomer_counts, 1) AS monomer_idx, FROM ({monomer_subquery}) ) SELECT avg(monomer_counts) AS "protein-count-avg", stddev(monomer_counts) AS "protein-count-std", avg(monomer_counts * counts_to_molar) AS "protein-concentration-avg", stddev(monomer_counts * counts_to_molar) AS "protein-concentration-std", avg(dry_masses) AS "dry-masses-avg" FROM unnested_counts GROUP BY monomer_idx ORDER BY monomer_idx """ ).pl() monomer_ids = get_field_metadata(conn, config_sql, "listeners__monomer_counts") monomer_ecocyc_ids = [monomer[:-3] for monomer in monomer_ids] # strip [*] monomer_mw = sim_data.getter.get_masses(monomer_ids).asNumber( units.fg / units.count ) monomer_sim_data = sim_data.process.translation.monomer_data.struct_array monomer_to_gene_id = { monomer_id: cistron_id_to_gene_id[cistron_id] for cistron_id, monomer_id in zip( monomer_sim_data["cistron_id"], monomer_sim_data["id"] ) } # Calculate relative statistics monomer_data = monomer_data.with_columns( **{ "relative-protein-count-to-total-protein-counts": monomer_data.select( pl.col("protein-count-avg") / pl.col("protein-count-avg").sum() )["protein-count-avg"], "relative-protein-mass-to-total-protein-mass": monomer_data.select( pl.col("protein-count-avg") * monomer_mw / (pl.col("protein-count-avg") * monomer_mw).sum() )["protein-count-avg"], "relative-protein-mass-to-total-cell-dry-mass": monomer_data.select( pl.col("protein-count-avg") * monomer_mw / pl.col("dry-masses-avg") )["protein-count-avg"], "gene-id": pl.Series( [monomer_to_gene_id[monomer_id] for monomer_id in monomer_ids] ), "id": pl.Series(monomer_ecocyc_ids), } ) monomer_data = monomer_data.join( rna_data.select(**{"gene-id": "id", "rna-count-avg": "rna-count-avg"}), on="gene-id", ) monomer_data = monomer_data.with_columns( **{ "relative-protein-count-to-protein-rna-counts": monomer_data.select( pl.col("protein-count-avg") / pl.col("rna-count-avg") )["protein-count-avg"] } ) columns = { "id": "Object ID, according to EcoCyc", "protein-count-avg": "A floating point number", "protein-count-std": "A floating point number", "protein-concentration-avg": "A floating point number in mM units", "protein-concentration-std": "A floating point number in mM units", "relative-protein-count-to-protein-rna-counts": "A floating point number", "relative-protein-mass-to-total-protein-mass": "A floating point number", "relative-protein-mass-to-total-cell-dry-mass": "A floating point number", } values = [monomer_data[k] for k in columns] # Add validation data if sims used minimal glucose media if media_name == "minimal": protein_id_to_schmidt_counts = { item[0]: item[1] for item in validation_data.protein.schmidt2015Data } protein_counts_val = np.array( [ protein_id_to_schmidt_counts.get(protein_id, np.nan) for protein_id in monomer_ids ] ) columns["validation-count"] = "A floating point number" values.append(pl.Series(protein_counts_val)) protein_val_exists = np.logical_not(np.isnan(protein_counts_val)) r, _ = pearsonr( monomer_data["protein-count-avg"].filter(pl.Series(protein_val_exists)), protein_counts_val[protein_val_exists], ) ecocyc_metadata["protein_validation_r_squared"] = r**2 save_file(outdir, f"wcm_monomers_{media_id}.tsv", columns, values) # Read data and load attributes for complexes complex_ids = sim_data.process.complexation.ids_complexes complex_idx = [bulk_id_to_idx[cplx] for cplx in complex_ids] complex_subquery = read_stacked_columns( history_sql, [ "bulk", "listeners__enzyme_kinetics__counts_to_molar", "listeners__mass__dry_mass", ], [ # Extract only complex bulk counts to reduce RAM usage f"list_select(bulk, {complex_idx}) AS complex_counts", "listeners__enzyme_kinetics__counts_to_molar", "listeners__mass__dry_mass", ], remove_first=True, order_results=False, ) complex_data = conn.sql( f""" WITH unnested_counts AS ( SELECT -- Unnest complex counts to calculate aggregations (e.g. mean) unnest(complex_counts) AS complex_counts, -- Track list index for each unnested row for grouped aggregations generate_subscripts(complex_counts, 1) AS complex_idx, listeners__enzyme_kinetics__counts_to_molar AS counts_to_molar, listeners__mass__dry_mass AS dry_masses, FROM ({complex_subquery}) ) SELECT avg(complex_counts) AS "complex-count-avg", stddev(complex_counts) AS "complex-count-std", avg(complex_counts * counts_to_molar) AS "complex-concentration-avg", stddev(complex_counts * counts_to_molar) AS "complex-concentration-std", avg(dry_masses) AS "dry-masses-avg" FROM unnested_counts GROUP BY complex_idx ORDER BY complex_idx """ ).pl() # Calculate derived protein values complex_mw = sim_data.getter.get_masses(complex_ids).asNumber( units.fg / units.count ) complex_data = complex_data.with_columns( **{ "relative-complex-mass-to-total-protein-mass": complex_data.select( pl.col("complex-count-avg") * complex_mw / (pl.col("complex-count-avg") * complex_mw).sum() )["complex-count-avg"], "relative-complex-mass-to-total-cell-dry-mass": complex_data.select( pl.col("complex-count-avg") * complex_mw / pl.col("dry-masses-avg") )["complex-count-avg"], "id": pl.Series( [complex_id[:-3] for complex_id in complex_ids] ), # strip [*] } ) # Save complex data in table columns = { "id": "Object ID, according to EcoCyc", "complex-count-avg": "A floating point number", "complex-count-std": "A floating point number", "complex-concentration-avg": "A floating point number in mM units", "complex-concentration-std": "A floating point number in mM units", "relative-complex-mass-to-total-protein-mass": "A floating point number", "relative-complex-mass-to-total-cell-dry-mass": "A floating point number", } values = [complex_data[k] for k in columns] save_file(outdir, f"wcm_complexes_{media_id}.tsv", columns, values) # Load attributes for metabolic fluxes cell_density = sim_data.constants.cell_density cell_density = cell_density.asNumber(MASS_UNITS / VOLUME_UNITS) reaction_ids = sim_data.process.metabolism.base_reaction_ids # Read fluxes flux_subquery = read_stacked_columns( history_sql, [ "listeners__fba_results__base_reaction_fluxes", "listeners__mass__cell_mass", "listeners__mass__dry_mass", ], order_results=False, ) flux_data = conn.sql( f""" WITH unnest_fluxes AS ( SELECT listeners__mass__dry_mass / listeners__mass__cell_mass * {cell_density} AS conversion_coeffs, -- Unnest monomer counts to calculate aggregations (e.g. mean) unnest(listeners__fba_results__base_reaction_fluxes) AS fluxes, generate_subscripts(listeners__fba_results__base_reaction_fluxes, 1) AS idx, FROM ({flux_subquery}) ) SELECT avg(fluxes / conversion_coeffs) AS "flux-avg", stddev(fluxes / conversion_coeffs) AS "flux-std" FROM unnest_fluxes GROUP BY idx ORDER BY idx """ ).pl() flux_data = flux_data.with_columns( **{ "id": pl.Series(reaction_ids), "flux-avg": ( (COUNTS_UNITS / MASS_UNITS / TIME_UNITS) * pl.col("flux-avg") ).asNumber(units.mmol / units.g / units.h), "flux-std": ( (COUNTS_UNITS / MASS_UNITS / TIME_UNITS) * pl.col("flux-std") ).asNumber(units.mmol / units.g / units.h), } ) columns = { "id": "Object ID, according to EcoCyc", "flux-avg": "A floating point number in mmol/g DCW/h units", "flux-std": "A floating point number in mmol/g DCW/h units", } values = [flux_data[k] for k in columns] save_file(outdir, f"wcm_metabolic_reactions_{media_id}.tsv", columns, values) metadata_file = os.path.join(outdir, f"wcm_metadata_{media_id}.json") with open(metadata_file, "w") as f: print(f"Saving data to {metadata_file}") json.dump(ecocyc_metadata, f, indent=4)