"""
ValidationData for Ecoli
Raw data processed into forms convienent for validation and analysis
"""
import numpy as np
from unum import Unum
# Raw data class
from reconstruction.ecoli.knowledge_base_raw import KnowledgeBaseEcoli
# Data classes
from reconstruction.ecoli.dataclasses.getter_functions import (
GetterFunctions,
UNDEFINED_COMPARTMENT_IDS_TO_ABBREVS,
)
from reconstruction.ecoli.dataclasses.molecule_groups import MoleculeGroups
from reconstruction.ecoli.dataclasses.molecule_ids import MoleculeIds
from reconstruction.ecoli.dataclasses.constants import Constants
from reconstruction.ecoli.dataclasses.state.internal_state import InternalState
from reconstruction.ecoli.dataclasses.process.process import Process
from reconstruction.ecoli.dataclasses.growth_rate_dependent_parameters import (
Mass,
GrowthRateParameters,
)
from reconstruction.ecoli.dataclasses.relation import Relation
__all__ = [
"KnowledgeBaseEcoli",
"GetterFunctions",
"MoleculeGroups",
"MoleculeIds",
"Constants",
"InternalState",
"Process",
"Mass",
"GrowthRateParameters",
"Relation",
"ValidationDataEcoli",
"Protein",
"ReactionFlux",
"EssentialGenes",
"GeneFunctions",
]
[docs]
class ValidationDataEcoli(object):
"""ValidationDataEcoli"""
def __init__(self):
pass
[docs]
def initialize(self, validation_data_raw, knowledge_base_raw):
self.protein = Protein(validation_data_raw, knowledge_base_raw)
self.reactionFlux = ReactionFlux(validation_data_raw, knowledge_base_raw)
self.essential_genes = EssentialGenes(validation_data_raw)
self.geneFunctions = GeneFunctions(validation_data_raw)
self.macromolecular_growth_rate_modulation = MacromolecularGrowthRateModulation(
validation_data_raw
)
self._add_amino_acid_growth_rates(validation_data_raw)
[docs]
def _add_amino_acid_growth_rates(self, validation_data_raw):
"""
Loads growth rates with single amino acids supplemented in media.
amino_acid_media_growth_rates: dict with data from 4 replicates
{
media ID (str):
{
'mean': mean max growth rate from 4 replicates (float with units per time)
'std': standard deviation for max growth rate from 4 replicates (float with units per time)
}
}
amino_acid_media_dose_dependent_growth_rates: dict with data from single measurements from 4 concentrations
{
media ID (str):
{
'conc': concentration of the amino acid in media (np.ndarray[float] with units mol/volume)
'growth': max growth rate corresponding to each media concentration (float with units per time)
}
}
"""
rates = {}
for row in validation_data_raw.amino_acid_growth_rates:
rates[row["Media"]] = {
"mean": row["Average max growth rate"],
"std": row["Max growth rate standard deviation"],
}
self.amino_acid_growth_rates = rates
rates = {}
for row in validation_data_raw.amino_acid_growth_rates_dose_response:
rates[row["Media"]] = {
"conc": row["Media concentrations"],
"growth": row["Max growth rates"],
}
self.amino_acid_dose_dependent_growth_rates = rates
[docs]
class Protein(object):
"""Protein"""
def __init__(self, validation_data_raw, knowledge_base_raw):
compartment_ids_to_abbreviations = {
comp["id"]: comp["abbrev"] for comp in knowledge_base_raw.compartments
}
# Compartments that don't exist in compartments.tsv
compartment_ids_to_abbreviations.update(UNDEFINED_COMPARTMENT_IDS_TO_ABBREVS)
protein_id_to_compartment_tag = {}
for protein in knowledge_base_raw.proteins:
exp_compartment = protein["experimental_compartment"]
comp_compartment = protein["computational_compartment"]
if len(exp_compartment) + len(comp_compartment) == 0:
compartment = "CCO-CYTOSOL"
elif len(exp_compartment) > 0:
compartment = exp_compartment[0]
else:
compartment = comp_compartment[0]
protein_id_to_compartment_tag.update(
{protein["id"]: [compartment_ids_to_abbreviations[compartment]]}
)
# Build and save a dict from gene ID to monomerId
rna_id_to_gene_id = {
gene["rna_ids"][0]: gene["id"] for gene in knowledge_base_raw.genes
}
self.geneIdToMonomerId = {}
for rna in knowledge_base_raw.rnas:
if (
len(rna["monomer_ids"]) > 0
and rna["monomer_ids"][0] in protein_id_to_compartment_tag
):
self.geneIdToMonomerId[rna_id_to_gene_id[rna["id"]]] = (
f"{rna['monomer_ids'][0]}[{protein_id_to_compartment_tag[rna['monomer_ids'][0]][0]}]"
)
# Build and save a dict from gene symbol to corresponding monomerId
self.geneSymbolToMonomerId = {}
gene_id_to_symbol = {
gene["id"]: gene["symbol"] for gene in knowledge_base_raw.genes
}
for gene_id, monomer_id in self.geneIdToMonomerId.items():
self.geneSymbolToMonomerId[gene_id_to_symbol[gene_id]] = monomer_id
self._loadTaniguchi2010Counts(validation_data_raw)
self._loadHouser2015Counts(validation_data_raw)
self._loadWisniewski2014Counts(validation_data_raw, knowledge_base_raw)
self._loadSchmidt2015Counts(validation_data_raw)
self._load_li(validation_data_raw)
[docs]
def _loadTaniguchi2010Counts(self, validation_data_raw):
# Load taniguichi Xie Science 2010 dataset
taniguichi_dataset = validation_data_raw.taniguichi2010_table_6
self.taniguichi2010counts = np.zeros(
len(taniguichi_dataset),
dtype=[
("monomerId", "U100"),
("gene_symbol", "U10"),
("b_number", "U10"),
("counts_ave", np.float32),
("gamma_shape_parameter", np.float32),
("gamma_scale_parameter", np.float32),
],
)
for idx, row in enumerate(taniguichi_dataset):
self.taniguichi2010counts[idx]["gene_symbol"] = row["Gene_Name"]
self.taniguichi2010counts[idx]["b_number"] = row["B_Number"]
self.taniguichi2010counts[idx]["counts_ave"] = row["Mean_Protein"]
self.taniguichi2010counts[idx]["gamma_shape_parameter"] = row["A_Protein"]
self.taniguichi2010counts[idx]["gamma_scale_parameter"] = row["B_Protein"]
# Add a monomerId to each row
if row["Gene_Name"] in self.geneSymbolToMonomerId:
self.taniguichi2010counts[idx]["monomerId"] = (
self.geneSymbolToMonomerId[row["Gene_Name"]]
)
[docs]
def _loadHouser2015Counts(self, validation_data_raw):
# Load Houser Wilke PLoSCB 2015 dataset
houser_dataset = validation_data_raw.houser2015_javier_table
self.houser2015counts = np.zeros(
len(houser_dataset),
dtype=[
("monomerId", "U100"),
("gene_symbol", "U10"),
("counts_ave_exponential", np.float32),
("sample16_t3", np.float32),
("sample17_t4", np.float32),
("sample18_t5", np.float32),
("sample19_t6", np.float32),
("sample20_t8", np.float32),
("sample21_t24", np.float32),
("sample22_t48", np.float32),
("sample23_t168", np.float32),
("sample24_336", np.float32),
("sample25_t3", np.float32),
("sample26_t4", np.float32),
("sample27_t5", np.float32),
("sample28_t6", np.float32),
("sample29_t8", np.float32),
("sample30_t24", np.float32),
("sample31_t48", np.float32),
("sample32_t168", np.float32),
("sample34_336", np.float32),
("sample97_t3", np.float32),
("sample98_t4", np.float32),
("sample99_t5", np.float32),
("sample100_t6", np.float32),
("sample101_t8", np.float32),
("sample102_t24", np.float32),
("sample103_t48", np.float32),
("sample104_t168", np.float32),
("sample105_336", np.float32),
],
)
for idx, row in enumerate(houser_dataset):
if row["gene_symbol"] in self.geneSymbolToMonomerId:
self.houser2015counts[idx]["monomerId"] = self.geneSymbolToMonomerId[
row["gene_symbol"]
]
self.houser2015counts[idx]["gene_symbol"] = row["gene_symbol"]
# Hour 6 is the most clearly exponential time point, this is in columns labeled as t6
exponential_average = np.mean(
[
float(row["sample19_t6"]),
float(row["sample28_t6"]),
float(row["sample100_t6"]),
]
)
# Only include proteins observed at least once in this time step
if exponential_average:
self.houser2015counts[idx]["counts_ave_exponential"] = (
exponential_average
)
# Otherwise, remove this row from the dataset
else:
np.delete(self.houser2015counts, idx, 0)
# Load the rest of the data as-is
for fieldName in row:
if fieldName == "gene_symbol" or fieldName == "":
continue
self.houser2015counts[idx][fieldName] = row[fieldName]
[docs]
def _loadWisniewski2014Counts(self, validation_data_raw, knowledge_base_raw):
dataset = validation_data_raw.wisniewski2014_supp2
rep1 = np.array([x["rep1"] for x in dataset])
rep2 = np.array([x["rep2"] for x in dataset])
rep3 = np.array([x["rep3"] for x in dataset])
avg = np.mean((rep1, rep2, rep3), axis=0)
geneIds = [x["EcoCycID"] for x in dataset]
monomer_ids = []
avg_counts_filtered = []
for i, (gene_id, avg_count) in enumerate(zip(geneIds, avg)):
if gene_id in self.geneIdToMonomerId:
monomer_ids.append(self.geneIdToMonomerId[gene_id])
avg_counts_filtered.append(avg_count)
nEntries = len(monomer_ids)
wisniewski2014Data = np.zeros(
nEntries,
dtype=[
("monomerId", "U50"),
("avgCounts", "f8"),
],
)
wisniewski2014Data["monomerId"] = monomer_ids
wisniewski2014Data["avgCounts"] = avg_counts_filtered
self.wisniewski2014Data = wisniewski2014Data
[docs]
def _loadSchmidt2015Counts(self, validation_data_raw):
dataset = validation_data_raw.schmidt2015_javier_table
geneIds = [x["EcoCycID"] for x in dataset]
monomerIds = [self.geneIdToMonomerId[x] for x in geneIds]
glucoseCounts = [x["Glucose"] for x in dataset]
lb_counts = [x["LB"] for x in dataset]
nEntries = len(geneIds)
schmidt2015Data = np.zeros(
nEntries,
dtype=[
("monomerId", "U50"),
("glucoseCounts", "f8"),
("LB_counts", "f8"),
],
)
schmidt2015Data["monomerId"] = monomerIds
schmidt2015Data["glucoseCounts"] = glucoseCounts
schmidt2015Data["LB_counts"] = lb_counts
self.schmidt2015Data = schmidt2015Data
[docs]
def _load_li(self, validation_data_raw):
monomers = []
rich_rates = []
minimal_rates = []
for line in validation_data_raw.li_protein_synthesis_rates_2014:
gene = line["Gene"]
if (symbol := self.geneSymbolToMonomerId.get(gene)) is not None:
monomers.append(symbol)
rich_rates.append(int(str(line["MOPS complete"]).strip("[]")))
minimal_rates.append(int(str(line["MOPS minimal"]).strip("[]")))
self.li_2014 = np.zeros(
len(monomers),
dtype=[
("monomer", f"U{max([len(monomer) for monomer in monomers])}"),
("rich_rate", "f8"),
("minimal_rate", "f8"),
],
)
self.li_2014["monomer"] = monomers
self.li_2014["rich_rate"] = rich_rates
self.li_2014["minimal_rate"] = minimal_rates
[docs]
class ReactionFlux(object):
"""ReactionFlux"""
def __init__(self, validation_data_raw, knowledge_base_raw):
self._loadToya2010Fluxes(validation_data_raw)
[docs]
def _loadToya2010Fluxes(self, validation_data_raw):
# Load Toya 2010 Biotech Prog central carbon metabolism C13 flux dataset
toya_dataset = validation_data_raw.toya_2010_central_carbon_fluxes
self.toya2010fluxes = np.zeros(
len(toya_dataset),
dtype=[
("reactionID", "U100"),
("reactionFlux", Unum),
("reactionFluxStdev", Unum),
],
)
for idx, row in enumerate(toya_dataset):
self.toya2010fluxes[idx]["reactionID"] = row["reactionID"]
self.toya2010fluxes[idx]["reactionFlux"] = row["flux"]
self.toya2010fluxes[idx]["reactionFluxStdev"] = row[
"flux standard deviation"
]
[docs]
class EssentialGenes(object):
"""EssentialGenes"""
def __init__(self, validation_data_raw):
self._load_essential_genes(validation_data_raw)
[docs]
def _load_essential_genes(self, validation_data_raw):
self.essential_genes = []
self.essential_cistrons = []
self.essential_proteins = []
for row in validation_data_raw.essential_genes:
self.essential_genes.append(row["FrameID"])
self.essential_cistrons.append(row["rnaID"])
self.essential_proteins.append(
row["proteinID"] + "[" + row["proteinLoc"] + "]"
)
[docs]
class GeneFunctions(object):
"""GeneFunctions"""
def __init__(self, validation_data_raw):
self._loadGeneFunctions(validation_data_raw)
[docs]
def _loadGeneFunctions(self, validation_data_raw):
self.geneFunctions = {}
for row in validation_data_raw.geneFunctions:
self.geneFunctions[row["FrameID"]] = row["Function"]
class MacromolecularGrowthRateModulation(object):
"""MacromolecularGrowthRateModulation"""
def __init__(self, validation_data_raw):
self._load_macromolecular_growth_rate_modulation(validation_data_raw)
def _load_macromolecular_growth_rate_modulation(self, validation_data_raw):
dataset = validation_data_raw.macromolecular_growth_rate_modulation
# List of data value names extracted from the raw flat file
values = list(dataset[0].keys())
for value in values:
# Store numerical value of raw data into a temporary ndarray
temp = np.zeros(len(dataset))
for idx, row in enumerate(dataset):
y = row[value]
temp[idx] = Unum.coerceToUnum(y).asNumber()
# Set each ndarray as the corresponding attribute, while tagging the data's original units onto the ndarray
setattr(self, value, Unum(Unum.coerceToUnum(dataset[0][value])._unit, temp))