Source code for reconstruction.ecoli.scripts.metabolite_concentrations.convert_to_flat

#! /usr/bin/env python

"""
Extracts concentrations for metabolites from various raw data files.

Data in lempp2019.tsv was converted to a tsv from the file downloaded from
Lempp et al. Systematic identification of metabolites controlling gene
expression in E. coli. 2019.
https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-019-12474-1/MediaObjects/41467_2019_12474_MOESM4_ESM.xlsx

Data in park2016.tsv was converted to a tsv from supplementary table 5
(manually removed some columns and rows) in the file downloaded from
https://static-content.springer.com/esm/art%3A10.1038%2Fnchembio.2077/MediaObjects/41589_2016_BFnchembio2077_MOESM583_ESM.pdf

Data in kochanowski2017*.tsv was converted to a tsv from EV table 6-1
(absolute) and EV table 6-2 (relative) (manually removed some rows)
from the file downloaded from
https://www.embopress.org/action/downloadSupplement?doi=10.15252%2Fmsb.20167402&file=msb167402-sup-0007-TableEV6.xlsx

Data in sander2019.tsv is from Sander et al. Allosteric Feedback Inhibition
Enables Robust Amino Acid Biosynthesis in E. coli by Enforcing Enzyme Overabundance.
2019 Table S7 (tab 'Amino Acids') from
https://ars.els-cdn.com/content/image/1-s2.0-S2405471218304794-mmc2.xlsx
"""

import io
import os
import sys
import time
from typing import Any, cast, IO
from urllib import request

import numpy as np
import numpy.typing as npt

from wholecell.io import tsv


# Directories
# All paths are resolved relative to the directory containing this script
FILE_LOCATION = os.path.realpath(os.path.dirname(__file__))
DATA_DIR = os.path.join(FILE_LOCATION, "data")
OUT_DIR = os.path.join(FILE_LOCATION, "out")
# exist_ok=True avoids the check-then-create race of testing for the
# directory first and fails loudly if the path exists but is not a directory
os.makedirs(OUT_DIR, exist_ok=True)

# Files
# Raw input data (one tsv per publication)
LEMPP_INPUT = os.path.join(DATA_DIR, "lempp2019.tsv")
PARK_INPUT = os.path.join(DATA_DIR, "park2016.tsv")
KOCHANOWSKI_ABSOLUTE_INPUT = os.path.join(DATA_DIR, "kochanowski2017absolute.tsv")
KOCHANOWSKI_RELATIVE_INPUT = os.path.join(DATA_DIR, "kochanowski2017relative.tsv")
SANDER_INPUT = os.path.join(DATA_DIR, "sander2019.tsv")
# Output files; the absolute output is a template filled in with the
# lowercased dataset label
ABSOLUTE_OUTPUT_FILE = os.path.join(OUT_DIR, "{}_concentrations.tsv")
RELATIVE_OUTPUT_FILE = os.path.join(OUT_DIR, "relative_metabolite_concentrations.tsv")

# Correct EcoCyc IDs to match the whole-cell model ID
# Keys are EcoCyc IDs as returned by the lookup in kegg_to_ecocyc(); values
# are the IDs used in their place. IDs not listed here are kept unchanged.
ECOCYC_SUBSTITUTIONS = {
    "D-GLUCOSAMINE-6-P": "CPD-13469",
    "D-glucopyranose-6-phosphate": "GLC-6-P",
    "Isocitrate": "THREO-DS-ISO-CITRATE",
    "CPD-18719": "FRUCTOSE-6P",
}

# Kochanowski mappings
## Special media IDs used later (implemented in WCM)
## GLC_MEDIA selects the column used for absolute concentrations and all
## three are written first in the relative concentration output
GLC_MEDIA = "minimal"
ACETATE_MEDIA = "minimal_acetate"
SUCCINATE_MEDIA = "minimal_succinate"
## Map metabolite column to wcm IDs (not all could be matched)
## Keys are compared exactly against the first column of each data row;
## rows without an entry here are skipped when loading. Commented-out keys
## are metabolites that presumably have no matching wcm ID.
KOCHANOWSKI_METABOLITES = {
    "Glycerol-P": "GLYCEROL-3P",
    "F1P": "FRU1P",
    # 'Ga6P': '',
    "G6P": "GLC-6-P",
    "F6P": "FRUCTOSE-6P",
    "FBP": "FRUCTOSE-16-DIPHOSPHATE",
    "DHAP": "DIHYDROXY-ACETONE-PHOSPHATE",
    "BPG": "DPG",
    # 'xPG': '',
    "PEP": "PHOSPHO-ENOL-PYRUVATE",
    "6PG": "CPD-2961",
    "Ru5P": "RIBULOSE-5P",
    "R5P": "RIBOSE-5P",
    "Xu5P": "XYLULOSE-5-PHOSPHATE",
    "R1P": "RIBOSE-1P",
    "S7P": "D-SEDOHEPTULOSE-7-P",
    "Lactate": "Lactate",
    "Acetyl-CoA": "ACETYL-COA",
    "Citrate/Isocitrate": "THREO-DS-ISO-CITRATE",
    "Aconitate": "CIS-ACONITATE",
    "Alpha ketoglutarate": "2-KETOGLUTARATE",
    "Succinate": "SUC",
    "Fumarate": "FUM",
    "Malate": "MAL",
    "AMP": "AMP",
    "ADP": "ADP",
    "ATP": "ATP",
    "GMP": "GMP",
    "GDP": "GDP",
    "GTP": "GTP",
    "IMP": "IMP",
    "NAD": "NAD",
    "NADH": "NADH",
    "NADP": "NADP",
    "NADPH": "NADPH",
    "GTTred": "GLUTATHIONE",
    "GTTox": "OXIDIZED-GLUTATHIONE",
    "Asparagine": "ASN",
    "Aspartate": "L-ASPARTATE",
    "Arginine": "ARG",
    "Glutamine": "GLN",
    "Glutamate": "GLT",
    "Phenylalanine": "PHE",
    "Tyrosine": "TYR",
    "Panthothenate": "PANTOTHENATE",
    # 'UDP-hexose': '',
    "cAMP": "CAMP",
}
# Sander mappings
## Map amino acid names to wcm IDs
## Keys are compared exactly against the "Name" column of the Sander data;
## rows without an entry here are skipped when loading. The commented-out
## key presumably has no single matching wcm ID.
SANDER_METABOLITES = {
    "Glutamic acid": "GLT",
    "Glutamine": "GLN",
    "Arginine": "ARG",
    "Proline": "PRO",
    "Aspartic acid": "L-ASPARTATE",
    "Asparagine": "ASN",
    "Lysine": "LYS",
    "Methionine": "MET",
    "Threonine": "THR",
    # '(Iso-)Leucine': '',
    "Valine": "VAL",
    "Alanine": "L-ALPHA-ALANINE",
    "Serine": "SER",
    "Glycine": "GLY",
    "Histidine": "HIS",
    "Phenylalanine": "PHE",
    "Tryptophan": "TRP",
    "Tyrosine": "TYR",
}
## Map media headers to wcm IDs (not all are currently valid media)
## Keys are compared exactly against the header cells of the Kochanowski
## files, so the leading space in each key is significant. Columns whose
## header is not listed here (including the commented-out chloramphenicol
## conditions) are dropped when loading.
KOCHANOWSKI_MEDIA = {
    " M9 galactose 2g per L": "minimal_galactose",
    " M9 acetate 3.6g per L": ACETATE_MEDIA,
    " M9 mannose 2g per L": "minimal_mannose",
    " M9 pyruvate 5g per L": "minimal_pyruvate",
    " M9 lactate 5g per L": "minimal_lactate",
    " M9 glycerol 2g per L": "minimal_glycerol",
    " M9 sorbitol 2g per L": "minimal_sorbitol",
    " M9 fructose 2g per L": "minimal_fructose",
    " M9 succinate 2g per L": SUCCINATE_MEDIA,
    " M9 glcNAc 2g per L": "minimal_glcNAc",
    " M9 mannitol 2g per L": "minimal_mannitol",
    " M9 gluconate 2g per L": "minimal_gluconate",
    " M9 glucose 2g per L first experiment": GLC_MEDIA,  # methods say 5 g/L
    " M9 G6P 2g per L": "minimal_g6p",
    " M9 glucose 2g per L plus CAA 2g per L": "minimal_plus_cas_amino_acids",
    # ' M9 glucose 0 mueM Chloramphenicol': '',
    # ' M9 glucose 0.75 mueM Chloramphenicol': '',
    # ' M9 glucose 1 mueM Chloramphenicol': '',
    # ' M9 glucose 2 mueM Chloramphenicol': '',
    # ' M9 glucose 4 mueM Chloramphenicol': '',
    # ' M9 glucose 6 mueM Chloramphenicol': '',
    # ' M9 glucose 8 mueM Chloramphenicol': '',
    # ' M9 glucose 10 mueM Chloramphenicol': '',
}



[docs]
def lempp_concentrations() -> dict[str, float]:
    """
    Read the Lempp dataset and average each metabolite's "t0" measurements.

    The averaged columns begin at the header cell for intracellular
    concentrations and their number is the count of third-row labels that
    start with "t0". Rows whose values in those columns cannot all be parsed
    as floats are skipped. KEGG IDs are translated with kegg_to_ecocyc().

    Returns:
        met_conc: EcoCyc molecule ID to concentration (in M)
    """

    kegg_conc = {}

    with io.open(LEMPP_INPUT, "rb") as f:
        rows = tsv.reader(f)

        # Row 1: find where the concentration columns begin
        first_col = next(rows).index("intracellular concentrations (\xb5M)")
        # Row 2: not needed
        next(rows)
        # Row 3: count the time point labels that start with "t0"
        timepoint_labels = next(rows)[first_col:]
        n_t0 = np.sum([label.startswith("t0") for label in timepoint_labels])
        last_col = first_col + n_t0
        # Row 4: find the column holding the KEGG IDs
        kegg_col = next(rows).index("KEGG")

        for row in rows:
            kegg_id = row[kegg_col]
            try:
                mean_conc = np.array(row[first_col:last_col], float).mean()
            except ValueError:
                # Concentration data does not exist ('-')
                continue

            # Source values are in uM, stored values are in M
            kegg_conc[kegg_id] = mean_conc / 1e6

    return kegg_to_ecocyc(kegg_conc)




[docs]
def park_concentrations() -> dict[str, float]:
    """
    Read the Park dataset of reported metabolite concentrations.

    Values are taken from the "E. coli" column and used as given in the
    file, without any unit conversion. Rows whose value cannot be parsed as
    a float are skipped. KEGG IDs are translated with kegg_to_ecocyc().

    Returns:
        met_conc: EcoCyc molecule ID to concentration (in M)
    """

    kegg_conc = {}

    with io.open(PARK_INPUT, "rb") as f:
        rows = tsv.reader(f)

        # The column names are on the second row
        next(rows)
        header = next(rows)
        kegg_col = header.index("KEGG ID")
        ecoli_col = header.index("E. coli")

        for row in rows:
            kegg_id = row[kegg_col]
            try:
                value = float(row[ecoli_col])
            except ValueError:
                # Concentration data does not exist ('-')
                continue
            kegg_conc[kegg_id] = value

    return kegg_to_ecocyc(kegg_conc)




[docs]
def load_kochanowski(
    filename: str,
) -> tuple[dict[str, npt.NDArray[np.float64]], npt.NDArray[np.str_]]:
    """
    Read a Kochanowski data file (absolute or relative values).

    Only columns whose header is a key of KOCHANOWSKI_MEDIA and only rows
    whose first cell is a key of KOCHANOWSKI_METABOLITES are kept.

    Args:
        filename: path to file to load

    Returns:
        met_conc: WCM ID to an array of values (absolute or relative), one
            per kept condition, with non-positive values replaced by NaN
        condition_headers: WCM media ID of each kept condition, in the same
            order as the values in met_conc
    """

    values_by_met = {}

    with io.open(filename, "rb") as f:
        rows = tsv.reader(f)

        next(rows)  # discard line
        # The first column holds metabolite names, the rest are conditions
        media_labels = next(rows)[1:]
        keep = np.array([label in KOCHANOWSKI_MEDIA for label in media_labels])
        wcm_media = np.array([KOCHANOWSKI_MEDIA.get(label) for label in media_labels])
        condition_headers = wcm_media[keep]

        for row in rows:
            wcm_id = KOCHANOWSKI_METABOLITES.get(row[0])
            if wcm_id is not None:
                values = np.array(row[1:], float)[keep]
                # Non-positive entries are treated as missing data
                values[values <= 0] = np.nan
                values_by_met[wcm_id] = values

    return values_by_met, condition_headers




[docs]
def kochanowski_concentrations() -> dict[str, float]:
    """
    Get the absolute Kochanowski concentrations for the glucose condition.

    Metabolites without a finite value in the glucose column are left out.

    Returns:
        met_conc: WCM ID to concentration (in M)
    """

    per_condition, media = load_kochanowski(KOCHANOWSKI_ABSOLUTE_INPUT)
    # Position of the first column mapped to the glucose media ID
    glucose_idx = np.where(media == GLC_MEDIA)[0][0]

    glucose_values = (
        (met, values[glucose_idx]) for met, values in per_condition.items()
    )
    return {
        met: value / 1000  # convert mM to M
        for met, value in glucose_values
        if np.isfinite(value)
    }




[docs]
def sander_concentrations() -> dict[str, float]:
    """
    Read the Sander dataset of amino acid concentrations.

    Values are taken from the "WT" column. Rows whose "Name" is not a key of
    SANDER_METABOLITES are skipped.

    Returns:
        met_conc: WCM ID to concentration (in M)
    """

    amino_acid_conc = {}

    with io.open(SANDER_INPUT, "rb") as f:
        rows = tsv.reader(f)

        # The column names are on the third row
        for _ in range(2):
            next(rows)
        header = next(rows)
        name_col = header.index("Name")
        wt_col = header.index("WT")

        for row in rows:
            wcm_id = SANDER_METABOLITES.get(row[name_col])
            if wcm_id is not None:
                # convert mM to M
                amino_acid_conc[wcm_id] = float(row[wt_col]) / 1000

    return amino_acid_conc




[docs]
def kegg_to_ecocyc(data: dict[str, Any]) -> dict[str, Any]:
    """
    Convert a dictionary with KEGG ID keys to a dictionary with EcoCyc ID keys.

    All IDs are looked up with a single request to the BioCyc foreign ID
    web service. Entries whose KEGG ID is not reported as matched are
    dropped, and matched IDs are passed through ECOCYC_SUBSTITUTIONS.

    Args:
        data: dictionary to convert KEGG molecule IDs to EcoCyc IDs

    Returns:
        new_data: new dictionary with EcoCyc IDs (empty if data is empty)

    Raises:
        urllib.error.URLError: if the web service cannot be reached
    """

    # Nothing to translate so skip the web request entirely
    if not data:
        return {}

    id_type = "Kegg:"
    url = "https://websvc.biocyc.org/ECOLI/foreignid?ids="
    ids = ",".join("{}{}".format(id_type, kegg_id) for kegg_id in data)

    mapping = {}
    # The context manager closes the HTTP response even if parsing fails
    with request.urlopen(url + ids) as response:
        reader = tsv.reader(cast(IO[bytes], response))  # type "addinfourl"?

        for line in reader:
            # Skip blank or truncated lines instead of raising IndexError
            if len(line) < 3:
                continue

            # Second field is "1" for IDs that are used; presumably this
            # marks a successful match - confirm against the service docs
            if line[1] == "1":
                mol_id = line[0].split(id_type)[1]
                ecocyc_id = line[2]
                mapping[mol_id] = ECOCYC_SUBSTITUTIONS.get(ecocyc_id, ecocyc_id)

    return {mapping[m]: c for m, c in data.items() if m in mapping}




[docs]
def save_concentrations(conc: dict[str, float], label: str):
    """
    Write concentrations to a tsv file, one metabolite per row.

    The file name is ABSOLUTE_OUTPUT_FILE filled in with the lowercased
    label. Rows are sorted by metabolite ID and values are written in
    scientific notation with two decimal places.

    Args:
        conc: ID to concentration (in M)
        label: column and filename dataset label (author)
    """

    path = ABSOLUTE_OUTPUT_FILE.format(label.lower())

    with io.open(path, "wb") as f:
        writer = tsv.writer(f)

        header = ["Metabolite", "{} Concentration (units.mol/units.L)".format(label)]
        writer.writerow(header)

        for met in sorted(conc):
            formatted = "{:.2e}".format(conc[met])
            # Shorten single-digit exponents, e.g. 1.00e-03 -> 1.00e-3
            formatted = formatted.replace("e+0", "e").replace("e-0", "e-")
            writer.writerow([met, formatted])




[docs]
def save_kochanowski_relative_changes():
    """
    Write the relative Kochanowski data with wcm IDs for all kept conditions.

    The output starts with a comment row recording the command line and
    time, followed by a header row and one row per metabolite sorted by ID.
    Values that are not finite are written as "NaN".
    """

    met_conc, headers = load_kochanowski(KOCHANOWSKI_RELATIVE_INPUT)

    # Glucose, acetate and succinate come first, then the rest in file order
    priority = [GLC_MEDIA, ACETATE_MEDIA, SUCCINATE_MEDIA]
    remaining = [h for h in headers if h not in priority]
    ordered_media = priority + remaining
    column_order = np.array(
        [np.where(headers == media)[0][0] for media in ordered_media]
    )

    with io.open(RELATIVE_OUTPUT_FILE, "wb") as f:
        writer = tsv.writer(f, quotechar="'", lineterminator="\n")

        provenance = "# Created with {} on {}".format(" ".join(sys.argv), time.ctime())
        writer.writerow([provenance])
        writer.writerow(["Metabolite"] + ordered_media)

        for met in sorted(met_conc):
            row = ['"{}"'.format(met)]
            for change in met_conc[met][column_order]:
                row.append(change if np.isfinite(change) else "NaN")
            writer.writerow(row)



if __name__ == "__main__":
    # Each dataset is loaded and written out before the next one is read

    # Lempp 2019 (needs the KEGG to EcoCyc web lookup)
    save_concentrations(lempp_concentrations(), "Lempp")

    # Park 2016 (needs the KEGG to EcoCyc web lookup)
    save_concentrations(park_concentrations(), "Park")

    # Kochanowski 2017: absolute values in glucose, then relative values
    save_concentrations(kochanowski_concentrations(), "Kochanowski")
    save_kochanowski_relative_changes()

    # Sander 2019
    save_concentrations(sander_concentrations(), "Sander")