Source code for reconstruction.ecoli.scripts.metabolite_concentrations.merge_files

#! /usr/bin/env python

"""
Merge two or more concentration flat files into a single file with one row for
each metabolite and "NaN" entries for missing concentration data.

Usage with paths to tsv files to merge:
        ./merge_files [TSV1 TSV2 ...]
"""

import io
import os
import sys
import time


from wholecell.io import tsv


FILE_LOCATION = os.path.realpath(os.path.dirname(__file__))
OUTPUT_FILE = os.path.join(FILE_LOCATION, "metabolite_concentrations.tsv")



[docs]
def load_conc(filename: str) -> tuple[str, dict[str, str]]:
    """
    Load concentration data from a tsv file.  First column should be metabolite
    ID and second column should be concentration.  Does not handle more than
    one concentration column at this point.

    Blank rows and leading rows whose first field starts with "#" are skipped
    when looking for the header row.  In the data rows, any leading or trailing
    "#", space and double-quote characters are stripped from the metabolite ID.
    Rows with a missing or empty concentration field are ignored, and if a
    metabolite ID appears more than once the last non-empty value wins.

    Args:
        filename: path to concentration tsv file

    Returns:
        label: header describing the concentration data
        conc: metabolite ID to concentration

    Raises:
        ValueError: if the file has no header row with a second
            (concentration) column
    """

    conc: dict[str, str] = {}
    with io.open(filename, "rb") as f:
        reader = tsv.reader(f)

        # Advance past blank rows and "#" comment rows to the header row.
        # The default avoids leaking a bare StopIteration from an empty or
        # comment-only file.
        headers = next(
            (row for row in reader if row and not row[0].startswith("#")),
            None,
        )
        if headers is None or len(headers) < 2:
            raise ValueError(
                "No header row with a concentration column found in {}".format(
                    filename
                )
            )
        label = headers[1]

        # The reader continues from the row right after the header.
        for line in reader:
            # A blank row or a row without a second column carries no
            # concentration, just like a row with an empty second field.
            if len(line) < 2:
                continue
            mol_id = line[0].strip('# "')
            mol_conc = line[1]
            if mol_conc:
                conc[mol_id] = mol_conc

    return label, conc




[docs]
def save_conc(conc: list[tuple[str, dict[str, str]]]):
    """
    Write the merged concentration table to OUTPUT_FILE.

    The file starts with a "#" comment row recording the command line and the
    current time, followed by a header row and then one row per metabolite,
    sorted by metabolite ID.  A metabolite that is absent from one of the data
    sets gets the literal entry "NaN" in that data set's column.

    Args:
            conc: entries with header description and metabolite ID to concentration mapping
    """

    # Union of the metabolite IDs found in any of the data sets.
    all_ids = set()
    for entry in conc:
        all_ids.update(entry[1])

    # Double quotes are added by hand; the writer below uses "'" as its
    # quotechar.
    header_row = ['"Metabolite"']
    for entry in conc:
        header_row.append('"{}"'.format(entry[0]))

    with io.open(OUTPUT_FILE, "wb") as f:
        writer = tsv.writer(f, quotechar="'", lineterminator="\n")

        provenance = "# Created with {} on {}".format(" ".join(sys.argv), time.ctime())
        writer.writerow([provenance])
        writer.writerow(header_row)

        for met_id in sorted(all_ids):
            row = ['"{}"'.format(met_id)]
            for entry in conc:
                row.append(entry[1].get(met_id, "NaN"))
            writer.writerow(row)



if __name__ == "__main__":
    # Everything after the script name is an input file; at least two are
    # needed for a merge.
    input_paths = sys.argv[1:]
    if len(input_paths) < 2:
        usage = "Expecting two or more files to merge. {} [TSV1 TSV2 ...]".format(
            sys.argv[0]
        )
        raise ValueError(usage)

    # Load every file in command-line order, then write the merged table.
    conc_ = list(map(load_conc, input_paths))
    save_conc(conc_)