Source code for ecoli.analysis.antibiotics_colony.validation

import os
from typing import Any, cast, List, Optional

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.optimize import curve_fit, root_scalar

from ecoli.analysis.antibiotics_colony import (
    DE_GENES,
    MAX_TIME,
    SPLIT_TIME,
    restrict_data,
    AVG_GEN_TIME,
)



[docs]
def plot_colony_growth(
    data: pd.DataFrame,
    ax: Optional[plt.Axes] = None,
    axs: Optional[List[plt.Axes]] = None,
    antibiotic: str = "Tet.",
    antibiotic_col: str = "Initial external tet.",
    mic: float = 3.375,
) -> None:
    """Plot traces of total colony mass and total colony growth rate. Can use
    methodology described in Lambert and Pearson 2000 to empirically calculate
    MIC from simulated data if a list of two axes is provided using `axs`

    Args:
        data: DataFrame where each row is an agent and each column is a variable
            of interest. Must have these columns: 'Time', 'Condition', and
            'Dry Mass'. The first experimental condition in the 'Condition'
            column is treated as a control (default gray). Include at most
            2 conditions and 1 seed per condition.
        ax: List of 2 Matplotlib Axes, one for colony mass and other for MIC
        axs: List of 2 Matplotlib Axes, one for colony mass and other for MIC
        antibiotic: Name of antibiotic to put as legend title
        antibiotic_col: Name of column with initial external antibiotic concs.
        mic: Minimum inhibitory concentration (uM, rounded to 3 decimal places)
    """
    data["antibiotic_conc"] = np.round(data.loc[:, antibiotic_col] * 1000, 3)
    columns_to_include = ["Dry mass", "Time", "antibiotic_conc", "Condition"]
    data = data.loc[:, columns_to_include]
    mask = (
        (data.loc[:, "antibiotic_conc"] == 0)
        & (data.loc[:, "Time"] > SPLIT_TIME)
        & (data.loc[:, "Time"] <= MAX_TIME)
    )
    remaining_glucose_data = data.loc[mask, :]
    data = restrict_data(data)
    data = pd.concat([data, remaining_glucose_data])
    data = data.groupby(["antibiotic_conc", "Time"]).sum().reset_index()

    # Convert time to hours
    data.loc[:, "Time"] = data.loc[:, "Time"] / 3600

    cmap = matplotlib.colormaps["Greys"]
    antibiotic_min = data.loc[:, "antibiotic_conc"].min()
    antibiotic_max = data.loc[:, "antibiotic_conc"].max()
    norm = matplotlib.colors.Normalize(
        vmin=1.5 * antibiotic_min - 0.5 * antibiotic_max, vmax=antibiotic_max
    )
    antibiotic_concs = data.loc[:, "antibiotic_conc"].unique()
    palette = {
        antibiotic_conc: cmap(norm(antibiotic_conc))
        for antibiotic_conc in antibiotic_concs
    }
    palette[mic] = (0, 0.4, 1)

    if axs:
        ax = axs[0]
    ax = cast(plt.Axes, ax)
    sns.lineplot(
        data=data,
        x="Time",
        y="Dry mass",
        hue="antibiotic_conc",
        ax=ax,
        palette=palette,
        errorbar=None,
    )
    # Set y-limits so that major ticks surround data
    log_ylim = 10 ** np.round(
        np.log10([data.loc[:, "Dry mass"].min(), data.loc[:, "Dry mass"].max()]), 0
    )
    new_ymin = min(log_ylim[0], data.loc[:, "Dry mass"].min())
    new_ymax = max(log_ylim[1], data.loc[:, "Dry mass"].max())
    ax.set_ylim(new_ymin, new_ymax)
    max_x = np.round(data.loc[:, "Time"].max(), 1)
    ticks = list(np.arange(0, max_x, 2, dtype=int)) + [max_x]
    labels = [str(tick) for tick in ticks]
    ax.set_xticks(ticks=ticks, labels=labels)
    # Log scale so linear means exponential growth
    ax.set(yscale="log")
    ax.text(0, 1, f"{antibiotic} (\u03bcM)", transform=ax.transAxes, size=8)
    ypos = [0.9, 0.8, 0.7, 0.6, 0.5, 0.4]
    concs = list(palette.keys())
    colors = list(palette.values())
    for i, y in enumerate(ypos):
        ax.text(0, y, concs[i], c=colors[i], transform=ax.transAxes, size=8)
    ax.legend().remove()
    sns.despine(ax=ax, offset=3, trim=True)

    if axs:
        # Calculate MIC according to method from Lambert and Pearson 2000
        background_auc = (
            data.loc[data.loc[:, "Time"] == SPLIT_TIME / 3600, "Dry mass"].iloc[0]
            * SPLIT_TIME
            / 3600
        )
        post_tet_data = data.loc[data.loc[:, "Time"] >= SPLIT_TIME / 3600, :]
        treatment_groups = post_tet_data.groupby("antibiotic_conc")
        auc_dict = {}
        for antibiotic_conc, treatment_data in treatment_groups:
            if antibiotic_conc != 0:
                auc_dict[antibiotic_conc] = max(
                    np.trapezoid(treatment_data["Dry mass"], x=treatment_data["Time"])  # type: ignore[attr-defined]
                    - background_auc,
                    0,
                )
            else:
                baseline_auc = (
                    np.trapezoid(treatment_data["Dry mass"], x=treatment_data["Time"])  # type: ignore[attr-defined]
                    - background_auc
                )
        antibiotic_concs = np.log10(np.array(list(auc_dict.keys())))
        areas_under_curve = np.array(list(auc_dict.values())) / baseline_auc

        def altered_gomperts(x, b, m):
            return np.exp(-np.exp(b * (x - m)))

        popt, _ = curve_fit(altered_gomperts, antibiotic_concs, areas_under_curve)

        ax_2 = axs[1]
        ax_2.scatter(10**antibiotic_concs, areas_under_curve, c="k")
        fit_x = np.linspace(
            antibiotic_concs.min() - 1, antibiotic_concs.max() + 1, 1000000
        )
        ax_2.plot(
            10**fit_x,
            np.apply_along_axis(lambda x: altered_gomperts(x, *popt), 0, fit_x),
            "k",
        )
        ax_2.set(xscale="log")
        ax_2.set_xlabel(f"{antibiotic} (\u03bcM)", fontsize=8)
        ax_2.set_ylabel("Fractional AUC", fontsize=8)
        # Set x-limits so that major ticks surround data
        ax_2.set_xlim(0.1, 100)

        def inflection_tangent(x, b, m):
            return -b * np.exp(-1) * (x - (m + 1 / b))

        log10_mic = popt[-1] + 1 / popt[-2]
        inflection_x = np.linspace(antibiotic_concs.min() - 1, log10_mic, 1000000)
        ax_2.plot(
            10**inflection_x,
            np.apply_along_axis(
                lambda x: inflection_tangent(x, *popt), 0, inflection_x
            ),
            "k--",
            linewidth=1,
        )
        ax_2.scatter([10**log10_mic], [0], c=(0, 0.4, 1), facecolors="none")
        ax_2.set_ylim(-0.01, 1.01)
        ax_2.set_xticks(
            np.append(cast(np.ndarray, ax_2.get_xticks()), 10**log10_mic),
            np.append(
                cast(np.ndarray, ax_2.get_xticklabels()),
                [f"MIC:\n{np.round(10**log10_mic, 1)}"],
            ),
        )
        for ticklabel in ax_2.get_xticklabels():
            if ticklabel.get_text() == f"MIC:\n{np.round(10**log10_mic, 1)}":
                ticklabel.set_color((0, 0.4, 1))
        print(f"{antibiotic} MIC: {str(10 ** (popt[-1] + 1 / popt[-2]))}")
        sns.despine(ax=ax_2, offset=3, trim=True)




[docs]
def plot_mrna_fc(
    data: pd.DataFrame,
    ax: plt.Axes,
    genes_to_plot: list[str],
    highlight_genes: Optional[dict[str, tuple]] = None,
) -> None:
    """Plot scatter plot of simulated and experimental log2 fold change for
    final mRNA concentrations of key genes regulated during tetracycline exposure.

    Args:
        data: DataFrame where each row is an agent and each column is a variable
            of interest. Must have these columns: 'Time', 'Condition', and 'Dry Mass'.
            The first experimental condition in the 'Condition' column is treated as
            a control.
        ax: Single instance of Matplotlib Axes to plot on.
        genes_to_plot: List of gene names to include in plot.
        highlight_genes: Mapping of gene names to point colors.
    """
    data = data.loc[data.loc[:, "Time"] <= MAX_TIME, :]
    data.loc[:, "ompF mRNA"] += data.loc[:, "micF-ompF duplex"]
    mrna_cols = [f"{gene} mRNA" for gene in genes_to_plot if gene not in ["MicF"]] + [
        "GAPDH mRNA"
    ]
    mrna_data = data.loc[:, mrna_cols + ["Condition", "Volume"]]
    conditions = mrna_data.loc[:, "Condition"].unique()
    # Get mRNA concentrations aggregated over entire final colonies
    mrna_data = mrna_data.set_index(["Condition"])
    mrna_data = (
        mrna_data.divide(mrna_data.loc[:, "Volume"], axis=0)
        .drop(["Volume"], axis=1)
        .reset_index()
    )
    mrna_data = mrna_data.groupby(["Condition"]).mean()
    # Normalize by housekeeping gene expression
    mrna_data = mrna_data.divide(mrna_data.loc[:, "GAPDH mRNA"], axis=0)
    # Convert to relative amounts assuming that first condition is control
    relative_data = pd.DataFrame(
        np.log2(mrna_data.loc[conditions[1]] / mrna_data.loc[conditions[0]])
    )
    relative_data.index = [
        mrna_name.split(" mRNA")[0] for mrna_name in relative_data.index
    ]
    relative_data = relative_data.rename(
        columns={0: "Sim. RNA $\\mathregular{log_2 FC}$"}
    )
    # Compare against literature relative amounts
    tet_degenes = (
        DE_GENES.loc[:, ["Gene name", "Fold change"]]
        .set_index("Gene name")
        .drop(["MicF"], axis=0)
        .rename(columns={"Fold change": "Literature RNA $\\mathregular{log_2 FC}$"})
    )
    tet_degenes.loc[:, "Literature RNA $\\mathregular{log_2 FC}$"] = np.log2(
        tet_degenes.loc[:, "Literature RNA $\\mathregular{log_2 FC}$"]
    )
    relative_data = relative_data.join(tet_degenes, how="inner").reset_index()
    relative_data = relative_data.rename(columns={"index": "Gene"})
    palette: dict[str, Any] = {gene: "k" for gene in relative_data["Gene"].unique()}
    if highlight_genes:
        palette = {**palette, **highlight_genes}
    sns.scatterplot(
        data=relative_data,
        x="Literature RNA $\\mathregular{log_2 FC}$",
        y="Sim. RNA $\\mathregular{log_2 FC}$",
        ax=ax,
        hue="Gene",
        legend=False,
        palette=palette,
    )
    min_fc = relative_data.loc[:, "Literature RNA $\\mathregular{log_2 FC}$"].min()
    max_fc = relative_data.loc[:, "Literature RNA $\\mathregular{log_2 FC}$"].max()
    validation_line = np.linspace(min_fc, max_fc, 2)
    ax.plot(validation_line, validation_line, "--", c="0.4")
    if highlight_genes:
        # Start with label above point, then alternate
        direction = -1
        for gene, color in highlight_genes.items():
            location = relative_data.loc[relative_data["Gene"] == gene, :]
            xy = (
                location["Literature RNA $\\mathregular{log_2 FC}$"],
                location["Sim. RNA $\\mathregular{log_2 FC}$"],
            )
            ax.annotate(
                gene,
                xy=xy,
                xytext=(xy[0], xy[1] + direction * 0.5),
                ha="center",
                va="center",
                c=color,
                arrowprops={"arrowstyle": "-", "color": color},
                fontstyle="italic",
            )
            direction *= -1
    sns.despine(ax=ax, offset=3, trim=True)




[docs]
def plot_protein_synth_inhib(
    data: pd.DataFrame,
    ax: plt.Axes,
    literature: pd.DataFrame,
):
    """Plot scatter plot of normalized % protein synthesis inhibition across a
    variety of tetracycline concentrations.

    Args:
        data: DataFrame where each row is an agent and each column is a variable
            of interest. Must have these columns: 'Time', 'Condition', 'Seed',
            and 'Dry Mass'.
        ax: Single instance of Matplotlib Axes to plot on.
        literature: DataFrames with 3 columns: 'Tetracycline', 'Percent inhibition',
            and 'Source'
    """
    # Assume tetracycline is added at t = 0 (reached equilibrium by 400 s)
    sampled_time = data.loc[
        (data.loc[:, "Time"] > 400) & (data.loc[:, "Time"] < 500), :
    ]
    sampled_time["Total ribosomes"] = (
        sampled_time.loc[:, "Active ribosomes"]
        + sampled_time.loc[:, "Inactive 30S subunit"]
    )
    sampled_time.loc[:, "Cytoplasmic tetracycline"] *= 1000
    sampled_time = sampled_time.sort_values("Time")

    grouped_agents = sampled_time.groupby(["Condition", "Seed", "Agent ID"])
    normed_data: dict[str, Any] = {
        "Condition": [],
        "Normed delta": [],
        "Tetracycline": [],
    }
    for (condition, seed, agent_id), agent_data in grouped_agents:
        protein_deltas = np.diff(agent_data.loc[:, "Protein mass"])
        # Ignore final timestep (protein mass deltas require two timepoints)
        agent_data = agent_data.iloc[:-1, :]
        # Normalize by ribosome and RNA counts to match cell-free system
        normed_deltas = (
            protein_deltas
            / agent_data.loc[:, "Total ribosomes"]
            / agent_data.loc[:, "Total mRNA"]
        )
        normed_data["Condition"].append(condition)
        normed_data["Normed delta"].append(normed_deltas.mean())
        normed_data["Tetracycline"].append(
            agent_data.loc[:, "Cytoplasmic tetracycline"].mean()
        )
    normed_data = pd.DataFrame(normed_data)
    normed_data = normed_data.groupby(["Condition"]).mean()
    control = normed_data.index[normed_data["Tetracycline"] == 0]
    normed_data["Percent inhibition"] = (
        100
        - (
            normed_data.loc[:, "Normed delta"]
            / normed_data.loc[control, "Normed delta"].to_numpy()
        )
        * 100
    )
    normed_data["Source"] = ["This model"] * len(normed_data)
    normed_data = normed_data.loc[:, ["Percent inhibition", "Tetracycline", "Source"]]
    normed_data = normed_data.loc[normed_data.loc[:, "Tetracycline"] > 0, :]
    palette: dict[str, Any] = {"This model": (0, 0.4, 1)}
    if literature is not None:
        gray = 0.3
        for source in literature.loc[:, "Source"].unique():
            palette[source] = str(gray)
            gray += 0.3
        normed_data = pd.concat([normed_data, literature])

    # 3-parameter sigmoid function used to fit inhibition curves
    def func(x, a, b, c):
        return a * np.power(x, b) / (np.power(c, b) + np.power(x, b))

    ic_50 = {}
    normed_data = normed_data.sort_values("Source")
    for i, source in enumerate(normed_data.loc[:, "Source"].unique()):
        source_data = normed_data.loc[normed_data.loc[:, "Source"] == source, :]
        fittedParameters, pcov = curve_fit(
            func, source_data["Tetracycline"], source_data["Percent inhibition"]
        )
        x = np.linspace(
            source_data["Tetracycline"].min(), source_data["Tetracycline"].max(), 10000
        )
        pred = func(x, *(fittedParameters))
        ax.plot(x, pred, c=palette[source], linewidth=1)
        sol = root_scalar(lambda x: func(x, *fittedParameters) - 0.5, x0=10, x1=5)
        ic_50[source] = str(np.round(sol.root, 1)) + " \u03bcM"

    new_palette = {}
    sources_new = normed_data.loc[:, "Source"].unique()
    for source in sources_new:
        for orig_source in palette:
            if orig_source in source:
                new_palette[source] = palette[orig_source]
    palette = new_palette
    markers = {source: "o" for source in sources_new}
    markers["This model"] = "^"

    sns.scatterplot(
        data=normed_data,
        x="Tetracycline",
        y="Percent inhibition",
        hue="Source",
        ax=ax,
        palette=palette,
        style="Source",
        markers=markers,
        legend=False,
        s=16,
    )
    ax.set_xscale("log")
    ax.set_xticks([0.1, 100])
    sns.despine(offset=3, trim=True)
    xticks = np.array([0.1, 1, 10, 100])
    xticklabels = [f"$10^{{{int(exp)}}}$" for exp in np.log10(xticks)]
    ax.set_xticks(xticks, xticklabels, size=8)
    ax.text(
        0,
        0.9,
        "This model",
        c=palette["This model"],
        size=8,
        transform=ax.transAxes,
        weight="bold",
    )
    for i, source in enumerate(palette):
        if source != "This model":
            ax.text(
                0,
                0.8 - 0.1 * i,
                source,
                c=palette[source],
                size=8,
                transform=ax.transAxes,
            )
    ax.legend().remove()
    ax.set_ylabel("% protein synth. inhib.", size=9)
    ax.set_xlabel("Tetracycline (\u03bcM)", size=9)




[docs]
def plot_mass_fraction(
    data: pd.DataFrame,
):
    """Plot line plots of submass fold changes over generation time.

    Args:
        data: DataFrame where each row is an agent and each column is a variable
            of interest.
    """
    submasses = [
        "Cell",
        "Dry",
        "Water",
        "Protein",
        "rRNA",
        "mRNA",
        "tRNA",
        "DNA",
        "Small molecule",
    ]
    grouped_agents = data.groupby(["Condition", "Seed", "Agent ID"])
    for (condition, seed, agent_id), agent_data in grouped_agents:
        # Start all agents at t = 0 min
        times = agent_data.loc[:, "Time"] - agent_data.loc[:, "Time"].iloc[0]
        times = times / 60
        for submass in submasses:
            agent_data[f"{submass} fold change"] = (
                agent_data.loc[:, f"{submass} mass"]
                / agent_data.loc[:, f"{submass} mass"].iloc[0]
            )
            plt.plot(
                times, agent_data.loc[:, f"{submass} fold change"], label=f"{submass}"
            )
        plt.legend()
        plt.xlabel("Time (min)")
        plt.ylabel("Mass fold change (normalized to t = 0 min)")
        out_dir = f"out/analysis/paper_figures/{condition}/{seed}"
        os.makedirs(out_dir, exist_ok=True)
        plt.tight_layout()
        plt.savefig(f"{out_dir}/{agent_id}.svg")
        plt.close()




[docs]
def plot_death_timescale_analysis(
    data: pd.DataFrame,
    axs: List[plt.Axes],
    antibiotic_col: str = "Initial external amp.",
    mic: float = 5.724,
):
    """Create histogram of cell age (time since birth) until death and labelled
    scatterplot of average time exposed to ampicillin until death against
    initial external ampicillin concentration.

    Args:
        data: DataFrame where each row is an agent and each column is a variable
            of interest.
        axes: List of 2 matplotlib Axes instances on which to draw plots
        antibiotic_col: Name of column with initial external antibiotic concs.
        mic: Minimum inhibitory concentration (uM, rounded to 3 decimal places)
    """
    data["antibiotic_conc"] = np.round(data.loc[:, antibiotic_col] * 1000, 3)
    grouped_data = data.groupby("Condition")

    # Compile condition-specific lists of age of cell and total time that cell's
    # lineage was exposed to ampicillin at time of cracking and death
    death_times = {}
    cracking_times = {}
    for condition, condition_data in grouped_data:
        all_agents = condition_data.loc[:, "Agent ID"].unique()
        death_times[condition] = {
            "age": [],
            "exposure": [],
            "amp": condition_data.loc[:, "antibiotic_conc"].iloc[0],
        }
        cracking_times[condition] = {
            "age": [],
            "exposure": [],
            "amp": condition_data.loc[:, "antibiotic_conc"].iloc[0],
        }
        agents_data = condition_data.groupby("Agent ID")
        for agent_id, agent_data in agents_data:
            crack_time = agent_data.loc[
                agent_data.loc[:, "Wall cracked"], "Time"
            ].to_numpy()
            if len(crack_time) != 0:
                birth_time = agent_data.loc[:, "Time"].min()
                cracking_times[condition]["age"].append(crack_time[0] - birth_time)
                cracking_times[condition]["exposure"].append(crack_time[0] - SPLIT_TIME)
                # Cell has no descendants
                if agent_id + "0" not in all_agents:
                    death_time = agent_data.loc[:, "Time"].max()
                    # Cell did not survive to end of simulation
                    if death_time != MAX_TIME:
                        death_times[condition]["age"].append(death_time - birth_time)
                        death_times[condition]["exposure"].append(
                            death_time - SPLIT_TIME
                        )

    # Plot avg # of generations to death vs amp conc (inc. reg. line)
    avg_times = []
    amp_concs = []
    for condition in death_times:
        avg_times.append(np.mean(death_times[condition]["exposure"]))
        amp_concs.append(float(condition.split("Ampicillin (")[1].split(" mg/L)")[0]))
    avg_gens = np.array(avg_times) / AVG_GEN_TIME
    amp_concs = np.array(amp_concs)
    conc_sort_idx = amp_concs.argsort()
    amp_concs = amp_concs[conc_sort_idx]
    avg_gens = avg_gens[conc_sort_idx]
    axs[0].scatter(amp_concs, avg_gens, c=(0, 0.4, 1))
    axs[0].plot(amp_concs, avg_gens, c=(0, 0.4, 1))

    # Plot Boman and Ericksson 1963 data (inc. reg. line)
    lit_data = pd.read_csv("data/colony_data/lysis_ratios.csv", header=None)
    lit_data = lit_data.rename(
        columns={0: "Ampicillin (mg/L)", 1: "Avg. generations to lysis"}
    )
    lit_data["Avg. generations to lysis"] = 1 / lit_data["Avg. generations to lysis"]
    lit_data = lit_data.loc[
        (lit_data.loc[:, "Ampicillin (mg/L)"] > 1)
        & (lit_data.loc[:, "Ampicillin (mg/L)"] < 9),
        :,
    ]
    lit_data = lit_data.sort_values(by="Ampicillin (mg/L)")
    axs[0].scatter(
        lit_data["Ampicillin (mg/L)"],
        lit_data["Avg. generations to lysis"],
        color=(1, 100 / 255, 0),
    )
    axs[0].plot(
        lit_data["Ampicillin (mg/L)"],
        lit_data["Avg. generations to lysis"],
        color=(1, 100 / 255, 0),
    )
    axs[0].set_xlabel("Ampicillin (mg/L)", fontsize=9)
    axs[0].set_ylabel("Avg. generations to lysis", fontsize=9)
    axs[0].set_yticks(["0", "1", "2", "3", "4"], ["0", "1", "2", "3", "4"], fontsize=8)
    axs[0].set_xticks(["0", "2", "4", "6", "8"], ["0", "2", "4", "6", "8"], fontsize=8)
    ylim = axs[0].get_ylim()
    axs[0].text(
        8,
        ylim[1],
        "Boman and Ericksson 1963",
        color=(1, 100 / 255, 0),
        size=8,
        verticalalignment="top",
        horizontalalignment="right",
    )
    axs[0].text(
        8,
        ylim[1] - 0.3,
        "MIC: 4 mg/L",
        color=(1, 100 / 255, 0),
        size=8,
        verticalalignment="top",
        horizontalalignment="right",
    )
    axs[0].text(
        0.3,
        ylim[0] + 0.6,
        "This model",
        color=(0, 0.4, 1, 1),
        size=8,
        verticalalignment="top",
        horizontalalignment="left",
    )
    axs[0].text(
        0.3,
        ylim[0] + 0.3,
        "MIC: 2 mg/L",
        color=(0, 0.4, 1, 1),
        size=8,
        verticalalignment="top",
        horizontalalignment="left",
    )
    sns.despine(ax=axs[0], offset=3, trim=True)
    axs[0].set_yticks(["0", "1", "2", "3", "4"], ["0", "1", "2", "3", "4"], fontsize=8)
    axs[0].set_xticks(["0", "2", "4", "6", "8"], ["0", "2", "4", "6", "8"], fontsize=8)
    plt.tight_layout()