Source code for wholecell.utils.fitting

from __future__ import annotations

from typing import List, Optional, Tuple

import numpy as np
from numpy import typing as npt
from scipy import stats
import unum  # Imported here to be used in getCountsFromMassAndExpression assertions

from wholecell.utils import units


FUNCTIONS = {
    "none": lambda x: x,
    "sqrt": lambda x: np.sqrt(x),
    "exp": lambda x: np.exp(x),
    "log": lambda x: np.log(x),
    "log2": lambda x: np.log(x**2),
    "logsqrt": lambda x: np.log(np.sqrt(x)),
    "2": lambda x: x**2,
    "3": lambda x: x**3,
    "1/sqrt": lambda x: 1 / np.sqrt(x),
    "1/x": lambda x: 1 / x,
    "1/x2": lambda x: 1 / x**2,
}
INVERSE_FUNCTIONS = {
    "none": lambda x: x,
    "sqrt": lambda x: x**2,
    "exp": lambda x: np.log(x),
    "log": lambda x: np.exp(x),
    "log2": lambda x: np.sqrt(np.exp(x)),
    "logsqrt": lambda x: np.exp(x) ** 2,
    "2": lambda x: np.sqrt(x),
    "3": lambda x: x ** (1 / 3),
    "1/sqrt": lambda x: (1 / x) ** 2,
    "1/x": lambda x: 1 / x,
    "1/x2": lambda x: np.sqrt(1 / x),
}



[docs]
def normalize(array):
    return np.array(array).astype("float") / np.linalg.norm(array, 1)




[docs]
def countsFromMassAndExpression(
    mass: float,
    mws: npt.NDArray[np.float64],
    relativeExpression: npt.NDArray[np.float64],
    nAvogadro: float,
) -> npt.NDArray[np.float64]:
    """
    Args:
        mass: Total mass you want counts to sum to
        mws: Molecular weights of each species
        relativeExpression: Relative expression of each species
        nAvogadro: Avogadro's number

    Returns:
        Counts of each molecule
    """

    assert np.allclose(np.sum(relativeExpression), 1)
    assert not isinstance(mass, unum.Unum)
    assert not isinstance(mws, unum.Unum)
    assert not isinstance(relativeExpression, unum.Unum)
    assert not isinstance(nAvogadro, unum.Unum)
    return mass / np.dot(mws / nAvogadro, relativeExpression)




[docs]
def masses_and_counts_for_homeostatic_target(
    dry_mass_of_non_small_molecules: unum.Unum,
    concentrations: unum.Unum,
    weights: unum.Unum,
    cell_density: unum.Unum,
    avogadros_number: unum.Unum,
) -> tuple[unum.Unum, unum.Unum]:
    """
    Computes the dry mass fractions and counts associated with small molecules to maintain
    concentrations consistent with targets.  (Also includes water.)

    The cell is composed of a number of 'mass fractions' i.e. DNA, RNA, protein, water, and
    the less specific "small molecules" which includes both inorganic and organic molecular species
    that form part of a cell.  While we take many of the former calculations as ground truth, we
    chose to adjust (recompute) the small molecule mass fraction according to per-molecule
    observations of small molecule concentrations (compiled from various sources).

    However, this creates a potential issue: we need the small molecule mass to compute the volume,
    and the volume in turn is used to compute the counts (and therefore masses) of the small
    molecules.  We denote the first small molecule mass as Ms, and the second as Ms'.

    The total mass of the cell, Mt, is the sum of the small and non-small molecule masses:

    Mt = Ms + Mns

    The volume of the cell V times the density of the cell rho is Mt, and therefore

    rho * V = Ms + Mns

    Ms = rho * V - Mns

    This gives us our first calculation of the small molecule mass.  For the second calculation, we
    first find the abundance of each small molecule species (count n_i) as

    n_i = V * c_i

    where c_i is the concentration of each species.  Then the mass associated with each species is

    m_i = V * w_i * c_i

    where w_i is the molecular weight of a given species,.  Finally, the total small molecule mass,
    estimated from small molecule counts, is

    Ms' = sum_i m_i = V * w^T c

    where w^T c is the dot-product between the two vectors.

    Equating Ms' and Ms, and solving for V:

    V = Mns / (rho - w^Tc)

    This allows us to compute the new volume, from which we can also compute and return all n_i and
    m_i.

    Args:
        dry_mass_of_non_small_molecules: float unit'd scalar, dimensions of mass
            The total mass of the cell, minus the 'wet' mass (water) and the dry mass of other small
            molecules.
        concentrations: 1-D float unit'd array, with dimensions of concentration
            The target concentrations of the small molecules.
        weights: 1-D float unit'd array, with dimensions of mass per mol
            The molecular weights of the small molecules.
        cell_density: float unit'd scalar, dimensions of mass per volume
            The total density of the cell (wet and dry mass).
        avogadros_number: float unit'd scalar, dimensions of per mol
            The number of molecules per mole.

    Returns:
        2-element tuple containing

        - **masses**: The mass associated with each molecular species,
        - **counts**: The counts associated with each molecular species
    """

    # Compute the total mass concentration of the small molecules

    total_small_mol_mass_conc = np.dot(weights, concentrations)

    # Compute the new total cell volume that accomodates the small molecule concentrations

    cell_volume = dry_mass_of_non_small_molecules / (
        cell_density - total_small_mol_mass_conc
    )

    if cell_volume.asNumber() < 0:
        raise ValueError(
            "Could not achieve concentration targets with the expected dry mass."
            " Check for any unusually high concentrations."
        )

    # Calculate and return the counts of molecules and their associated masses

    mols = cell_volume * concentrations

    counts = mols * avogadros_number
    masses = weights * mols

    return masses, counts




[docs]
def calcProteinCounts(sim_data, monomerMass):
    monomerExpression = calcProteinDistribution(sim_data)

    nMonomers = calcProteinTotalCounts(sim_data, monomerMass, monomerExpression)

    return nMonomers * monomerExpression




[docs]
def calcProteinTotalCounts(sim_data, monomerMass, monomerExpression):
    return countsFromMassAndExpression(
        monomerMass.asNumber(units.g),
        sim_data.process.translation.monomer_data["mw"].asNumber(units.g / units.mol),
        monomerExpression,
        sim_data.constants.n_avogadro.asNumber(1 / units.mol),
    )




[docs]
def calcProteinDistribution(sim_data):
    return normalize(
        sim_data.process.transcription.rna_data["expression"][
            sim_data.relation.cistron_to_monomer_mapping
        ]
        / (
            np.log(2) / sim_data.doubling_time.asNumber(units.s)
            + sim_data.process.translation.monomer_data["deg_rate"].asNumber(
                1 / units.s
            )
        )
    )




[docs]
def cosine_similarity(samples):
    """
    Finds the cosine similarity between samples.

    samples is a matrix of size (n_samples, sample_size)

    The output is a matrix of size (n_samples, n_samples).

    The cosine similarity is the normalized dot product between two
    vectors.  The name originates from the fact that the normalized dot
    product between two vectors is equal to the cosine of the angle
    formed by the two vectors.
    """

    magnitudes = np.sqrt(np.sum(np.square(samples), 1))

    normed = samples / magnitudes[:, None]

    return normed.dot(normed.T)




[docs]
def fit_linearized_transforms(
    x: np.ndarray,
    y: np.ndarray,
    x_fun: Optional[List] = None,
    y_fun: Optional[List] = None,
    r_tol: float = 0.99,
    p_tol: float = 1e-2,
    verbose: Optional[float] = None,
) -> Tuple[str, str, float, float]:
    """
    Transforms x and y data based on a set of functions and finds the transforms
    that lead to the best linear fit of the data.  Can use the return values
    as args to interpolate_linearized_fit in order to interpolate new x values.

    Args:
        x: x data to fit
        y: y data to fit
        x_fun: list of functions to try for transforming x data
        y_fun: list of functions to try for transforming y data
        r_tol: best fit r value needs to be higher than this value
        p_tol: best fit p value needs to be lower than this value
        verbose: if given, prints the r and p value for each function pair that
                results in an r value higher than this

    Returns:
        4-element tuple containing

        - **x_transform**: name of the best transformation function for x data
        - **y_transform**: name of the best transformation function for y data
        - **slope**: best fit slope for the transformed data
        - **intercept**: best fit intercept for the transformed data
    """

    # Check all possible functions if none are given
    if x_fun is None:
        x_fun = list(FUNCTIONS.keys())
    if y_fun is None:
        y_fun = list(FUNCTIONS.keys())

    # Start with worst case r and p
    best_r = 0
    best_p = 1
    for x_name in x_fun:
        fx = FUNCTIONS[x_name]
        for y_name in y_fun:
            fy = FUNCTIONS[y_name]
            with np.errstate(invalid="ignore", over="ignore"):
                result = stats.linregress(fx(x), fy(y))
            abs_r = np.abs(result.rvalue)

            # If this is a new best fit, save the parameters
            if abs_r > best_r:
                best_r = abs_r
                best_p = result.pvalue
                x_transform = x_name
                y_transform = y_name
                slope = result.slope
                intercept = result.intercept

            if verbose and abs_r > verbose:
                print(
                    "{} {}: {:.3f} {:.1e}".format(
                        x_name, y_name, result.rvalue, result.pvalue
                    )
                )

    if verbose:
        print(
            f"Selected functions (r={best_r:.3f} p={best_p:.1e}):\n\tx: {x_transform}\n\ty: {y_transform}"
        )

    # Check tolerances to make sure the fit is as expected
    if best_r < r_tol:
        raise RuntimeError(f"Could not fit to the desired r: {best_r} < {r_tol}")
    if best_p > p_tol:
        raise RuntimeError(f"Could not fit to the desired p: {best_p} < {p_tol}")

    return x_transform, y_transform, slope, intercept




[docs]
def interpolate_linearized_fit(x, x_transform, y_transform, slope, intercept):
    """Interpolate one or more values based on the linearized fit parameters."""
    return INVERSE_FUNCTIONS[y_transform](FUNCTIONS[x_transform](x) * slope + intercept)