Source code for wholecell.utils.spatial_tool

"""
Functions that may be useful for future investigation in spatial model.
Some of the functions contain default values specific to E coli. Please read
before you use. References for the numbers are included in each function.
"""

import numpy as np
import scipy.constants
from unum import Unum

from wholecell.utils import units


K_B = scipy.constants.Boltzmann * units.J / units.K  # Boltzmann constant, unit: J/K



[docs]
def compute_hydrodynamic_radius(mw, mtype=None):
    """
    This function compute the hydrodynamic diameter of a macromolecules from
    its molecular weight. It is important to note that the hydrodynamic
    diameter is mainly used for computation of diffusion constant, and can
    be different from the observed diameter under microscopes or the radius
    of gyration, especially for loose polymers such as RNAs. This function
    is not E coli specific.

    References: Bioinformatics (2012). doi:10.1093/bioinformatics/bts537

    Args:
        mw: molecular weight of the macromolecules, units: Daltons.
        mtype: There are 5 possible mtype options: protein, RNA, linear_DNA,
            circular_DNA, and supercoiled_DNA.

    Returns: the hydrodynamic radius (in unit of nm) of the macromolecules
        using the following formula
        - rp = 0.0515*MW^(0.392) nm (Hong & Lei 2008) (protein)
        - rp = 0.0566*MW^(0.38) nm (Werner 2011) (RNA)
        - rp = 0.024*MW^(0.57) nm (Robertson et al 2006) (linear DNA)
        - rp = 0.0125*MW^(0.59) nm (Robertson et al 2006) (circular DNA)
        - rp = 0.0145*MW^(0.57) nm (Robertson et al 2006) (supercoiled DNA)
    """
    if mtype is None:
        mtype = "protein"

    dic_rp = {
        "protein": (0.0515, 0.392),
        "RNA": (0.0566, 0.38),
        "linear_DNA": (0.024, 0.57),
        "circular_DNA": (0.0125, 0.59),
        "supercoiled_DNA": (0.0145, 0.57),
    }

    if isinstance(mw, Unum):
        mw_unitless = mw.asNumber(units.g / units.mol)
    else:
        mw_unitless = mw

    if mtype in dic_rp:
        rp_0, rp_power = dic_rp[mtype]
    else:
        raise KeyError(
            "The input 'mtype' should be one of the following 5 "
            "options: 'protein', 'RNA', 'linear_DNA', "
            "'circular_DNA', 'supercoiled_DNA'."
        )

    rp = rp_0 * mw_unitless**rp_power
    rp = units.nm * rp
    return rp




[docs]
def compute_rg_rna(n_nt):
    """
    This function computes the radius of gyration (in units of nm) of an RNA
    with length n_nt(in units of nt). This function is not E coli specific.
    It is important to note that radius of gyration can be very different
    from the hydrodynamic radius or the radius you can observe under
    microscopes, especially for RNAs.

    References: Nucleic Acids Research, 2017, vol 45, 5: 2919-2934.
        doi: 10.1093/nar/gkx023

    Formula: Rg = a*N**nu. a = 0.366 +/- 0.146 nm, nu = 0.50 +/- 0.05

    Args:
        n_nt: the number of nucleotide in an RNA.

    Returns:
        The radius of gyration of an RNA in the unit of nm.
    """
    if isinstance(n_nt, Unum):
        n_nt = n_nt.asNumber(units.nt)

    a = units.nm * 0.366  # unit: nm
    nu = 0.50
    return a * n_nt**nu




[docs]
def compute_diffusion_constant_from_mw(
    mw, mtype=None, loc=None, temp=None, parameters=None
):
    """
    Warning: The default values of the 'parameters' are E coli specific.

    This function computes the hypothesized diffusion constant of
    macromolecules within the nucleoid and the cytoplasm region.
    In literature, there is no known differentiation between the diffusion
    constant of a molecule in the nucleoid and in the cytoplasm up to the
    best of our knowledge in 2019. However, there is a good reason why we
    can assume that previously reported diffusion constant are in fact the
    diffusion constant of a protein in the nucleoid region:
    (1) The image traces of a protein within a bacteria usually cross the
        nucleoid regions.
    (2) The nucleoid region, compared to the cytoplasm, should be the main
    limiting factor restricting the magnitude of diffusion constant.
    (3) The same theory of diffusion constant has been implemented to
    mammalian cells, and the term 'rh', the average hydrodynamic radius of
    the biggest crowders, are different in mammalian cytoplasm, and it seems
    to reflect the hydrodynamic radius of the actin filament (note: the
    hydrodynamic radius of actin filament should be computed based on the
    average length of actin fiber, and is not equal to the radius of the
    actin filament itself.) (ref: Nano Lett. 2011, 11, 2157-2163).
    As for E coli, the 'rh' term = 40nm, which may correspond to the 80nm
    DNA fiber. On the other hand, for the diffusion constant of E coli in
    the true cytoplasm, we will expect the value of 'rh' term to be
    approximately 10 nm, which correspond to the radius of active ribosomes.

    However, all the above statements are just hypothesis. If you want to
    compute the diffusion constant of a macromolecule in the whole E coli
    cell, you should set loc = 'nucleoid': the formula for this is obtained
    from actual experimental data set. When you set loc = 'cytoplasm', the
    entire results are merely hypothesis.

    Ref: Kalwarczyk, T., Tabaka, M. & Holyst, R.
    Bioinformatics (2012). doi:10.1093/bioinformatics/bts537

    D_0 = K_B*T/(6*pi*eta_0*rp)
    ln(D_0/D_cyto) = ln(eta/eta_0) = (xi^2/Rh^2 + xi^2/rp^2)^(-a/2)
    D_0 = the diffusion constant of a macromolecule in pure solvent
    eta_0 = the viscosity of pure solvent, in this case, water
    eta = the size-dependent viscosity experienced by the macromolecule.
    xi = average distance between the surface of proteins
    rh = average hydrodynamic radius of the biggest crowders
    a = some constant of the order of 1
    rp = hydrodynamic radius of probed molecule

    In this formula, since we allow the changes in temperature, we also
    consider the viscosity changes of water under different temperature:
    Ref: Dortmund Data Bank
    eta_0 = A*10^(B/(T-C))
    A = 2.414*10^(-5) Pa*sec
    B = 247.8 K
    C = 140 K

    Args:
        mw: molecular weight(unit: Da) of the macromolecule
        mtype: There are 5 possible mtype options: protein, RNA, linear_DNA,
        circular_DNA, and supercoiled_DNA.
        loc: The location of the molecule: 'nucleoid' or 'cytoplasm'.
        temp: The temperature of interest. unit: K.
        parameters: The 4 parameters required to compute the diffusion
            constant: xi, a, rh_nuc, rh_cyto.
            The default values are E coli specific.

    Returns:
        dc: the diffusion constant of the macromolecule, units: um**2/sec
    """
    if mtype is None:
        mtype = "protein"
    if loc is None:
        loc = "nucleoid"
    if temp is None:
        temp = 300 * units.K
    if parameters is None:
        parameters = (0.51, 0.53, 42, 10)

    rp = compute_hydrodynamic_radius(mw, mtype=mtype)
    dc = compute_diffusion_constant_from_rp(
        rp, loc=loc, temp=temp, parameters=parameters
    )
    return dc




[docs]
def compute_diffusion_constant_from_rp(rp, loc=None, temp=None, parameters=None):
    """
    Warning: The default values of the 'parameters' are E coli specific.

    This is the same function as 'compute_diffusion_constant_from_mw'
    except that it accepts the hydrodynamic radius of the macromolecules
    as the input. All hypothesis and statements in
    'compute_diffusion_constant_from_mw' apply here as well.

    Args:
        rp: the hydrodynamic radius of the macromolecule, units: nm.
        loc: The location of the molecule: 'nucleoid' or 'cytoplasm'.
        temp: The temperature of interest. unit: K.
        parameters: The 4 parameters required to compute the diffusion
            constant: xi, a, rh_nuc, rh_cyto.
            The default values are E coli specific.

    Returns:
        dc: the diffusion constant of the macromolecule, units: um**2/sec
    """
    if loc is None:
        loc = "nucleoid"
    if temp is None:
        temp = 300 * units.K
    if parameters is None:
        parameters = (0.51, 0.53, 42, 10)

    # unpack constants required for the calculation
    xi, a, rh_nuc, rh_cyto = parameters  # unit: nm, 1, nm, nm

    # viscosity of water
    a_visc = 2.414 * 10 ** (-5) * units.Pa * units.s  # unit: Pa*sec
    b_visc = 247.8 * units.K  # unit: K
    c_visc = 140 * units.K  # unit: K
    eta_0 = a_visc * 10 ** (b_visc / (temp - c_visc))  # unit: Pa*sec

    # determine Rh
    if loc == "nucleoid":
        rh = rh_nuc  # unit: nm
    elif loc == "cytoplasm":
        rh = rh_cyto  # unit: nm
    else:
        raise NameError("The location can only be 'nucleoid' or 'cytoplasm'.")

    if not isinstance(rp, Unum):
        rp = units.nm * rp

    # compute DC(diffusion constant)
    dc_0 = K_B * temp / (6 * np.pi * eta_0 * rp)
    dc = dc_0 * np.exp(
        -((xi**2 / rh**2 + xi**2 / rp.asNumber(units.nm) ** 2) ** (-a / 2))
    )
    return dc




[docs]
def compute_nucleoid_size(
    l_cell, d_cell, length_scaling_parameters=None, nucleoid_area_ratio=None
):
    """
    Warning 1: This function contains default values that are specific to
    E coli grew on M9 media under aerobic condition only. The shape and size
    of nucleoid of a bacteria can be very different across species.
    Warning 2: This function is not suitable to compute the nucleoid size of
    E coli when its shape turn filamentaous. This is because the scaling
    formula of the length of the nucleoid is obtained from a dnaC mutant
    E coli strain. According to our reference, when the cells are allowed
    to replicate their DNA normally, a constant N/C area ratio is maintained
    even for filamentous variants (treated with cephalexin). However, if the
    DNA is not allowed to replicate, the N/C area ratio will decrease as the
    cell elongate. It is therefore recommended to carefully examine the
    condition when the length of the cells grow beyond 3 um.
    Warning 3: the default values of length_scaling_parameters and
    nucleoid_area_ratio are set to be E coli specific, grew on M9 media
    under aerobic condition. The N/C area ratio for E coli grew on LB under
    aerobic condition is close to 0.4. For E coli grew under anaerobic
    condition it is close to 0.5.
    Warning 4: It is also important to note that a single cell can contain
    more than 1 nucleoid, and this formula may not be suitable for these
    cases.

    Reference on nucleoid length:
    Wu, F. et al. Curr. Biol. (2019). doi:10.1016/j.cub.2019.05.015
    Reference on nucleoid/cytoplasm area ratio:
    Gray, W. T. et al. Cell (2019). doi:10.1016/j.cell.2019.05.017

    Args:
        l_cell: the length of the cell, in units of um
        d_cell: the width of the cell, in units of um
        length_scaling_parameters: the parameters used for the scaling
            formula of the length of the nucleoid with respect to the length
            of the whole cell.
        nucleoid_area_ratio: the nucleoid/cytoplasm area ratio measured
            under microscope.
    Returns:
        l_nuc: the length of the nucleoid
        d_nuc: the diameter of the nucleoid
    """
    if length_scaling_parameters is None:
        length_scaling_parameters = (6.6, 8.3)
    if nucleoid_area_ratio is None:
        nucleoid_area_ratio = 0.60

    l_sat, l_c = length_scaling_parameters  # unit: um
    if isinstance(l_cell, Unum):
        l_cell = l_cell.asNumber(units.um)
    if isinstance(d_cell, Unum):
        d_cell = d_cell.asNumber(units.um)
    if isinstance(l_sat, Unum):
        l_sat = l_sat.asNumber(units.um)
    if isinstance(l_c, Unum):
        l_c = l_c.asNumber(units.um)

    l_nuc = l_sat * (1 - np.exp(-l_cell / l_c))
    d_nuc = nucleoid_area_ratio * l_cell * d_cell / l_nuc
    return units.um * l_nuc, units.um * d_nuc




[docs]
def compute_free_volume_ratio(q, eta):
    """
    This is a function that computes the free volume ratio of existing
    macromolecules with respect to a new molecule based on the scaled
    particle theory. This is not E coli specific. It is important to note
    that the results may not be accurate for large q or eta > 0.50.

    References:
    Malloggi, F. Soft Matter at Aqueous Interfaces.
    Lecture Notes in Physics (2016). doi:10.1007/978-3-319-24502-7.
    Chapter 3-4-2.

    Args:
        q: the size ratio between the new particle and existing particles.
            For example, if the radius of the new particle = delta, and the
            radius of the existing particle = R, then q = delta/R.
        eta: the compaction ratio, or the volume occupancy of the existing
            particle. eta = n*v_particle/v_total

    Returns:
        alpha: the free volume ratio of the space = v_free/v_total.
    """
    a = 3 * q + 3 * q**2 + q**3
    b = 9 / 2 * q**2 + 3 * q**3
    c = 3 * q**3
    y = eta / (1 - eta)
    q_capital = a * y + b * y**2 + c * y**3
    alpha = (1 - eta) * np.exp(-q_capital)
    return alpha




[docs]
def compute_n_blob(choice_model, choice_unit, bp_dna=None):
    """
    This function computes the number of DNA blobs within a bacteria. This
    functions contain multiple assumptions that demand careful examination.

    Previous research (Ref: Skoko D, Wong B, Johnson R, Marko J (2004)
    Biochemistry 43: 13867-13874.) directly observed the structure of
    chromosomal DNA of E coli under AFM. They observed that the DNA are
    composed of 40nm & 80nm fibers. The 80nm fibers are the main type within
    the cells, but it is hypothesized to be folded from 2 threads of 40nm
    fibers. Therefore, we hypothesized that chromosomal DNA are composed of
    DNA blobs with diameter of 40nm.

    For the model choice, there are 2 possible options. In the Flory chain
    option, the DNA within a blob is assumed to be self-avoidant. In the
    ideal chain option, the DNA within a blob is assumed to be in a melted
    state.

    For the unit choice, there are 3 possible options. You can choose single
    base pair of DNA as the smallest unit in the formation of DNA blob.
    In conventional physics, 2*persistence length is regarded as the
    standard choice of the smallest unit in a polymer. It is important to
    note that, the persistence length of DNA can be drastically different
    under in vitro & in vivo conditions. The persistence length of DNA in
    vitro is ~ 50nm, while the persistence length of DNA in vivo can be
    down to 20nm. This is because the binding of DNA binding proteins &
    DNA supercoiling can decrease the energy of DNA and makes it softer.
    Another possible choice of the smallest unit of DNA is 10bp, which
    corresponds to the average binding distance between HU protein, a
    structural DNA binding protein.

    Choosing different model setting and different smallest unit of DNA can
    result in drastically different results.

    The default value of genomic size is set to be E coli specific.

    Args:
        choice_model: the physical model of DNA within a blob. The 2
            possible options are 'ideal' & 'flory'.
        choice_unit: the choice of the smallest unit within a blob. The 3
            possible options are 'single', 'lp', or an integer number
            indicating the number of base pairs within a single unit.
        bp_dna: the number of base pairs in the chromosome. The default
            value is set to be the chromosome size of E coli.

    Returns:
        n_blob: the number of DNA blobs in a bacteria.
    """
    if bp_dna is None:
        bp_dna = 4639221

    lp = 20  # persistence length of the DNA in vivo, unit: nm
    r_blob = 20  # radius of the DNA blob deduced from AFM, unit: nm
    length_bp_conv = 0.34  # unit: nm/bp
    bp_per_lp = lp / length_bp_conv  # unit: bp
    d_dna = 2  # unit: nm

    if isinstance(choice_unit, Unum):
        choice_unit = choice_unit.asNumber(units.nt)

    if isinstance(bp_dna, Unum):
        bp_dna = bp_dna.asNumber(units.nt)

    if choice_model == "ideal":
        if choice_unit == "single":
            n_bp_per_blob = 6 * (r_blob / length_bp_conv) ** 2
            n_blob = bp_dna / n_bp_per_blob
        elif choice_unit == "lp":
            n_lp_per_blob = 6 * (r_blob / (2 * lp)) ** 2
            n_bp_per_blob = n_lp_per_blob * bp_per_lp * 2
            n_blob = bp_dna / n_bp_per_blob
        elif isinstance(choice_unit, int):
            length_per_unit = choice_unit * length_bp_conv
            n_unit_per_blob = 6 * (r_blob / length_per_unit) ** 2
            n_bp_per_blob = choice_unit * n_unit_per_blob
            n_blob = bp_dna / n_bp_per_blob
        else:
            raise NameError(
                "Your choice_unit should either be 'single', "
                "'lp', or an integer indicating"
                "the number of base pair within a unit."
            )
    elif choice_model == "flory":
        if choice_unit == "single":
            n_bp_per_blob = (
                r_blob / (3 / 8 * length_bp_conv**4 * d_dna) ** (1 / 5)
            ) ** (5 / 3)
            n_blob = bp_dna / n_bp_per_blob
        elif choice_unit == "lp":
            n_lp_per_blob = (r_blob / (3 / 8 * lp**4 * d_dna) ** (1 / 5)) ** (5 / 3)
            n_bp_per_blob = n_lp_per_blob * bp_per_lp
            n_blob = bp_dna / n_bp_per_blob
        elif isinstance(choice_unit, int):
            length_per_unit = choice_unit * length_bp_conv
            n_unit_per_blob = (
                r_blob / (3 / 8 * length_per_unit**4 * d_dna) ** (1 / 5)
            ) ** (5 / 3)
            n_bp_per_blob = choice_unit * n_unit_per_blob
            n_blob = bp_dna / n_bp_per_blob
        else:
            raise NameError(
                "Your choice_unit should either be 'single', "
                "'lp', or an integer indicating"
                "the number of base pair within a unit."
            )
    else:
        raise NameError("Your choice_model should either be 'ideal' or 'flory'.")
    return n_blob