"""
SimulationData mass data
"""
import numpy as np
import numpy.typing as npt
from scipy import interpolate, stats
from scipy.optimize import minimize
import unum
from wholecell.utils import fitting, units
NORMAL_CRITICAL_MASS = 975 * units.fg
SLOW_GROWTH_FACTOR = 1.2 # adjustment for smaller cells
[docs]
class Mass(object):
"""Mass"""
def __init__(self, raw_data, sim_data):
self._doubling_time = sim_data.doubling_time
self._build_constants(raw_data, sim_data)
self._build_submasses(raw_data, sim_data)
self._build_CD_periods(raw_data, sim_data)
self.avg_cell_dry_mass = self.get_avg_cell_dry_mass(self._doubling_time)
self.mass_fractions = self.get_mass_fractions(self._doubling_time)
self.avg_cell_component_masses = self.get_component_masses(self._doubling_time)
self._build_dependent_constants()
self._build_trna_data(raw_data, sim_data)
## Setup constants
[docs]
def _build_constants(self, raw_data, sim_data):
mass_parameters = raw_data.mass_parameters
self.__dict__.update(mass_parameters)
self.cell_dry_mass_fraction = 1.0 - self.cell_water_mass_fraction
self.avg_cell_to_initial_cell_conversion_factor = np.exp(
np.log(2) * self.avg_cell_cell_cycle_progress
)
self._cellDensity = sim_data.constants.cell_density
self._glycogenFractions = raw_data.mass_fractions.glycogen_fractions
self._mureinFractions = raw_data.mass_fractions.murein_fractions
self._LPSFractions = raw_data.mass_fractions.LPS_fractions
self._lipidFractions = raw_data.mass_fractions.lipid_fractions
self._ionFractions = raw_data.mass_fractions.ion_fractions
self._solubleFractions = raw_data.mass_fractions.soluble_fractions
metIds = (
[x["metaboliteId"] for x in self._glycogenFractions]
+ [x["metaboliteId"] for x in self._mureinFractions]
+ [x["metaboliteId"] for x in self._LPSFractions]
+ [x["metaboliteId"] for x in self._lipidFractions]
+ [x["metaboliteId"] for x in self._ionFractions]
+ [x["metaboliteId"] for x in self._solubleFractions]
+ ["WATER[c]"]
)
mws = sim_data.getter.get_masses(metIds)
self._mws = dict(zip(metIds, mws))
self._metTargetIds = [
x["Metabolite"] + "[c]" for x in raw_data.metabolite_concentrations
]
[docs]
def _build_dependent_constants(self):
self.avg_cell_dry_mass_init = (
self.avg_cell_dry_mass / self.avg_cell_to_initial_cell_conversion_factor
)
avgCellWaterMass = (
self.avg_cell_dry_mass / self.cell_dry_mass_fraction
) * self.cell_water_mass_fraction
self.avg_cell_water_mass_init = (
avgCellWaterMass / self.avg_cell_to_initial_cell_conversion_factor
)
# Setup sub-masses
[docs]
def _build_submasses(self, raw_data, sim_data):
self._doubling_time_vector = units.min * np.array(
[
float(x["doublingTime"].asNumber(units.min))
for x in raw_data.dry_mass_composition
]
)
dryMass = np.array(
[
float(x["averageDryMass"].asNumber(units.fg))
for x in raw_data.dry_mass_composition
]
)
self._dryMassParams = linear_regression(
self._doubling_time_vector.asNumber(units.min), 1.0 / dryMass
)
self._proteinMassFractionParams = self._getFitParameters(
raw_data.dry_mass_composition, "proteinMassFraction"
)
self._rnaMassFractionParams = self._getFitParameters(
raw_data.dry_mass_composition, "rnaMassFraction"
)
self._lipidMassFractionParams = self._getFitParameters(
raw_data.dry_mass_composition, "lipidMassFraction"
)
self._lpsMassFractionParams = self._getFitParameters(
raw_data.dry_mass_composition, "lpsMassFraction"
)
self._mureinMassFractionParams = self._getFitParameters(
raw_data.dry_mass_composition, "mureinMassFraction"
)
self._glycogenMassFractionParams = self._getFitParameters(
raw_data.dry_mass_composition, "glycogenMassFraction"
)
self._solublePoolMassFractionParams = self._getFitParameters(
raw_data.dry_mass_composition, "solublePoolMassFraction"
)
self._inorganicIonMassFractionParams = self._getFitParameters(
raw_data.dry_mass_composition, "inorganicIonMassFraction"
)
self.chromosome_sequence_mass = (
sim_data.getter.get_mass(sim_data.molecule_ids.full_chromosome)
/ sim_data.constants.n_avogadro
).asUnit(units.g)
[docs]
def _getFitParameters(self, dry_mass_composition, mass_fraction_name):
mass_fraction = np.array(
[float(x[mass_fraction_name]) for x in dry_mass_composition]
)
# Doubling time interpolation
x = self._doubling_time_vector.asNumber(units.min)[::-1]
y = mass_fraction[::-1]
dt_params = interpolate.splrep(x, y)
if (
np.sum(
np.absolute(
interpolate.splev(
self._doubling_time_vector.asNumber(units.min), dt_params
)
- mass_fraction
)
)
/ mass_fraction.size
> 1.0
):
raise Exception(
"Fitting {} with double exponential, residuals are huge!".format(
mass_fraction_name
)
)
# RNA/protein ratio interpolation
rna = np.array([float(x["rnaMassFraction"]) for x in dry_mass_composition])
protein = np.array(
[float(x["proteinMassFraction"]) for x in dry_mass_composition]
)
rp_ratio = rna / protein
rp_params = interpolate.splrep(rp_ratio, mass_fraction)
return dt_params, rp_params
[docs]
def _build_CD_periods(self, raw_data, sim_data):
self._c_period = sim_data.constants.c_period
self._d_period = sim_data.constants.d_period
# Set based on growth rate avgCellDryMass
[docs]
def get_avg_cell_dry_mass(self, doubling_time: unum.Unum) -> unum.Unum:
"""
Gets the dry mass for an average cell at the given doubling time.
Args:
doubling_time: expected doubling time
Returns:
average cell dry mass
"""
doubling_time = doubling_time.asNumber(units.min)
inverse_mass = self._dryMassParams[0] * doubling_time + self._dryMassParams[1]
if inverse_mass < 0:
raise ValueError(
"Doubling time ({} min) is too short, could not get mass.".format(
doubling_time
)
)
return units.fg / inverse_mass
[docs]
def get_dna_critical_mass(self, doubling_time: unum.Unum) -> unum.Unum:
"""
Returns the critical mass for replication initiation. Faster growing
cells maintain a consistent initiation mass but slower growing cells
are smaller and will never reach this mass so it needs to be adjusted
lower for them.
Args:
doubling_time: expected doubling time of cell
Returns:
Critical mass for DNA replication initiation
"""
mass = self.get_avg_cell_dry_mass(doubling_time) / self.cell_dry_mass_fraction
critical_mass = min(mass * SLOW_GROWTH_FACTOR, NORMAL_CRITICAL_MASS)
return critical_mass
# Set mass fractions based on growth rate
[docs]
def get_mass_fractions(self, doubling_time):
if type(doubling_time) != unum.Unum:
raise Exception("Doubling time was not set!")
D = {}
dnaMassFraction = self._calculateGrowthRateDependentDnaMass(
doubling_time
) / self.get_avg_cell_dry_mass(doubling_time)
dnaMassFraction.normalize()
dnaMassFraction.checkNoUnit()
D["dna"] = dnaMassFraction.asNumber()
doubling_time = self._clipTau_d(doubling_time)
D["protein"] = float(
interpolate.splev(
doubling_time.asNumber(units.min), self._proteinMassFractionParams[0]
)
)
D["rna"] = float(
interpolate.splev(
doubling_time.asNumber(units.min), self._rnaMassFractionParams[0]
)
)
D["lipid"] = float(
interpolate.splev(
doubling_time.asNumber(units.min), self._lipidMassFractionParams[0]
)
)
D["lps"] = float(
interpolate.splev(
doubling_time.asNumber(units.min), self._lpsMassFractionParams[0]
)
)
D["murein"] = float(
interpolate.splev(
doubling_time.asNumber(units.min), self._mureinMassFractionParams[0]
)
)
D["glycogen"] = float(
interpolate.splev(
doubling_time.asNumber(units.min), self._glycogenMassFractionParams[0]
)
)
D["solublePool"] = float(
interpolate.splev(
doubling_time.asNumber(units.min),
self._solublePoolMassFractionParams[0],
)
)
D["inorganicIon"] = float(
interpolate.splev(
doubling_time.asNumber(units.min),
self._inorganicIonMassFractionParams[0],
)
)
total = np.sum([y for y in D.values()])
for key, value in D.items():
if key != "dna":
D[key] = value / total
assert np.absolute(np.sum([x for x in D.values()]) - 1.0) < 1e-3
return D
[docs]
def get_mass_fractions_from_rna_protein_ratio(self, ratio):
D = {}
D["lipid"] = float(interpolate.splev(ratio, self._lipidMassFractionParams[1]))
D["lps"] = float(interpolate.splev(ratio, self._lpsMassFractionParams[1]))
D["murein"] = float(interpolate.splev(ratio, self._mureinMassFractionParams[1]))
D["glycogen"] = float(
interpolate.splev(ratio, self._glycogenMassFractionParams[1])
)
D["solublePool"] = float(
interpolate.splev(ratio, self._solublePoolMassFractionParams[1])
)
D["inorganicIon"] = float(
interpolate.splev(ratio, self._inorganicIonMassFractionParams[1])
)
return D
[docs]
def get_component_masses(self, doubling_time):
D = {}
massFraction = self.get_mass_fractions(doubling_time)
for key, value in massFraction.items():
D[key + "Mass"] = value * self.get_avg_cell_dry_mass(doubling_time)
return D
[docs]
def get_basal_rna_fractions(self):
"""
Measured RNA subgroup mass fractions. Fractions should change in other
conditions with growth rate (see transcription.get_rna_fractions()).
"""
return {
"rRNA": self._rrna23s_mass_sub_fraction
+ self._rrna16s_mass_sub_fraction
+ self._rrna5s_mass_sub_fraction,
"tRNA": self._trna_mass_sub_fraction,
"mRNA": self._mrna_mass_sub_fraction,
}
[docs]
def getBiomassAsConcentrations(self, doubling_time, rp_ratio=None):
avgCellDryMassInit = (
self.get_avg_cell_dry_mass(doubling_time)
/ self.avg_cell_to_initial_cell_conversion_factor
)
avgCellWaterMassInit = (
avgCellDryMassInit / self.cell_dry_mass_fraction
) * self.cell_water_mass_fraction
initWaterMass = avgCellWaterMassInit.asNumber(units.g)
initDryMass = avgCellDryMassInit.asNumber(units.g)
initCellMass = initWaterMass + initDryMass
initCellVolume = initCellMass / self._cellDensity.asNumber(
units.g / units.L
) # L
if rp_ratio is None:
massFraction = self.get_mass_fractions(doubling_time)
else:
massFraction = self.get_mass_fractions_from_rna_protein_ratio(rp_ratio)
metaboliteIDs = []
metaboliteConcentrations = []
for entry in self._glycogenFractions:
metaboliteID = entry["metaboliteId"]
assert metaboliteID not in metaboliteIDs + self._metTargetIds
massFrac = entry["massFraction"] * massFraction["glycogen"]
molWeight = self._mws[metaboliteID].asNumber(units.g / units.mol)
massInit = massFrac * initDryMass
molesInit = massInit / molWeight
concentration = molesInit / initCellVolume
metaboliteIDs.append(metaboliteID)
metaboliteConcentrations.append(concentration)
for entry in self._mureinFractions:
metaboliteID = entry["metaboliteId"]
assert metaboliteID not in metaboliteIDs + self._metTargetIds
massFrac = entry["massFraction"] * massFraction["murein"]
molWeight = self._mws[metaboliteID].asNumber(units.g / units.mol)
massInit = massFrac * initDryMass
molesInit = massInit / molWeight
concentration = molesInit / initCellVolume
metaboliteIDs.append(metaboliteID)
metaboliteConcentrations.append(concentration)
for entry in self._LPSFractions:
metaboliteID = entry["metaboliteId"]
assert metaboliteID not in metaboliteIDs + self._metTargetIds
massFrac = entry["massFraction"] * massFraction["lps"]
molWeight = self._mws[metaboliteID].asNumber(units.g / units.mol)
massInit = massFrac * initDryMass
molesInit = massInit / molWeight
concentration = molesInit / initCellVolume
metaboliteIDs.append(metaboliteID)
metaboliteConcentrations.append(concentration)
for entry in self._lipidFractions:
metaboliteID = entry["metaboliteId"]
assert metaboliteID not in metaboliteIDs + self._metTargetIds
massFrac = entry["massFraction"] * massFraction["lipid"]
molWeight = self._mws[metaboliteID].asNumber(units.g / units.mol)
massInit = massFrac * initDryMass
molesInit = massInit / molWeight
concentration = molesInit / initCellVolume
metaboliteIDs.append(metaboliteID)
metaboliteConcentrations.append(concentration)
for entry in self._ionFractions:
metaboliteID = entry["metaboliteId"]
assert metaboliteID not in metaboliteIDs + self._metTargetIds
massFrac = entry["massFraction"] * massFraction["inorganicIon"]
molWeight = self._mws[metaboliteID].asNumber(units.g / units.mol)
massInit = massFrac * initDryMass
molesInit = massInit / molWeight
concentration = molesInit / initCellVolume
metaboliteIDs.append(metaboliteID)
metaboliteConcentrations.append(concentration)
for entry in self._solubleFractions:
metaboliteID = entry["metaboliteId"]
if metaboliteID not in self._metTargetIds:
massFrac = entry["massFraction"] * massFraction["solublePool"]
molWeight = self._mws[metaboliteID].asNumber(units.g / units.mol)
massInit = massFrac * initDryMass
molesInit = massInit / molWeight
concentration = molesInit / initCellVolume
metaboliteIDs.append(metaboliteID)
metaboliteConcentrations.append(concentration)
# H2O: reported water content of E. coli
h2oMolWeight = self._mws["WATER[c]"].asNumber(units.g / units.mol)
h2oMoles = initWaterMass / h2oMolWeight
h2oConcentration = h2oMoles / initCellVolume
metaboliteIDs.append("WATER[c]")
metaboliteConcentrations.append(h2oConcentration)
metaboliteConcentrations = (units.mol / units.L) * np.array(
metaboliteConcentrations
)
return dict(zip(metaboliteIDs, metaboliteConcentrations))
[docs]
def _calculateGrowthRateDependentDnaMass(self, doubling_time):
c_plus_d_period = self._c_period + self._d_period
if doubling_time < self._d_period:
raise Exception("Can't have doubling time shorter than cytokinesis time!")
doubling_time_unit = units.getUnit(doubling_time)
# TODO: If you really care, this should be a loop.
# It is optimized to run quickly over the range of T_d
# and C and D periods that we have.
return self.chromosome_sequence_mass * (
1
+ 1
* (
np.maximum(0.0 * doubling_time_unit, c_plus_d_period - doubling_time)
/ self._c_period
)
+ 2
* (
np.maximum(
0.0 * doubling_time_unit, c_plus_d_period - 2 * doubling_time
)
/ self._c_period
)
+ 4
* (
np.maximum(
0.0 * doubling_time_unit, c_plus_d_period - 4 * doubling_time
)
/ self._c_period
)
)
[docs]
def _clipTau_d(self, doubling_time):
# Clip values to be in the range that we have data for
if hasattr(doubling_time, "dtype"):
doubling_time[doubling_time > max(self._doubling_time_vector)] = max(
self._doubling_time_vector
)
doubling_time[doubling_time < min(self._doubling_time_vector)] = min(
self._doubling_time_vector
)
else:
if doubling_time > max(self._doubling_time_vector):
doubling_time = max(self._doubling_time_vector)
elif doubling_time < min(self._doubling_time_vector):
doubling_time = min(self._doubling_time_vector)
return doubling_time
[docs]
def _build_trna_data(self, raw_data, sim_data):
growth_rate_unit = units.getUnit(
raw_data.trna_data.trna_growth_rates[0]["growth rate"]
)
self._trna_growth_rates = growth_rate_unit * np.array(
[x["growth rate"].asNumber() for x in raw_data.trna_data.trna_growth_rates]
)
trna_ratio_to_16SrRNA_by_growth_rate = []
for gr in self._trna_growth_rates: # TODO: This is a little crazy...
trna_ratio_to_16SrRNA_by_growth_rate.append(
[
x["ratio to 16SrRNA"]
for x in getattr(
raw_data.trna_data,
"trna_ratio_to_16SrRNA_" + str(gr.asNumber()).replace(".", "p"),
)
]
)
self._trna_ratio_to_16SrRNA_by_growth_rate = np.array(
trna_ratio_to_16SrRNA_by_growth_rate
)
self._trna_ids = [
x["rna id"] for x in raw_data.trna_data.trna_ratio_to_16SrRNA_0p4
]
[docs]
def get_trna_distribution(self, doubling_time):
assert isinstance(doubling_time, unum.Unum)
assert isinstance(doubling_time.asNumber(), float)
growth_rate = 1 / doubling_time
growth_rate = growth_rate.asNumber(1 / units.h)
trna_abundance_interpolation_functions = [
interpolate.interp1d(
self._trna_growth_rates.asNumber(1 / units.h),
self._trna_ratio_to_16SrRNA_by_growth_rate[:, i],
)
for i in range(self._trna_ratio_to_16SrRNA_by_growth_rate.shape[1])
]
id_length = max(len(id_) for id_ in self._trna_ids)
abundance = np.zeros(
len(self._trna_ids),
dtype=[
("id", "U{}".format(id_length)),
("molar_ratio_to_16SrRNA", np.float64),
],
)
abundance["id"] = self._trna_ids
abundance["molar_ratio_to_16SrRNA"] = [
x(growth_rate) for x in trna_abundance_interpolation_functions
]
return abundance
[docs]
class GrowthRateParameters(object):
"""
GrowthRateParameters
"""
def __init__(self, raw_data, sim_data):
self._doubling_time = sim_data.doubling_time
_loadTableIntoObjectGivenDoublingTime(
self, raw_data.growth_rate_dependent_parameters
)
self.ribosome_elongation_rate_params = _get_fit_parameters(
raw_data.growth_rate_dependent_parameters, "ribosomeElongationRate"
)
self.RNAP_elongation_rate_params = _get_fit_parameters(
raw_data.growth_rate_dependent_parameters, "rnaPolymeraseElongationRate"
)
self.RNAP_active_fraction_params = _get_fit_parameters(
raw_data.growth_rate_dependent_parameters, "fractionActiveRnap"
)
self.ribosome_active_fraction_params = _get_fit_parameters(
raw_data.growth_rate_dependent_parameters, "fractionActiveRibosome"
)
# ppGpp concentration by linear fit
self.per_dry_mass_to_per_volume = sim_data.constants.cell_density * (
1.0 - raw_data.mass_parameters["cell_water_mass_fraction"]
)
doubling_time = _loadRow(
"doublingTime", raw_data.growth_rate_dependent_parameters
)
ppgpp_conc = (
_loadRow("ppGpp_conc", raw_data.growth_rate_dependent_parameters)
* self.per_dry_mass_to_per_volume
)
ribosome_elongation_rate = _loadRow(
"ribosomeElongationRate", raw_data.growth_rate_dependent_parameters
)
self._ppGpp_concentration = _get_linearized_fit(doubling_time, ppgpp_conc)
self._ribosome_elongation_rate_by_ppgpp = (
self._fit_ribosome_elongation_rate_by_ppgpp(
ppgpp_conc, ribosome_elongation_rate
)
)
# Estimate of the amount that the max elongation rate is reduced by having
# charging at less than 100% since measured data is not the max elongation
# rate but the observed elongation rate
# TODO (travis): calculate this value based on expected charging
self._charging_fraction_of_max_elong_rate = 0.9
[docs]
def _fit_ribosome_elongation_rate_by_ppgpp(self, ppgpp, rate):
ppgpp_units = units.umol / units.L
rate_units = units.getUnit(rate)
def f(x):
return np.linalg.norm(
x[0] / (1 + (ppgpp.asNumber(ppgpp_units) / x[1]) ** x[2])
- rate.asNumber(rate_units)
)
x0 = [22, 100, 1.5]
sol = minimize(f, x0)
vmax, KI, H = sol.x
# Make sure optimization was successful, is a good fit, and the max rate is not more
# than 10% different than the max measured rate (arbitrary level to ensure close fit)
if (
not sol.success
or sol.fun > 1
or np.abs(vmax / rate.asNumber(rate_units).max() - 1) > 0.1
):
raise RuntimeError("Problem fitting ppGpp regulated elongation rate.")
return ppgpp_units, rate_units, vmax, KI, H
[docs]
def get_ribosome_elongation_rate(self, doubling_time):
return _useFitParameters(doubling_time, **self.ribosome_elongation_rate_params)
[docs]
def get_rnap_elongation_rate(self, doubling_time):
return _useFitParameters(doubling_time, **self.RNAP_elongation_rate_params)
[docs]
def get_fraction_active_rnap(self, doubling_time):
return _useFitParameters(doubling_time, **self.RNAP_active_fraction_params)
[docs]
def get_fraction_active_ribosome(self, doubling_time):
return _useFitParameters(doubling_time, **self.ribosome_active_fraction_params)
[docs]
def get_ppGpp_conc(self, doubling_time):
return _use_linearized_fit(doubling_time, self._ppGpp_concentration)
[docs]
def get_ribosome_elongation_rate_by_ppgpp(self, ppgpp, max_rate=None):
ppgpp_units, rate_units, fit_vmax, KI, H = (
self._ribosome_elongation_rate_by_ppgpp
)
vmax = fit_vmax if max_rate is None else max_rate
return (
rate_units
* vmax
/ (1 + (ppgpp.asNumber(ppgpp_units) / KI) ** H)
/ self._charging_fraction_of_max_elong_rate
)
[docs]
def _get_fit_parameters(list_of_dicts, key):
# Load rows of data
x = _loadRow("doublingTime", list_of_dicts)
y = _loadRow(key, list_of_dicts)
# Save and strip units
y_units = 1
x_units = 1
if units.hasUnit(y):
y_units = units.getUnit(y)
y = y.asNumber(y_units)
if units.hasUnit(x):
x_units = units.getUnit(x)
x = x.asNumber(x_units)
# Sort data for spine fitting (must be ascending order)
idx_order = x.argsort()
x = x[idx_order]
y = y[idx_order]
# Generate fit
cs = interpolate.CubicSpline(x, y, bc_type="natural")
if np.sum(np.absolute(cs(x) - y)) / y.size > 1.0:
raise Exception("Fitting {} with 3d spline, residuals are huge!".format(key))
return {"function": cs, "x_units": x_units, "y_units": y_units, "dtype": y.dtype}
[docs]
def _get_linearized_fit(x, y, **kwargs):
if units.hasUnit(x):
x_units = units.getUnit(x)
x = x.asNumber(x_units)
else:
x_units = None
if units.hasUnit(y):
y_units = units.getUnit(y)
y = y.asNumber(y_units)
else:
y_units = 1.0
return x_units, y_units, fitting.fit_linearized_transforms(x, y, **kwargs)
[docs]
def _use_linearized_fit(x, params):
x_units, y_units, fit_params = params
x_unitless = x if x_units is None else x.asNumber(x_units)
return y_units * fitting.interpolate_linearized_fit(x_unitless, *fit_params)
[docs]
def _useFitParameters(x_new, function, x_units, y_units, dtype):
# Convert to same unit base
if units.hasUnit(x_units):
x_new = x_new.asNumber(x_units)
elif units.hasUnit(x_new):
raise Exception("New x value has units but fit does not!")
# Calculate new interpolated y value
y_new = function(x_new)
# If value should be an integer (i.e. an elongation rate)
# round to the nearest integer
if dtype == int:
y_new = int(np.round(y_new))
return y_units * y_new
[docs]
def _loadRow(key, list_of_dicts):
if units.hasUnit(list_of_dicts[0][key]):
row_units = units.getUnit(list_of_dicts[0][key])
return row_units * np.array([x[key].asNumber(row_units) for x in list_of_dicts])
else:
return np.array([x[key] for x in list_of_dicts])
[docs]
def _loadTableIntoObjectGivenDoublingTime(obj, list_of_dicts):
table_keys = list(list_of_dicts[0].keys())
if "doublingTime" not in table_keys:
raise Exception(
"This data has no doubling time column but it is supposed to be growth rate dependent!"
)
else:
table_keys.pop(table_keys.index("doublingTime"))
for key in table_keys:
fitParameters = _get_fit_parameters(list_of_dicts, key)
attrValue = _useFitParameters(obj._doubling_time, **fitParameters)
setattr(obj, key, attrValue)
[docs]
def linear_regression(
x: npt.NDArray[np.float64],
y: npt.NDArray[np.float64],
r_tol: float = 0.999,
p_tol: float = 1e-5,
) -> tuple[float, float]:
"""
Perform linear regression on a data set and check that statistics are
within expected values to confirm a good linear fit.
Args:
x: x values for regression
y: y values for regression
r_tol: lower limit for r statistic
p_tol: upper limit for p statistic
Returns:
2-element tuple containing
- slope: linear fit slope
- intercept: linear fit intercept
"""
result = stats.linregress(x, y)
if result.rvalue < r_tol:
raise ValueError(
"Could not fit linear regression with high enough r"
" value: {} < {}".format(result.rvalue, r_tol)
)
if result.pvalue > p_tol:
raise ValueError(
"Could not fit linear regression with low enough p"
" value: {} > {}".format(result.pvalue, p_tol)
)
return result.slope, result.intercept