Source code for ecoli.library.data_predicates

"""
Data Predicates

Defines several assertions about data that are useful for tests,
e.g. checks for monotonicity, whether the data approximately follows a Poisson distribution, etc.

All functions expect a 1D numpy array as first parameter.

TODO:
- implement faster numpy-based solution for tests of increasing/decreasing
"""

import numpy as np
from scipy.stats import chisquare, poisson
from collections import Counter



[docs]
def strictly_increasing(data):
    """
    Check whether every element is strictly greater than its predecessor.

    Args:
        data: 1D numpy array (or array-like) of mutually comparable values.

    Returns:
        bool: True if ``data[i] < data[i + 1]`` holds for every adjacent
        pair. Empty and single-element inputs have no pairs and are True.
    """
    values = np.asarray(data)
    # A single vectorised comparison of each element with its successor
    # replaces the per-element Python loop.
    return bool(np.all(values[:-1] < values[1:]))




[docs]
def strictly_decreasing(data):
    """
    Check whether every element is strictly smaller than its predecessor.

    Args:
        data: 1D numpy array (or array-like) of mutually comparable values.

    Returns:
        bool: True if ``data[i] > data[i + 1]`` holds for every adjacent
        pair. Empty and single-element inputs have no pairs and are True.
    """
    values = np.asarray(data)
    # A single vectorised comparison of each element with its successor
    # replaces the per-element Python loop.
    return bool(np.all(values[:-1] > values[1:]))




[docs]
def monotonically_increasing(data):
    """
    Check whether the sequence never decreases (ties are allowed).

    Args:
        data: 1D numpy array (or array-like) of mutually comparable values.

    Returns:
        bool: True if ``data[i] <= data[i + 1]`` holds for every adjacent
        pair. Empty and single-element inputs have no pairs and are True.
    """
    values = np.asarray(data)
    # A single vectorised comparison of each element with its successor
    # replaces the per-element Python loop.
    return bool(np.all(values[:-1] <= values[1:]))




[docs]
def monotonically_decreasing(data):
    """
    Check whether the sequence never increases (ties are allowed).

    Args:
        data: 1D numpy array (or array-like) of mutually comparable values.

    Returns:
        bool: True if ``data[i] >= data[i + 1]`` holds for every adjacent
        pair. Empty and single-element inputs have no pairs and are True.
    """
    values = np.asarray(data)
    # A single vectorised comparison of each element with its successor
    # replaces the per-element Python loop.
    return bool(np.all(values[:-1] >= values[1:]))




[docs]
def all_positive(data):
    """
    Check whether every element is strictly greater than zero.

    Args:
        data: 1D numpy array (or array-like, e.g. a list) of numbers.

    Returns:
        numpy.bool_: True if every element satisfies ``x > 0``. An empty
        input is vacuously True.
    """
    # np.asarray makes plain Python sequences comparable elementwise; an
    # existing ndarray is used as-is.
    return np.all(np.asarray(data) > 0)




[docs]
def all_negative(data):
    """
    Check whether every element is strictly less than zero.

    Args:
        data: 1D numpy array (or array-like, e.g. a list) of numbers.

    Returns:
        numpy.bool_: True if every element satisfies ``x < 0``. An empty
        input is vacuously True.
    """
    # np.asarray makes plain Python sequences comparable elementwise; an
    # existing ndarray is used as-is.
    return np.all(np.asarray(data) < 0)




[docs]
def all_nonnegative(data):
    """
    Check whether every element is greater than or equal to zero.

    Args:
        data: 1D numpy array (or array-like, e.g. a list) of numbers.

    Returns:
        numpy.bool_: True if every element satisfies ``x >= 0``. An empty
        input is vacuously True.
    """
    # np.asarray makes plain Python sequences comparable elementwise; an
    # existing ndarray is used as-is.
    return np.all(np.asarray(data) >= 0)




[docs]
def all_nonpositive(data):
    """
    Check whether every element is less than or equal to zero.

    Args:
        data: 1D numpy array (or array-like, e.g. a list) of numbers.

    Returns:
        numpy.bool_: True if every element satisfies ``x <= 0``. An empty
        input is vacuously True.
    """
    # np.asarray makes plain Python sequences comparable elementwise; an
    # existing ndarray is used as-is.
    return np.all(np.asarray(data) <= 0)




[docs]
def approx_poisson(data, rate=None, significance=0.05, verbose=False):
    """
    Test whether data appears to follow a Poisson distribution, using a
    chi-square goodness-of-fit test.

    The observed frequency of each value ``0..max(data)`` is compared with
    the Poisson pmf over the same range, renormalised to sum to 1.

    Note: observed and expected values are passed to the test as proportions
    rather than raw counts, so the statistic does not grow with the number of
    observations and the test is lenient. In particular, it does not do
    particularly well at telling Poisson data of rate r_1 apart from a
    Poisson distribution of rate r_2.

    Args:
        data: 1D array where entry i is the number of events observed in
            interval i. Values are expected to be non-negative integers
            (integer-valued floats are accepted).
        rate: rate (lambda) of the Poisson distribution against which to
            compare. If None, the rate is estimated as the mean of the data.
        significance: threshold on the p-value. For p > significance the
            null hypothesis that the data is Poisson-distributed is not
            rejected.
        verbose: if True, prints the rate and the results (chi-sq, p-value)
            of the goodness-of-fit test.

    Returns:
        numpy.bool_: True if p > significance, i.e. the data is consistent
        with a Poisson distribution under this test.

    Raises:
        ValueError: if data is empty.
    """
    values = np.asarray(data)
    if values.size == 0:
        raise ValueError("approx_poisson requires at least one observation")

    if rate is None:
        rate = np.mean(values)

    # int() lets integer-valued float arrays through; range() would reject a
    # float upper bound.
    n_bins = int(np.max(values)) + 1

    # Counter returns 0 for values that never occur, so every bin in
    # 0..max(data) is filled without an explicit membership check.
    tally = Counter(values.tolist())
    observed = np.array([tally[k] for k in range(n_bins)])

    # Evaluate the pmf once, then renormalise over the truncated support so
    # observed and expected proportions both sum to 1.
    pmf = poisson(rate).pmf(np.arange(n_bins))

    statistic, p_value = chisquare(observed / observed.sum(), pmf / pmf.sum())

    if verbose:
        print(f"Estimated rate (lambda): {rate}")
        print(f"Chi-sq: {statistic}")
        print(f"p: {p_value}")

    return p_value > significance



def test_data_predicates():
    """
    Smoke-test every predicate in this module with one accepting and one
    rejecting example each, then print a confirmation message.

    Random fixtures for ``approx_poisson`` come from a seeded generator so
    that repeated runs see the same data.
    """
    # One assertion per expectation, so a failure names the exact predicate
    # and case.
    assert strictly_increasing(np.array([1, 2, 3]))
    assert not strictly_increasing(np.array([1, 1, 2]))

    assert strictly_decreasing(np.array([3, 2, 1]))
    assert not strictly_decreasing(np.array([3, 3, 2]))

    assert monotonically_increasing(np.array([1, 1, 2]))
    assert not monotonically_increasing(np.array([1, 0, 1]))

    assert monotonically_decreasing(np.array([2, 2, 1]))
    assert not monotonically_decreasing(np.array([1, 2, 1]))

    assert all_positive(np.array([1, 2, 3]))
    assert not all_positive(np.array([1, 1, 0]))

    assert all_negative(np.array([-1, -2, -3]))
    assert not all_negative(np.array([-1, -1, 0]))

    assert all_nonnegative(np.array([0, 1, 2]))
    assert not all_nonnegative(np.array([-1, 0, 1]))

    assert all_nonpositive(np.array([0, -1, -2]))
    assert not all_nonpositive(np.array([-1, 0, 1]))

    # Fixed seed: the fixtures are reproducible and the global numpy random
    # state is left alone.
    rng = np.random.default_rng(0)
    poisson_data = rng.poisson(lam=2, size=1000)
    geom_data = rng.geometric(p=0.1, size=1000)
    assert approx_poisson(poisson_data)
    assert not approx_poisson(geom_data)

    print("Passed all tests.")


# When this file is executed as a script (rather than imported), run
# test_data_predicates().
if __name__ == "__main__":
    test_data_predicates()