"""
Data Predicates
Defines several assertions about data that are useful for tests,
e.g. checks for monotonicity, whether the data approximately follows a Poisson distribution, etc.
All functions expect a 1D numpy array as first parameter.
TODO:
- implement faster numpy-based solution for tests of increasing/decreasing
"""
import numpy as np
from scipy.stats import chisquare, poisson
from collections import Counter
[docs]
def strictly_increasing(data):
    """Return True if each element of ``data`` is strictly greater than the one before it.

    Args:
        data: 1D array-like of mutually comparable values.

    Returns:
        bool: True when ``data[i] < data[i + 1]`` for every consecutive pair.
        Empty and single-element inputs have no pairs and therefore yield True.
    """
    values = np.asarray(data)
    # Compare every element with its successor in one vectorised operation
    # instead of a Python-level loop. Slices are compared directly (rather than
    # testing np.diff) so unsigned integer dtypes cannot wrap around.
    return bool(np.all(values[:-1] < values[1:]))
[docs]
def strictly_decreasing(data):
    """Return True if each element of ``data`` is strictly smaller than the one before it.

    Args:
        data: 1D array-like of mutually comparable values.

    Returns:
        bool: True when ``data[i] > data[i + 1]`` for every consecutive pair.
        Empty and single-element inputs have no pairs and therefore yield True.
    """
    values = np.asarray(data)
    # Vectorised pairwise comparison; slices are compared directly (rather than
    # testing np.diff) so unsigned integer dtypes cannot wrap around.
    return bool(np.all(values[:-1] > values[1:]))
[docs]
def monotonically_increasing(data):
    """Return True if ``data`` never decreases from one element to the next.

    Args:
        data: 1D array-like of mutually comparable values.

    Returns:
        bool: True when ``data[i] <= data[i + 1]`` for every consecutive pair
        (equal neighbours are allowed). Empty and single-element inputs have
        no pairs and therefore yield True.
    """
    values = np.asarray(data)
    # Vectorised pairwise comparison; slices are compared directly (rather than
    # testing np.diff) so unsigned integer dtypes cannot wrap around.
    return bool(np.all(values[:-1] <= values[1:]))
[docs]
def monotonically_decreasing(data):
    """Return True if ``data`` never increases from one element to the next.

    Args:
        data: 1D array-like of mutually comparable values.

    Returns:
        bool: True when ``data[i] >= data[i + 1]`` for every consecutive pair
        (equal neighbours are allowed). Empty and single-element inputs have
        no pairs and therefore yield True.
    """
    values = np.asarray(data)
    # Vectorised pairwise comparison; slices are compared directly (rather than
    # testing np.diff) so unsigned integer dtypes cannot wrap around.
    return bool(np.all(values[:-1] >= values[1:]))
[docs]
def all_positive(data):
    """Return whether every element of ``data`` is strictly greater than zero.

    Args:
        data: array or array-like (list, tuple, ...) of numbers.

    Returns:
        The result of ``np.all`` over the element-wise comparison: a NumPy
        boolean that is truthy when all elements are > 0. Empty input yields
        True.
    """
    # Coerce first so plain Python sequences work too; a bare ``list > 0``
    # raises TypeError. asanyarray leaves ndarray subclasses untouched.
    return np.all(np.asanyarray(data) > 0)
[docs]
def all_negative(data):
    """Return whether every element of ``data`` is strictly less than zero.

    Args:
        data: array or array-like (list, tuple, ...) of numbers.

    Returns:
        The result of ``np.all`` over the element-wise comparison: a NumPy
        boolean that is truthy when all elements are < 0. Empty input yields
        True.
    """
    # Coerce first so plain Python sequences work too; a bare ``list < 0``
    # raises TypeError. asanyarray leaves ndarray subclasses untouched.
    return np.all(np.asanyarray(data) < 0)
[docs]
def all_nonnegative(data):
    """Return whether every element of ``data`` is greater than or equal to zero.

    Args:
        data: array or array-like (list, tuple, ...) of numbers.

    Returns:
        The result of ``np.all`` over the element-wise comparison: a NumPy
        boolean that is truthy when all elements are >= 0. Empty input yields
        True.
    """
    # Coerce first so plain Python sequences work too; a bare ``list >= 0``
    # raises TypeError. asanyarray leaves ndarray subclasses untouched.
    return np.all(np.asanyarray(data) >= 0)
[docs]
def all_nonpositive(data):
    """Return whether every element of ``data`` is less than or equal to zero.

    Args:
        data: array or array-like (list, tuple, ...) of numbers.

    Returns:
        The result of ``np.all`` over the element-wise comparison: a NumPy
        boolean that is truthy when all elements are <= 0. Empty input yields
        True.
    """
    # Coerce first so plain Python sequences work too; a bare ``list <= 0``
    # raises TypeError. asanyarray leaves ndarray subclasses untouched.
    return np.all(np.asanyarray(data) <= 0)
[docs]
def _pool_sparse_tails(observed, expected, min_expected=5):
    """Merge outer bins whose expected count is below ``min_expected`` into their inner neighbour.

    Pearson's chi-square approximation is unreliable for bins with a very
    small expected count, so thin tail bins are pooled. At least two bins
    are always kept so that a test remains possible.
    """
    obs = list(observed)
    exp = list(expected)
    # Upper tail: fold the last bin into the one before it.
    while len(exp) > 2 and exp[-1] < min_expected:
        spill_exp = exp.pop()
        spill_obs = obs.pop()
        exp[-1] += spill_exp
        obs[-1] += spill_obs
    # Lower tail: fold the first bin into the one after it.
    while len(exp) > 2 and exp[0] < min_expected:
        spill_exp = exp.pop(0)
        spill_obs = obs.pop(0)
        exp[0] += spill_exp
        obs[0] += spill_obs
    return np.array(obs, dtype=float), np.array(exp, dtype=float)


def approx_poisson(data, rate=None, significance=0.05, verbose=False):
    """
    Test whether data appears to follow a Poisson distribution, using a Chi-sq goodness-of-fit test.

    The observed frequency of each value 0..max(data) is compared with the
    frequency expected under Poisson(rate), renormalised over that range.
    Tail bins with an expected frequency below 5 are pooled before testing.
    Because the decision is a significance test, a genuinely Poisson sample is
    still rejected with probability of roughly ``significance``.

    Args:
        data: 1D array where entry i is the number of events observed in interval i
            (non-negative integers).
        rate: rate (lambda) of the Poisson distribution against which to compare.
            If None, rate is estimated from the data as its mean.
        significance: for p > significance, fail to reject the hypothesis that the
            data is Poisson-distributed (i.e. return True).
        verbose: if True, prints the rate used and the results (chi-sq, p-value)
            of the goodness-of-fit test.

    Returns:
        bool: True if the p-value exceeds ``significance``. False otherwise,
        including when ``data`` contains negative values or the p-value is
        undefined (NaN).

    Raises:
        ValueError: if ``data`` is empty.
    """
    values = np.asarray(data)
    if values.size == 0:
        raise ValueError("approx_poisson requires at least one observation")
    # A Poisson variable is supported on the non-negative integers only, so any
    # negative observation rules the distribution out immediately.
    if np.any(values < 0):
        return False

    rate_was_estimated = rate is None
    if rate_was_estimated:
        rate = np.mean(values)

    # Observed frequency of every value 0..max; Counter yields 0 for absent values.
    tally = Counter(values.tolist())
    observed = np.array([tally[k] for k in range(max(tally) + 1)], dtype=float)

    # Expected frequencies on the same support. chisquare must be given
    # frequencies (counts), not proportions: the statistic scales with the
    # sample size, so feeding proportions shrinks it by a factor of n and
    # makes the test accept almost anything.
    pmf = poisson(rate).pmf(np.arange(observed.size))
    expected = observed.sum() * pmf / pmf.sum()

    observed, expected = _pool_sparse_tails(observed, expected)

    # Estimating the rate from the data costs one degree of freedom, provided
    # there is one to spare after pooling.
    ddof = 1 if rate_was_estimated and observed.size > 2 else 0
    statistic, p_value = chisquare(observed, expected, ddof=ddof)

    if verbose:
        print(f"Estimated rate (lambda): {rate}")
        print(f"Chi-sq: {statistic}")
        print(f"p: {p_value}")
    # NaN p-values (degenerate inputs) compare False and are reported as "not Poisson".
    return bool(p_value > significance)
def test_data_predicates():
    """Self-test exercising every predicate in this module on small hand-made inputs.

    Each claim gets its own ``assert`` so a failure points at the exact
    predicate and input. The random samples for the Poisson check come from a
    seeded generator so that the outcome is reproducible from run to run.
    """
    assert strictly_increasing(np.array([1, 2, 3]))
    assert not strictly_increasing(np.array([1, 1, 2]))

    assert strictly_decreasing(np.array([3, 2, 1]))
    assert not strictly_decreasing(np.array([3, 3, 2]))

    assert monotonically_increasing(np.array([1, 1, 2]))
    assert not monotonically_increasing(np.array([1, 0, 1]))

    assert monotonically_decreasing(np.array([2, 2, 1]))
    assert not monotonically_decreasing(np.array([1, 2, 1]))

    assert all_positive(np.array([1, 2, 3]))
    assert not all_positive(np.array([1, 1, 0]))

    assert all_negative(np.array([-1, -2, -3]))
    assert not all_negative(np.array([-1, -1, 0]))

    assert all_nonnegative(np.array([0, 1, 2]))
    assert not all_nonnegative(np.array([-1, 0, 1]))

    assert all_nonpositive(np.array([0, -1, -2]))
    assert not all_nonpositive(np.array([-1, 0, 1]))

    # Fixed seed: the statistical checks below must not depend on the luck of the draw.
    rng = np.random.default_rng(0)
    poisson_data = rng.poisson(lam=2, size=1000)
    geom_data = rng.geometric(p=0.1, size=1000)
    # A goodness-of-fit test rejects a true Poisson sample with probability of
    # about `significance`, so a strict level is used for the acceptance check.
    assert approx_poisson(poisson_data, significance=0.01)
    assert not approx_poisson(geom_data)

    print("Passed all tests.")
# Script entry point: run the module's self-test when executed directly
# (nothing happens on import).
if __name__ == "__main__":
    test_data_predicates()