"""
Compare two .cPickle files or all the .cPickle files in a pair of directories.
Show the differences or optionally just a count of difference lines.
Usage (PATH is a path like 'out/manual/intermediates'):
runscripts/debug/comparePickles.py PATH1 PATH2
"""
import argparse
from collections.abc import Mapping, Sequence
import functools
import numbers
import os
import pickle
from pprint import pformat
import re
import sys
import types
from typing import Dict
import Bio.Seq
import numpy as np
import scipy.interpolate
import sympy
from sympy.matrices import dense
import unum
from wholecell.utils import constants
import wholecell.utils.unit_struct_array
NULP = 0 # float comparison tolerance, in Number of Units in the Last Place
# Objects with a list of attributes to compare
SPECIAL_OBJECTS = {
scipy.interpolate.CubicSpline: ["x", "c", "axis"],
wholecell.utils.unit_struct_array.UnitStructArray: ["struct_array", "units"],
}
LEAF_TYPES = (
unum.Unum,
Bio.Seq.Seq,
sympy.Basic,
numbers.Number,
functools.partial,
types.FunctionType,
dense.MutableDenseMatrix,
wholecell.utils.unit_struct_array.UnitStructArray,
)
WHITESPACE = re.compile(r"\s+")
[docs]
class Repr(object):
"""A Repr has the given repr() string without quotes and != any other value."""
def __init__(self, repr_):
self.repr_ = repr_
def __repr__(self):
return self.repr_
[docs]
def has_python_vars(obj):
"""
Returns true if the given object has any Python instance variables, that is
ordinary fields or compact slots. If not, it's presumably a built-in type
or extension type implemented entirely in C and Cython.
"""
return hasattr(obj, "__dict__") or hasattr(obj, "__slots__")
[docs]
def all_vars(obj):
"""
Returns a dict of all the object's instance variables stored in ordinary
fields and in compact slots. This expands on the built-in function `vars()`.
If the object implements the pickling method `__getstate__`, call that
instead to get its defining state.
"""
if hasattr(obj, "__getstate__"):
# noinspection PyCallingNonCallable
return obj.__getstate__()
attrs = getattr(obj, "__dict__", {})
attrs.update({key: getattr(obj, key) for key in getattr(obj, "__slots__", ())})
return attrs
[docs]
def is_leaf(value, leaves=LEAF_TYPES):
"""
Predicate to determine if we have reached the end of how deep we want to traverse
through the object tree.
"""
if isinstance(value, (Mapping, Sequence)):
return isinstance(value, (bytes, str))
return (
callable(value) # it's callable
or isinstance(value, leaves) # it's an instance of a declared leaf type
or not has_python_vars(value)
) # an object without Python instance variables
[docs]
def object_tree(obj, path="", debug=None):
"""
Diagnostic tool to inspect a complex data structure.
Given an object, exhaustively traverse down all attributes it contains until leaves
are reached, and convert everything found into a dictionary or a list. The resulting
dictionary will mirror the structure of the original object, but instead of
attributes with values it will be a dictionary where the keys are the attribute
names. The type of the dictionarified object will be encoded under the key `!type`
which is assumed to not be in conflict with any other attributes. The result should
aid in serialization and deserialization of the object and is intended to be a
translation of a pickled object.
Args:
obj (object): The object to inspect.
path (optional str): The root path of this object tree. This will be built upon
for each child of the current object found and reported in a value is
provided for `debug`.
debug (optional str): If provided, prints paths of the attributes encountered.
If the value is 'ALL', it will print every path. If the value is 'CALLABLE',
it will only print methods and functions it finds.
"""
if debug == "ALL":
print(path)
if is_leaf(obj):
if callable(obj) and (debug == "CALLABLE"):
print("{}: {}".format(path, obj))
return obj
elif isinstance(obj, Mapping):
return {
key: object_tree(value, "{}['{}']".format(path, key), debug)
for (key, value) in obj.items()
}
elif isinstance(obj, Sequence):
return [
object_tree(subobj, "{}[{}]".format(path, index), debug)
for index, subobj in enumerate(obj)
]
else:
attrs = all_vars(obj)
tree = {
key: object_tree(value, "{}.{}".format(path, key), debug)
for (key, value) in attrs.items()
}
tree["!type"] = type(obj)
return tree
[docs]
def size_tree(o, cutoff=0.1):
"""
Find the size of attributes in an object tree. Sizes greater than the cutoff
(in MB) will be returned for displaying. Sizes include all values contained
within an attribute (eg. a Dict will be represented by the size of all keys
and values in addition to the Dict size itself).
TODO: double check total size vs disk size - might be missing some types
"""
def return_val(total, value):
if total > cutoff and value:
return total, value
else:
return (total,)
def get_size(o):
return sys.getsizeof(o) / 2**20 # convert to MB
size = get_size(o)
# special handling of leaf to get size of defining attributes
if isinstance(o, unum.Unum):
size += size_tree(o._unit)[0]
size += get_size(o._value)
return (size,)
# special handling of leaf to get size of str sequence
elif isinstance(o, Bio.Seq.Seq):
size += get_size(o._data)
return (size,)
# special handling of leaf, each entry is allocated the same amount of space
elif isinstance(o, wholecell.utils.unit_struct_array.UnitStructArray):
size += size_tree(o.units)[0]
n_entries = len(o.struct_array)
if n_entries:
size += get_size(o.struct_array[0]) * n_entries
return (size,)
# if a special object, check predefined attributes for equality
elif type(o) in SPECIAL_OBJECTS:
sizes = {}
attrs = SPECIAL_OBJECTS[type(o)]
for attr in attrs:
subsizes = size_tree(getattr(o, attr), cutoff)
size += subsizes[0]
if subsizes[0] > cutoff:
formatted = float("{:.2f}".format(subsizes[0]))
if len(subsizes) == 1:
val = formatted
else:
val = (formatted, subsizes[1])
sizes[attr] = val
return return_val(size, sizes)
# if it is a leaf, just return the size
# TODO: any special handling for types that are not already accounted for above
elif is_leaf(o):
return (size,)
# if it is a dictionary, then get the size of keys and values
elif isinstance(o, Mapping):
sizes = {}
total_size = size
for key, value in o.items():
subsizes = size_tree(value, cutoff)
entry_size = subsizes[0] + get_size(key)
total_size += entry_size
if entry_size > cutoff:
formatted = float("{:.2f}".format(entry_size))
if len(subsizes) == 1:
val = formatted
else:
val = (formatted, subsizes[1])
sizes[key] = val
return return_val(total_size, sizes)
# if it is a sequence, then get the size of each element
elif isinstance(o, Sequence):
sizes = []
total_size = size
for value in o:
subsizes = size_tree(value, cutoff)
total_size += subsizes[0]
if subsizes[0] > cutoff:
formatted = float("{:.2f}".format(subsizes[0]))
if len(subsizes) == 1:
val = formatted
else:
val = (formatted, subsizes[1])
sizes.append(val)
return return_val(total_size, sizes)
else:
return (size,)
[docs]
def _are_instances_of(a, b, a_type):
"""
Return True if `a` and `b` are both instances of the given type (or tuple
of types).
"""
return isinstance(a, a_type) and isinstance(b, a_type)
[docs]
def diff_trees(a, b):
"""
Find the differences between two trees or leaf nodes a and b. Return a
falsey value if the inputs match OR a truthy value that explains or
summarizes their differences, where each point in the tree where the inputs
differ will be a tuple (a's value, b's value, optional description).
Floating point numbers are compared with the tolerance set by the constant
NULP (Number of Units in the Last Place), allowing for NaN and infinite
values. (Adjust the tolerance level NULP if needed.)
This operation is symmetrical.
"""
# treat str and Python 2 unicode as the same leaf type
# ditto for int and Python 2 long
if _are_instances_of(a, b, str) or _are_instances_of(a, b, int):
if a != b:
return elide(a), elide(b)
# if they aren't they same type, they are clearly different. Also this lets us
# safely assume throughout the rest of the function that a and b are the same type
elif type(a) is not type(b):
return elide(a, max_len=400), elide(b, max_len=400)
# if they are floats, handle various kinds of values
elif isinstance(a, float):
return compare_floats(a, b)
# if they are numpy arrays, compare them using a numpy testing function
elif isinstance(a, np.ndarray):
return compare_ndarrays(a, b)
# if they are Unums compare their contents with matching units
elif isinstance(a, unum.Unum):
a0, b0 = a.matchUnits(b)
return diff_trees(a0.asNumber(), b0.asNumber())
# if a special object, check predefined attributes for equality
elif type(a) in SPECIAL_OBJECTS:
diff = {}
attrs = SPECIAL_OBJECTS[type(a)]
for attr in attrs:
subdiff = diff_trees(getattr(a, attr), getattr(b, attr))
if subdiff:
diff[attr] = subdiff
return diff
# if they are leafs (including strings) use python equality comparison
elif is_leaf(a):
if a != b:
return elide(a), elide(b)
# if they are dictionaries then diff the value under each key
elif isinstance(a, Mapping):
diff = {}
na = Repr("--")
nb = Repr("--")
for key in set(a.keys()) | set(b.keys()):
subdiff = diff_trees(a.get(key, na), b.get(key, nb))
if subdiff:
diff[key] = subdiff
return diff
# if they are sequences then compare each element at each index
elif isinstance(a, Sequence):
if len(a) > len(b):
b = list(b) + (len(a) - len(b)) * [Repr("--")]
elif len(b) > len(a):
a = list(a) + (len(b) - len(a)) * [Repr("--")]
diff = []
for index in range(len(a)):
subdiff = diff_trees(a[index], b[index])
if subdiff:
diff.append(subdiff)
return diff
# this should never happen
else:
print("value not considered by `diff_trees`: {} {}".format(a, b))
[docs]
def elide(value, max_len=200):
"""Return a value with the same repr but elided if it'd be longer than max."""
repr_ = repr(value)
if len(repr_) > max_len:
return Repr(repr_[:max_len] + "...")
return value
[docs]
def simplify_error_message(message):
return elide(Repr(WHITESPACE.sub(" ", message).strip()))
[docs]
def compare_floats(f1, f2):
"""Compare two floats, allowing some tolerance, NaN, and Inf values. This
considers all types of NaN to match.
Return 0.0 (which is falsey) if they match, else (f1, f2).
"""
if f1 == f2 or np.isnan(f1) and np.isnan(f2):
return 0.0
try:
np.testing.assert_array_almost_equal_nulp(f1, f2, nulp=NULP)
return 0.0
except AssertionError:
# FWIW, the string error.args[0] tells the NULP difference.
return f1, f2
[docs]
def compare_ndarrays(array1, array2):
"""Compare two ndarrays, checking the shape and all elements, allowing for
NaN values and non-numeric values. Return () if they match, else a tuple of
diff info or just a diff description.
TODO(jerry): Allow tolerance for float elements of structured arrays and
handle NaN and Inf values.
"""
def summarize_array(ndarray):
return Repr(f"array({ndarray.shape} {ndarray.dtype})")
if array1.shape != array2.shape:
return summarize_array(array1), summarize_array(array2)
object_dtype = np.dtype(object)
if issubclass(array1.dtype.type, np.floating):
try:
# This handles float tolerance but not NaN and Inf.
with np.errstate(invalid="ignore"):
np.testing.assert_array_almost_equal_nulp(array1, array2, nulp=NULP)
return ()
except AssertionError as _:
# return elide(array1), elide(array2), simplify_error_message(e.args[0])
pass # try again, below
# Handle ragged arrays created with an object dtype
elif array1.dtype == object_dtype and array2.dtype == object_dtype:
try:
assert array1.shape == array2.shape
for sub1, sub2 in zip(array1, array2):
np.testing.assert_equal(sub1, sub2)
return ()
except AssertionError as e:
return simplify_error_message(e.args[0])
try:
# This handles non-float dtypes, also NaN and Inf, but no tolerance.
np.testing.assert_array_equal(array1, array2)
return ()
except AssertionError as e:
return simplify_error_message(e.args[0])
[docs]
def load_tree(path):
"""Load a .cPickle file as an object_tree."""
with open(path, "rb") as f:
data = pickle.load(f, fix_imports=True, encoding="latin1")
return object_tree(data)
[docs]
def load_fit_tree(out_subdir):
"""Load the parameter calculator's (Parca's) output as an object_tree."""
# For convenience, optionally add the prefix 'out/'.
if not os.path.isabs(out_subdir) and not os.path.isdir(out_subdir):
out_subdir = os.path.join("out", out_subdir)
path = os.path.join(
out_subdir, constants.KB_DIR, constants.SERIALIZED_SIM_DATA_FILENAME
)
return load_tree(path)
[docs]
def pprint_diffs(diffs, *, width=160, print_diff_lines=True, print_count=True):
"""Pretty-print the diff info: optionally print the detailed diff lines,
optionally print the diff line count as a single figure of merit; then
return the line count.
"""
if diffs:
diff_lines = pformat(diffs, width=width)
if print_diff_lines:
print(diff_lines)
line_count = len(diff_lines.strip().splitlines())
else:
line_count = 0
if print_count:
print("==> lines of differences: {}".format(line_count))
return line_count
[docs]
def diff_files(path1: str, path2: str, print_diff_lines: bool = True) -> int:
"""Diff the pair of named pickle files. Return the diff line count."""
tree1 = load_tree(path1)
tree2 = load_tree(path2)
diffs = diff_trees(tree1, tree2)
return pprint_diffs(diffs, print_diff_lines=print_diff_lines)
[docs]
def list_pickles(directory: str) -> Dict[str, str]:
"""Return a map of .cPickle file names to paths in the given directory
sorted by file modification time then by filename."""
entries = [
(entry.stat().st_mtime, entry.name, entry.path)
for entry in os.scandir(directory)
if entry.is_file() and entry.name.endswith(".cPickle")
]
files = {e[1]: e[2] for e in sorted(entries)}
return files
[docs]
def diff_dirs(dir1: str, dir2: str, print_diff_lines: bool = True) -> int:
"""Diff the pickle files in the pair of named directories. Return the total
diff line count."""
print(f'Comparing pickle files in "{dir1}" vs. "{dir2}".')
pickles1 = list_pickles(dir1)
pickles2 = list_pickles(dir2)
count = 0
for name, path1 in pickles1.items():
print(f'\n*** {name} {"*" * (75 - len(name))}')
path2 = pickles2.get(name)
if path2:
count += diff_files(path1, path2, print_diff_lines)
else:
print(f"{name} is in {dir1} but not {dir2}")
count += 1
only_in_dir2 = pickles2.keys() - pickles1.keys()
if only_in_dir2:
print(
f"\n*** Pickle files in {dir2} but not {dir1}:\n" f"{sorted(only_in_dir2)}"
)
count += len(only_in_dir2)
print(
f"\n====> Total differences: {count} lines for {len(pickles1)} pickle"
f" files in {dir1} against {len(pickles2)} pickle files in {dir2}."
)
return count
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Compare two .cPickle files"
" or all the .cPickle files in two directories (in"
" modification-time order)."
" Print a count and optionally a summary of the differences."
)
parser.add_argument(
"-c",
"--count",
action="store_true",
help="Print just the diff line count for each file, skipping the"
" detailed diff lines.",
)
parser.add_argument(
"-f",
"--final-sim-data",
action="store_true",
help="Append /kb/simData.cPickle to the two PATH args to make it a"
" little easier compare the final Parca output sim_data.",
)
parser.add_argument(
"path",
metavar="PATH",
nargs=2,
help="The two pickle files or directories to compare.",
)
args = parser.parse_args()
path1, path2 = args.path
if args.final_sim_data:
path1 = os.path.join(
path1, constants.KB_DIR, constants.SERIALIZED_SIM_DATA_FILENAME
)
path2 = os.path.join(
path2, constants.KB_DIR, constants.SERIALIZED_SIM_DATA_FILENAME
)
if os.path.isfile(path1):
diff_count = diff_files(path1, path2, print_diff_lines=not args.count)
else:
diff_count = diff_dirs(path1, path2, print_diff_lines=not args.count)
sys.exit(3 if diff_count else 0)