"""
Subclasses of DictWriter and DictReader that parse plaintext as JSON strings,
allowing for basic type parsing and fields that are dictionaries or lists. The
reader also supports units and comment lines.
"""
from contextlib import contextmanager
import csv
import io
from itertools import filterfalse
import json
import re
import numpy as np
from typing import Any, Iterator
from wholecell.utils import units
CSV_DIALECT = csv.excel_tab
# [docs]  (stray Sphinx source-link anchor left by HTML extraction)
class JsonWriter(csv.DictWriter):
    """Writer for a .tsv file to be read by JsonReader.

    Writes a quoted header row and JSON-encoded dict rows in TSV format
    with UTF-8 text.

    NOTE: The caller needs to remove units from the dict values and add
    them to the fieldnames. JsonWriter does not handle units.
    """

    def __init__(self, *args, **kwargs):
        """Construct the writer.

        The first positional argument should be a file-like writer open in
        text mode. By default, dialect=CSV_DIALECT, which is excel-tab.
        """
        kwargs.setdefault("dialect", CSV_DIALECT)
        # Positional args (file object, fieldnames, ...) pass through to
        # csv.DictWriter; the quoting settings are fixed for JsonReader
        # compatibility.
        super().__init__(
            *args,
            quotechar="'",
            quoting=csv.QUOTE_MINIMAL,
            lineterminator="\n",
            **kwargs,
        )

    def _dict_to_list(self, rowdict):
        """JSON-encode each row value (converting any ndarray to a list)
        before delegating to csv.DictWriter's row formatting."""
        encoded_row = {
            key: json.dumps(array_to_list(value), ensure_ascii=False)
            for key, value in rowdict.items()
        }
        # noinspection PyUnresolvedReferences
        return super()._dict_to_list(encoded_row)
# [docs]  (stray Sphinx source-link anchor left by HTML extraction)
class JsonReader(object):
    def __init__(self, *args, **kwargs):
        """
        Reader for a .tsv file that supports units and json-coded values.

        Units are denoted with a fieldname in the format 'name (units)' e.g.
        "flux standard deviation (units.mmol / units.g / units.h)". Fields
        whose names start with an underscore are removed from self._fieldnames,
        and discarded from each row during iteration.

        The first argument should be a file-like reader open in text mode.
        By default, dialect=CSV_DIALECT, which is excel-tab.
        """
        kwargs.setdefault("dialect", CSV_DIALECT)
        self.tsv_dict_reader = csv.DictReader(
            *args, quotechar="'", quoting=csv.QUOTE_MINIMAL, **kwargs
        )

        # Strip extra quotes from the field names.
        fieldnames = [
            fieldname.strip('"') for fieldname in self.tsv_dict_reader.fieldnames
        ]
        self.tsv_dict_reader.fieldnames = fieldnames

        # Discard private field names that begin with underscore and empty
        # field names.
        self._fieldnames = [
            fieldname
            for fieldname in fieldnames
            if not fieldname.startswith("_") and fieldname != ""
        ]

    def __iter__(self):
        return self

    def __next__(self) -> dict[str, Any]:
        """Read, decode, and return the next row as a dict."""
        return self._decode_row(next(self.tsv_dict_reader))

    @property
    def fieldnames(self) -> list[str]:
        """The public (non-underscore, non-empty) field names."""
        return self._fieldnames

    def _decode_row(self, row_dict: dict[str, str]) -> dict[str, Any]:
        """Decode a DictReader row: JSON-decode each public field's value,
        then apply units for any fieldname shaped like 'name (units expr)'.

        NOTE: Each returned row contains unicode/str keys and values.

        Raises:
            ValueError: if a field's text won't parse as JSON.
        """
        attributeDict: dict[str, Any] = {}

        for fieldname in self._fieldnames:
            raw_value = row_dict[fieldname]
            try:
                # An empty field decodes to '' rather than raising.
                value = json.loads(raw_value) if raw_value else ""
            except (ValueError, TypeError) as e:
                # Chain the original JSON error for debugging context.
                raise ValueError(
                    "failed to parse json string:{}".format(raw_value)
                ) from e

            match = re.search(r"(.*?) \((.*?)\)", fieldname)
            if match:
                # Entry has units, so apply the units to the value and strip
                # them from the key.
                _ = units  # don't warn about `units`; it's imported for eval()
                attribute = match.group(1)
                # SECURITY NOTE: eval() executes the units expression taken
                # from the file header; only read trusted .tsv files.
                value_units = eval(match.group(2))

                # Units do not work with empty values.
                if value != "":
                    if isinstance(value, dict):
                        # Apply units to each dictionary value.
                        for k, v in value.items():
                            value[k] = value_units * v
                            value[k].normalize()
                    else:
                        # Units do not work with lists so need to convert to
                        # ndarray.
                        if isinstance(value, list):
                            value_with_units = value_units * np.array(value)
                        else:
                            value_with_units = value_units * value
                        # Normalize to catch any unit issues now instead of
                        # later.
                        value = value_with_units.normalize()

                attributeDict[attribute] = value
            else:
                attributeDict[fieldname] = value

        return attributeDict

    @property
    def dialect(self) -> csv.Dialect:
        return self.tsv_dict_reader.dialect

    @property
    def line_num(self) -> int:
        """Line number of the last line read from the underlying file."""
        return self.tsv_dict_reader.line_num
# TODO(jerry): Implementing this on wholecell.io.tsv.dict_reader/dict_writer
# would simplify it a little.
# [docs]  (stray Sphinx source-link anchor left by HTML extraction)
@contextmanager
def tsv_reader(filename: str) -> Iterator[JsonReader]:
    """Context manager yielding a TSV JsonReader over `filename`, with an
    input filter that drops comment lines before they reach the reader.
    """
    # NOTE: Python 3 csv requires the file opened as text 'r' with the right
    # character encoding; the file can be in UTF-8 or its ASCII subset.
    with io.open(filename, mode="r", encoding="utf-8", newline="") as stream:
        yield JsonReader(filterfalse(comment_line, stream), dialect=CSV_DIALECT)
# [docs]  (stray Sphinx source-link anchor left by HTML extraction)
def read_tsv(filename: str) -> list[dict[str, Any]]:
    """Read an entire .tsv file via JsonReader, skipping comment lines, and
    return all of its decoded rows."""
    with tsv_reader(filename) as rows:
        return [*rows]
# [docs]  (stray Sphinx source-link anchor left by HTML extraction)
@contextmanager
def tsv_writer(filename: str, fieldnames: list[str]) -> Iterator[JsonWriter]:
    """Context manager yielding a TSV JsonWriter on `filename` with the
    header row already written. Just call its writerow() and writerows()
    methods.
    """
    # NOTE: Python 3 csv requires the file opened as text 'w' with the right
    # character encoding.
    field_list = list(fieldnames)  # snapshot in case the caller mutates it
    with io.open(filename, mode="w", encoding="utf-8", newline="") as out:
        json_writer = JsonWriter(out, field_list, dialect=CSV_DIALECT)
        json_writer.writeheader()
        yield json_writer
        out.flush()
# [docs]  (stray Sphinx source-link anchor left by HTML extraction)
def array_to_list(value):
    """Return `value` with a numpy ndarray converted to a (possibly nested)
    Python list; any other value passes through unchanged."""
    if not isinstance(value, np.ndarray):
        return value
    return value.tolist()