#! /usr/bin/env python
"""
Aggregate data from Javi's repo to compare to fold_changes.tsv included in wcm.
"""
import argparse
import io
import os
import numpy as np
from reconstruction.spreadsheets import tsv_writer
from wholecell.io import tsv
from typing import Any, Iterable
FILE_LOCATION = os.path.dirname(os.path.realpath(__file__))
SRC_FILE = os.path.join(FILE_LOCATION, "kb.tsv")
WCM_FILE = os.path.join(FILE_LOCATION, "fold_changes.tsv")
SHIFTS_FILE = os.path.join(FILE_LOCATION, "shifts.tsv")
GENES_FILES = os.path.join(FILE_LOCATION, "gene_names.tsv")
OUTPUT_FILE = os.path.join(FILE_LOCATION, "updated_fold_changes.tsv")
[docs]
def load_file(filename: str) -> list[list[str]]:
"""Load a tsv file."""
with io.open(filename, "rb") as f:
reader: Iterable = tsv.reader(f)
return list(reader)
[docs]
def load_src(attempt_match: bool) -> dict[str, dict[str, dict[str, float]]]:
"""
Loads and extracts gene regulation data from the source data.
Args:
attempt_match: if True, handles data in way that was most likely the
original processing, otherwise uses expected directionality
Returns:
data: mean and standard deviation for each regulatory pair
{TF gene name: {regulated gene: {'mean': mean, 'std': std}}}
"""
raw_data = load_file(SRC_FILE)
shifts = load_shifts()
# Extract fold changes from data
data: dict[str, dict[str, list[float]]] = {}
for line in raw_data:
# Columns of interest
condition = (line[4], line[5])
tf = line[7]
regulated_gene = line[9]
magnitude = float(line[10])
if tf not in data:
data[tf] = {}
direction = shifts[condition][tf]
data[tf][regulated_gene] = data[tf].get(regulated_gene, []) + [
direction * magnitude
]
# Calculate mean and std from data
processed_data = {}
for tf, regulated in data.items():
tf_data = {}
for gene, fcs in regulated.items():
if attempt_match:
fcs1 = np.abs(
fcs
) # Bad!! - ignores annotated condition comparison regulation direction
else:
fcs1 = np.array(fcs)
tf_data[gene] = {
"mean": float(np.mean(fcs1)),
"std": float(np.std(fcs1, ddof=1)),
}
processed_data[tf] = tf_data
return processed_data
[docs]
def load_wcm(attempt_match: bool) -> dict[str, dict[str, dict[str, float]]]:
"""
Loads and extracts gene regulation data from the whole-cell model.
Args:
attempt_match: if True, handles data in way that was most likely the
original processing, otherwise uses expected directionality
Returns:
data: mean and standard deviation for each regulatory pair
{TF gene name: {regulated gene: {'mean': mean, 'std': std}}}
"""
raw_data = load_file(WCM_FILE)[1:]
# Extract mean and std from data
data: dict[str, dict[str, dict[str, Any]]] = {}
for line in raw_data:
if line[0].startswith("#"):
continue
# Columns of interest
tf = line[0].strip()
regulated_gene = line[1].strip()
mean = float(line[2])
std = float(line[3])
if attempt_match:
sign = 1
else:
sign = int(np.sign(float(line[5])))
uncertain = float(line[5]) > 2
if tf not in data:
data[tf] = {}
data[tf][regulated_gene] = {
"mean": sign * mean,
"std": std,
"uncertain": uncertain,
}
return data
[docs]
def load_shifts() -> dict[tuple[str, str], dict[str, int]]:
"""
Load TF regulation information for each shift.
Returns:
shifts: gene regulation direction for each condition
1 for active TF in condition1 vs condition2
-1 for active TF in condition2 vs condition1
{(condition1, condition2): {TF1: 1, TF2: -1, ...}}
"""
genes = {int(line[0]): line[2] for line in load_file(GENES_FILES)}
shifts = {}
for line in load_file(SHIFTS_FILE):
condition = (line[1], line[2])
gene_dir = {
genes[np.abs(int(v))]: int(np.sign(int(v))) for v in line[12:26] if v != "0"
}
shifts[condition] = gene_dir
return shifts
[docs]
def compare_data(
data1: dict[str, dict[str, dict[str, float]]],
data2: dict[str, dict[str, dict[str, float]]],
desc: str,
verbose: bool = True,
):
"""
Compares regulation from source data to wcm data.
Args:
data1: mean and standard deviation for each regulatory pair
data2: mean and standard deviation for each regulatory pair
desc: description for comparison
verbose: if True, prints additional regulation information
Notes:
dictionary structure for both inputs:
{TF gene name: {regulated gene: {'mean': mean, 'std': std}}}
"""
# Track statistics
total_regulation = {}
discrepancies = {}
direction_discrepancies = {}
no_match = {}
print("\n*** {} ***".format(desc))
# Print regulation that is different in the datasets
if verbose:
print("TF -> regulated gene: data1 vs data2")
for tf, regulation in data1.items():
total_regulation[tf] = len(regulation)
discrepancies[tf] = 0
direction_discrepancies[tf] = 0
no_match[tf] = 0
for gene, data in regulation.items():
mean1 = data["mean"]
mean2 = data2.get(tf, {}).get(gene, {}).get("mean", 0)
if np.abs(mean1 - mean2) > 0.01 and mean2 != 0:
if np.sign(mean1) != np.sign(mean2):
direction_discrepancies[tf] += 1
discrepancies[tf] += 1
if verbose:
uncertain = (
"\tuncertain"
if data2.get(tf, {}).get(gene, {}).get("uncertain", False)
else ""
)
print(
"{} -> {}: {:.2f} vs {:.2f}{}".format(
tf, gene, mean1, mean2, uncertain
)
)
elif mean2 == 0:
no_match[tf] += 1
if verbose:
print()
# Print summary statistics for each TF
total_direction_discrepancies = 0
total_discrepancies = 0
total_no_match = 0
total_interactions = 0
print("TF: opposite direction, different mean, not in second dataset, total")
for tf, total in total_regulation.items():
tf_dir_disc = direction_discrepancies[tf]
tf_disc = discrepancies[tf]
tf_no_match = no_match[tf]
total_direction_discrepancies += tf_dir_disc
total_discrepancies += tf_disc
total_no_match += tf_no_match
total_interactions += total
if verbose:
print(
"{:5s}: {:3} {:3} {:3} {:3}".format(
tf, tf_dir_disc, tf_disc, tf_no_match, total
)
)
print(
"Total: {:3} {:3} {:3} {:3}".format(
total_direction_discrepancies,
total_discrepancies,
total_no_match,
total_interactions,
)
)
[docs]
def replace_uncertain_entries(data: dict[str, dict[str, dict[str, float]]]):
"""
Replaces entries that were marked as uncertain in fold changes
(Regulation_direct 3 or 4) if the mean fold change is the opposite sign
when correcting for the regulation direction.
Args:
data: mean and standard deviation for each regulatory pair
"""
raw_data = load_file(WCM_FILE)
headers = raw_data[0]
with tsv_writer(OUTPUT_FILE, headers) as writer:
# Check each fold change and update if needed
for line in raw_data[1:]:
tf = line[0].strip('#" ')
gene = line[1].strip()
direction = int(line[5])
# Update value if uncertain fold change and opposite direction as mean
if np.abs(direction) > 2: # uncertain if category 3 or 4
entry = data.get(tf, {}).get(gene, {})
mean = entry.get("mean", 0)
std = entry.get("std", 0)
new_direction = int(np.sign(mean))
if new_direction * direction < 0: # both have opposite signs
line[2] = "{:.2f}".format(np.abs(mean))
line[3] = "{:.2f}".format(std)
line[5] = "{:.0f}".format(new_direction)
# Write updated lines to file
row = [
tf,
gene,
float(line[2]),
float(line[3]),
float(line[4]),
int(line[5]),
int(line[6]),
int(line[7]),
float(line[8]),
]
d = {header: value for header, value in zip(headers, row)}
writer.writerow(d)
[docs]
def parse_args() -> argparse.Namespace:
"""
Parses arguments from the command line.
Returns:
values of variables parsed from the command line
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"-m",
"--match",
action="store_true",
help="If set, processes source data to best match wcm.",
)
parser.add_argument(
"-v", "--verbose", action="store_true", help="If set, prints more information."
)
parser.add_argument(
"--replace",
action="store_true",
help="Replace uncertain fold changes if new direction is different.",
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
src_data = load_src(args.match)
wcm_data = load_wcm(args.match)
compare_data(src_data, wcm_data, "Javi repo vs wcm", args.verbose)
if args.replace:
replace_uncertain_entries(src_data)