Source code for wholecell.io.ingestion

"""
Utilities for ingesting experimental data (e.g. RNA-seq transcriptomes)
using the canonical Pandera schemas in ``wholecell.io.schemas``.

This module is intentionally narrow for now:
- Load TSVs into pandas DataFrames.
- Validate them against the RNA-seq schemas.
- Provide a small convenience wrapper to fetch a single transcriptome
  given a manifest and ``dataset_id``.
"""

from __future__ import annotations

import os
from pathlib import Path
from typing import Tuple, Union

import pandas as pd

from wholecell.io.schemas.rnaseq import (
    RnaseqSamplesManifestSchema,
    RnaseqTpmTableSchema,
)

PathLike = Union[str, Path]


def _read_tsv(path: PathLike) -> pd.DataFrame:
    """
    Parse a tab-separated file into a DataFrame.

    Parameters
    ----------
    path:
        Location of the TSV file, given as a string or ``Path``.

    Returns
    -------
    pandas.DataFrame
        The table as parsed by ``pandas.read_csv`` with a tab separator.
    """
    tsv_file = Path(path)
    table = pd.read_csv(tsv_file, sep="\t")
    return table
def ingest_rnaseq_tpm_table(path: PathLike) -> pd.DataFrame:
    """
    Read one RNA-seq TPM table from disk and check it against the schema.

    Parameters
    ----------
    path:
        Location of a TSV file whose columns conform to
        ``RnaseqTpmTableSchema``.

    Returns
    -------
    pandas.DataFrame
        The DataFrame returned by ``RnaseqTpmTableSchema.validate``.
        Columns outside the schema are expected to pass through
        unvalidated; this depends on the schema's strictness setting in
        ``wholecell.io.schemas.rnaseq`` (not visible here -- confirm).
    """
    raw_table = _read_tsv(path)
    validated = RnaseqTpmTableSchema.validate(raw_table)
    return validated
def ingest_rnaseq_manifest(path: PathLike) -> pd.DataFrame:
    """
    Read an RNA-seq samples manifest, validate it, and absolutize its paths.

    Each ``file_path`` entry that is not already absolute is interpreted
    relative to the directory containing the manifest file, so a manifest
    can be moved around together with its data files.

    Parameters
    ----------
    path:
        Location of the manifest TSV file.

    Returns
    -------
    pandas.DataFrame
        The manifest as returned by ``RnaseqSamplesManifestSchema.validate``,
        with every ``file_path`` value rewritten as an absolute path string.
    """
    manifest_file = Path(path)
    validated = RnaseqSamplesManifestSchema.validate(_read_tsv(manifest_file))
    manifest_dir = manifest_file.parent

    def _to_absolute(entry: str) -> str:
        # Entries that are already absolute are returned untouched (no
        # resolution or normalization); only relative entries are anchored
        # at the manifest directory and resolved.
        if not os.path.isabs(entry):
            return str((manifest_dir / entry).resolve())
        return entry

    # Cast to str first so the helper always receives plain strings.
    entries = validated["file_path"].astype(str)
    validated["file_path"] = entries.map(_to_absolute)
    return validated
def ingest_transcriptome(
    manifest_path: PathLike, dataset_id: str
) -> Tuple[pd.DataFrame, dict]:
    """
    Load the TPM table of one dataset listed in a samples manifest.

    The steps are:

    1) Load and validate the manifest via ``ingest_rnaseq_manifest``.
    2) Select the manifest row whose ``dataset_id`` equals the argument.
    3) Load and validate the TPM table at that row's ``file_path`` via
       ``ingest_rnaseq_tpm_table``.

    Parameters
    ----------
    manifest_path:
        Location of the RNA-seq samples manifest TSV.
    dataset_id:
        Identifier to look up in the manifest's ``dataset_id`` column.

    Returns
    -------
    (pandas.DataFrame, dict)
        - The validated TPM table of the requested dataset.
        - The selected manifest row converted to a dict.

    Raises
    ------
    KeyError
        If no manifest row has the given ``dataset_id``.
    ValueError
        If more than one manifest row has the given ``dataset_id``.
    """
    manifest = ingest_rnaseq_manifest(manifest_path)
    selected = manifest.loc[manifest["dataset_id"] == dataset_id]
    n_hits = len(selected)

    if n_hits == 0:
        raise KeyError(
            f"Dataset_id {dataset_id!r} not found in manifest {manifest_path!r}."
        )
    if n_hits > 1:
        raise ValueError(
            f"Dataset_id {dataset_id!r} appears more than once in manifest "
            f"{manifest_path!r}."
        )

    # Exactly one hit remains at this point.
    record = selected.iloc[0]
    tpm_table = ingest_rnaseq_tpm_table(record["file_path"])
    return tpm_table, record.to_dict()