Source code for wholecell.tests.io.test_ingestion

"""Tests for wholecell.io.ingestion."""

from __future__ import annotations

import os
import tempfile
import unittest
from pathlib import Path

import pandas as pd
import pandera.errors as pa_errors

from wholecell.io import ingestion

# Silence Sphinx autodoc warning
unittest.TestCase.__module__ = "unittest"

# Fixtures: real-shaped TPM + manifest (same layout as experimental_data)
FIXTURES_DIR = Path(__file__).resolve().parent / "fixtures"


[docs] class TestIngestion(unittest.TestCase):
[docs] def _write_tsv(self, path: Path, df: pd.DataFrame) -> None: df.to_csv(path, sep="\t", index=False)
def test_ingest_rnaseq_tpm_table_valid(self): with tempfile.TemporaryDirectory() as tmpdir: tpm_path = Path(tmpdir) / "rnaseq_test_tpm.tsv" df = pd.DataFrame( { "gene_id": ["EG10001", "EG10002"], "tpm_mean": [10.0, 20.0], } ) self._write_tsv(tpm_path, df) validated = ingestion.ingest_rnaseq_tpm_table(tpm_path) self.assertEqual(list(validated["gene_id"]), ["EG10001", "EG10002"]) self.assertTrue((validated["tpm_mean"] >= 0).all()) def test_ingest_rnaseq_manifest_normalizes_paths(self): with tempfile.TemporaryDirectory() as tmpdir: tmpdir_path = Path(tmpdir) tpm_path = tmpdir_path / "rnaseq_test_tpm.tsv" df_tpm = pd.DataFrame( { "gene_id": ["EG10001"], "tpm_mean": [1.0], } ) self._write_tsv(tpm_path, df_tpm) manifest_path = tmpdir_path / "manifest.tsv" df_manifest = pd.DataFrame( { "dataset_id": ["ds1"], "dataset_description": ["test dataset"], "file_path": [tpm_path.name], # relative path "data_source": ["test_source"], } ) self._write_tsv(manifest_path, df_manifest) manifest = ingestion.ingest_rnaseq_manifest(manifest_path) self.assertEqual(len(manifest), 1) file_path = manifest.loc[0, "file_path"] self.assertTrue(os.path.isabs(file_path)) self.assertTrue(file_path.endswith(str(tpm_path.name))) def test_ingest_transcriptome_round_trip(self): with tempfile.TemporaryDirectory() as tmpdir: tmpdir_path = Path(tmpdir) tpm_path = tmpdir_path / "rnaseq_test_tpm.tsv" df_tpm = pd.DataFrame( { "gene_id": ["EG10001", "EG10002"], "tpm_mean": [5.0, 15.0], } ) self._write_tsv(tpm_path, df_tpm) manifest_path = tmpdir_path / "manifest.tsv" df_manifest = pd.DataFrame( { "dataset_id": ["ds1"], "dataset_description": ["test dataset"], "file_path": [tpm_path.name], "data_source": ["test_source"], } ) self._write_tsv(manifest_path, df_manifest) tpm_table, metadata = ingestion.ingest_transcriptome(manifest_path, "ds1") self.assertEqual(list(tpm_table["gene_id"]), ["EG10001", "EG10002"]) self.assertEqual(metadata["dataset_id"], "ds1") self.assertTrue(os.path.isabs(metadata["file_path"])) def test_ingest_transcriptome_from_fixture(self): """Load real-shaped fixture TPM via manifest (validates fixture format).""" manifest_path = FIXTURES_DIR / "rnaseq_manifest_small.tsv" if not manifest_path.exists(): self.skipTest("fixtures not found") tpm_table, metadata = ingestion.ingest_transcriptome(manifest_path, "ref_small") self.assertGreater(len(tpm_table), 0) self.assertIn("gene_id", tpm_table.columns) self.assertIn("tpm_mean", tpm_table.columns) self.assertEqual(metadata["dataset_id"], "ref_small") # --- Validation failures (missing columns / wrong types) --- def test_ingest_rnaseq_tpm_table_fails_missing_column(self): with tempfile.TemporaryDirectory() as tmpdir: path = Path(tmpdir) / "bad.tsv" df = pd.DataFrame({"tpm_mean": [1.0, 2.0]}) # missing gene_id df.to_csv(path, sep="\t", index=False) with self.assertRaises(pa_errors.SchemaError): ingestion.ingest_rnaseq_tpm_table(path) def test_ingest_rnaseq_tpm_table_fails_wrong_type(self): with tempfile.TemporaryDirectory() as tmpdir: path = Path(tmpdir) / "bad.tsv" df = pd.DataFrame( { "gene_id": ["EG10001"], "tpm_mean": ["not_a_number"], } ) df.to_csv(path, sep="\t", index=False) with self.assertRaises((pa_errors.SchemaError, ValueError)): ingestion.ingest_rnaseq_tpm_table(path) def test_ingest_rnaseq_manifest_fails_missing_column(self): with tempfile.TemporaryDirectory() as tmpdir: path = Path(tmpdir) / "manifest.tsv" df = pd.DataFrame( { "dataset_id": ["ds1"], "dataset_description": ["desc"], # missing file_path, data_source } ) df.to_csv(path, sep="\t", index=False) with self.assertRaises(pa_errors.SchemaError): ingestion.ingest_rnaseq_manifest(path) def test_ingest_rnaseq_tpm_table_fails_duplicate_gene_id(self): with tempfile.TemporaryDirectory() as tmpdir: path = Path(tmpdir) / "bad.tsv" df = pd.DataFrame( { "gene_id": ["EG10001", "EG10001"], "tpm_mean": [10.0, 20.0], } ) df.to_csv(path, sep="\t", index=False) with self.assertRaises(pa_errors.SchemaError): ingestion.ingest_rnaseq_tpm_table(path) def test_ingest_transcriptome_fails_unknown_dataset_id(self): with tempfile.TemporaryDirectory() as tmpdir: tmpdir_path = Path(tmpdir) tpm_path = tmpdir_path / "tpm.tsv" pd.DataFrame({"gene_id": ["EG10001"], "tpm_mean": [1.0]}).to_csv( tpm_path, sep="\t", index=False ) manifest_path = tmpdir_path / "manifest.tsv" pd.DataFrame( { "dataset_id": ["ds1"], "dataset_description": ["d"], "file_path": [tpm_path.name], "data_source": ["s"], } ).to_csv(manifest_path, sep="\t", index=False) with self.assertRaises(KeyError): ingestion.ingest_transcriptome(manifest_path, "nonexistent_id")
if __name__ == "__main__": unittest.main()