"""
KnowledgeBase for Ecoli
Whole-cell knowledge base for Ecoli. Contains all raw, un-fit data processed
directly from CSV flat files.
"""
import io
import os
import json
from typing import List, Dict
import warnings
from reconstruction.spreadsheets import read_tsv
from wholecell.io import tsv
from wholecell.utils import units # used by eval()
# Directory that holds every raw TSV/FASTA flat file for the knowledge base.
FLAT_DIR = os.path.join(os.path.dirname(__file__), "flat")

# Flat files loaded as lists of dicts (one dict per TSV row). Files inside a
# subdirectory become attributes of a nested DataStore namespace named after
# that subdirectory (see KnowledgeBaseEcoli._load_tsv).
LIST_OF_DICT_FILENAMES = [
    "amino_acid_export_kms.tsv",
    "amino_acid_export_kms_removed.tsv",
    "amino_acid_pathways.tsv",
    "amino_acid_uptake_rates.tsv",
    "amino_acid_uptake_rates_removed.tsv",
    "biomass.tsv",
    "compartments.tsv",
    "complexation_reactions.tsv",
    "complexation_reactions_added.tsv",
    "complexation_reactions_modified.tsv",
    "complexation_reactions_removed.tsv",
    "disabled_kinetic_reactions.tsv",
    "dna_sites.tsv",
    "dry_mass_composition.tsv",
    "endoRNases.tsv",
    "equilibrium_reaction_rates.tsv",
    "equilibrium_reactions.tsv",
    "equilibrium_reactions_added.tsv",
    "equilibrium_reactions_removed.tsv",
    "fold_changes.tsv",
    "fold_changes_nca.tsv",
    "fold_changes_removed.tsv",
    "footprint_sizes.tsv",
    "genes.tsv",
    "growth_rate_dependent_parameters.tsv",
    "linked_metabolites.tsv",
    "metabolic_reactions.tsv",
    "metabolic_reactions_added.tsv",
    "metabolic_reactions_modified.tsv",
    "metabolic_reactions_removed.tsv",
    "metabolism_kinetics.tsv",
    "metabolite_concentrations.tsv",
    "metabolite_concentrations_removed.tsv",
    "metabolites.tsv",
    "metabolites_added.tsv",
    "modified_proteins.tsv",
    "molecular_weight_keys.tsv",
    "ppgpp_fc.tsv",
    "ppgpp_regulation.tsv",
    "ppgpp_regulation_added.tsv",
    "ppgpp_regulation_removed.tsv",
    "protein_half_lives_measured.tsv",
    "protein_half_lives_n_end_rule.tsv",
    "protein_half_lives_pulsed_silac.tsv",
    "proteins.tsv",
    "relative_metabolite_concentrations.tsv",
    "rna_half_lives.tsv",
    "rna_maturation_enzymes.tsv",
    "rnas.tsv",
    "secretions.tsv",
    "sequence_motifs.tsv",
    "transcription_factors.tsv",
    # "transcription_units.tsv", # special cased in the constructor
    "transcription_units_added.tsv",
    "transcription_units_removed.tsv",
    "transcriptional_attenuation.tsv",
    "transcriptional_attenuation_removed.tsv",
    "tf_one_component_bound.tsv",
    "translation_efficiency.tsv",
    "trna_charging_reactions.tsv",
    "trna_charging_reactions_added.tsv",
    "trna_charging_reactions_removed.tsv",
    "two_component_systems.tsv",
    "two_component_system_templates.tsv",
    os.path.join("mass_fractions", "glycogen_fractions.tsv"),
    os.path.join("mass_fractions", "ion_fractions.tsv"),
    os.path.join("mass_fractions", "LPS_fractions.tsv"),
    os.path.join("mass_fractions", "lipid_fractions.tsv"),
    os.path.join("mass_fractions", "murein_fractions.tsv"),
    os.path.join("mass_fractions", "soluble_fractions.tsv"),
    os.path.join("trna_data", "trna_ratio_to_16SrRNA_0p4.tsv"),
    os.path.join("trna_data", "trna_ratio_to_16SrRNA_0p7.tsv"),
    os.path.join("trna_data", "trna_ratio_to_16SrRNA_1p6.tsv"),
    os.path.join("trna_data", "trna_ratio_to_16SrRNA_1p07.tsv"),
    os.path.join("trna_data", "trna_ratio_to_16SrRNA_2p5.tsv"),
    os.path.join("trna_data", "trna_growth_rates.tsv"),
    os.path.join("rna_seq_data", "rnaseq_rsem_tpm_mean.tsv"),
    os.path.join("rna_seq_data", "rnaseq_rsem_tpm_std.tsv"),
    os.path.join("rna_seq_data", "rnaseq_seal_rpkm_mean.tsv"),
    os.path.join("rna_seq_data", "rnaseq_seal_rpkm_std.tsv"),
    os.path.join("rrna_options", "remove_rrff", "genes_removed.tsv"),
    os.path.join("rrna_options", "remove_rrff", "rnas_removed.tsv"),
    os.path.join("rrna_options", "remove_rrff", "transcription_units_modified.tsv"),
    os.path.join(
        "rrna_options", "remove_rrna_operons", "transcription_units_added.tsv"
    ),
    os.path.join(
        "rrna_options", "remove_rrna_operons", "transcription_units_removed.tsv"
    ),
    os.path.join("condition", "tf_condition.tsv"),
    os.path.join("condition", "condition_defs.tsv"),
    os.path.join("condition", "environment_molecules.tsv"),
    os.path.join("condition", "timelines_def.tsv"),
    os.path.join("condition", "media_recipes.tsv"),
    os.path.join("condition", "media", "5X_supplement_EZ.tsv"),
    os.path.join("condition", "media", "MIX0-55.tsv"),
    os.path.join("condition", "media", "MIX0-57.tsv"),
    os.path.join("condition", "media", "MIX0-58.tsv"),
    os.path.join("condition", "media", "MIX0-844.tsv"),
    os.path.join("base_codes", "amino_acids.tsv"),
    os.path.join("base_codes", "dntp.tsv"),
    os.path.join("base_codes", "nmp.tsv"),
    os.path.join("base_codes", "ntp.tsv"),
    os.path.join("adjustments", "amino_acid_pathways.tsv"),
    os.path.join("adjustments", "balanced_translation_efficiencies.tsv"),
    os.path.join("adjustments", "translation_efficiencies_adjustments.tsv"),
    os.path.join("adjustments", "rna_expression_adjustments.tsv"),
    os.path.join("adjustments", "rna_deg_rates_adjustments.tsv"),
    os.path.join("adjustments", "protein_deg_rates_adjustments.tsv"),
    os.path.join("adjustments", "relative_metabolite_concentrations_changes.tsv"),
]

# FASTA file holding the reference genome sequence.
SEQUENCE_FILE = "sequence.fasta"

# Flat files loaded as {name: value} dicts, where values may carry units
# (see KnowledgeBaseEcoli._load_parameters).
LIST_OF_PARAMETER_FILENAMES = [
    "dna_supercoiling.tsv",
    "parameters.tsv",
    "mass_parameters.tsv",
    os.path.join("new_gene_data", "new_gene_baseline_expression_parameters.tsv"),
]

# Maps a data attribute name to the attribute listing rows to delete from it
# (consumed by KnowledgeBaseEcoli._prune_data).
REMOVED_DATA = {
    "amino_acid_export_kms": "amino_acid_export_kms_removed",
    "amino_acid_uptake_rates": "amino_acid_uptake_rates_removed",
    "complexation_reactions": "complexation_reactions_removed",
    "equilibrium_reactions": "equilibrium_reactions_removed",
    "fold_changes": "fold_changes_removed",
    "fold_changes_nca": "fold_changes_removed",
    "metabolic_reactions": "metabolic_reactions_removed",
    "metabolite_concentrations": "metabolite_concentrations_removed",
    "ppgpp_regulation": "ppgpp_regulation_removed",
    "transcriptional_attenuation": "transcriptional_attenuation_removed",
    "trna_charging_reactions": "trna_charging_reactions_removed",
}

# Maps a data attribute name to the attribute holding row modifications keyed
# on the first (ID) column (consumed by KnowledgeBaseEcoli._modify_data).
MODIFIED_DATA = {
    "complexation_reactions": "complexation_reactions_modified",
    "metabolic_reactions": "metabolic_reactions_modified",
}

# Maps a data attribute name to the attribute holding extra rows to append
# (consumed by KnowledgeBaseEcoli._join_data).
ADDED_DATA = {
    "complexation_reactions": "complexation_reactions_added",
    "equilibrium_reactions": "equilibrium_reactions_added",
    "metabolic_reactions": "metabolic_reactions_added",
    "metabolites": "metabolites_added",
    "ppgpp_regulation": "ppgpp_regulation_added",
    "trna_charging_reactions": "trna_charging_reactions_added",
}
class DataStore(object):
    """Bare attribute container.

    ``_load_tsv`` and ``_load_parameters`` hang one of these off the
    knowledge base for each flat-file subdirectory and attach the loaded
    tables to it via ``setattr``.
    """
class KnowledgeBaseEcoli(object):
    """Whole-cell knowledge base for E. coli.

    Loads all raw, un-fit data from TSV flat files into attributes of this
    object (nested under DataStore namespaces for subdirectories), then
    prunes, joins, and modifies the loaded tables according to the
    configured options.
    """

    def __init__(
        self,
        operons_on: bool,
        remove_rrna_operons: bool,
        remove_rrff: bool,
        stable_rrna: bool,
        new_genes_option: str = "off",
    ):
        """
        Args:
            operons_on: if True, also load transcription_units.tsv and the
                associated added/removed transcription unit files.
            remove_rrna_operons: if True, use the alternative flat files
                that remove rRNA operon transcription units (only effective
                when operons_on is True).
            remove_rrff: if True, remove the rrfF gene/RNA and apply the
                associated mass-parameter modifications.
            stable_rrna: stored on the instance; not consumed during loading
                (presumably read downstream — confirm against callers).
            new_genes_option: name of a subdirectory of flat/new_gene_data
                holding new-gene flat files, or "off" for no gene insertion.
        """
        self.operons_on = operons_on
        self.stable_rrna = stable_rrna
        self.new_genes_option = new_genes_option

        if not operons_on and remove_rrna_operons:
            warnings.warn(
                "Setting the 'remove_rrna_operons' option to 'True'"
                " has no effect on the simulations when the 'operon'"
                " option is set to 'off'."
            )

        self.compartments: List[
            dict
        ] = []  # mypy can't track setattr(self, attr_name, rows)
        self.transcription_units: List[dict] = []

        # Make copies to prevent issues with sticky global variables when
        # running multiple operon workflows through Fireworks
        self.list_of_dict_filenames: List[str] = LIST_OF_DICT_FILENAMES.copy()
        self.list_of_parameter_filenames: List[str] = LIST_OF_PARAMETER_FILENAMES.copy()
        self.removed_data: Dict[str, str] = REMOVED_DATA.copy()
        self.modified_data: Dict[str, str] = MODIFIED_DATA.copy()
        self.added_data: Dict[str, str] = ADDED_DATA.copy()
        self.new_gene_added_data: Dict[str, str] = {}

        # Attribute names of parameter files; _modify_data uses this to tell
        # dict-style parameter data apart from list-style table data.
        self.parameter_file_attribute_names: List[str] = [
            os.path.splitext(os.path.basename(filename))[0]
            for filename in self.list_of_parameter_filenames
        ]

        if self.operons_on:
            self.list_of_dict_filenames.append("transcription_units.tsv")
            if remove_rrna_operons:
                # Use alternative file with all rRNA transcription units if
                # remove_rrna_operons option was used
                self.removed_data.update(
                    {
                        "transcription_units": "rrna_options.remove_rrna_operons.transcription_units_removed",
                    }
                )
                self.added_data.update(
                    {
                        "transcription_units": "rrna_options.remove_rrna_operons.transcription_units_added",
                    }
                )
            else:
                self.removed_data.update(
                    {
                        "transcription_units": "transcription_units_removed",
                    }
                )
                self.added_data.update(
                    {
                        "transcription_units": "transcription_units_added",
                    }
                )

        if remove_rrff:
            self.list_of_parameter_filenames.append(
                os.path.join(
                    "rrna_options", "remove_rrff", "mass_parameters_modified.tsv"
                )
            )
            self.removed_data.update(
                {
                    "genes": "rrna_options.remove_rrff.genes_removed",
                    "rnas": "rrna_options.remove_rrff.rnas_removed",
                }
            )
            self.modified_data.update(
                {
                    "mass_parameters": "rrna_options.remove_rrff.mass_parameters_modified",
                }
            )
            if self.operons_on:
                self.modified_data.update(
                    {
                        "transcription_units": "rrna_options.remove_rrff.transcription_units_modified",
                    }
                )

        if self.new_genes_option != "off":
            new_gene_subdir = new_genes_option
            new_gene_path = os.path.join("new_gene_data", new_gene_subdir)
            assert os.path.isdir(
                os.path.join(FLAT_DIR, new_gene_path)
            ), "This new_genes_data subdirectory is invalid."
            # Dotted attribute path prefix for the new-gene DataStore nodes.
            nested_attr = "new_gene_data." + new_gene_subdir + "."
            # These files do not need to be joined to existing files
            self.list_of_dict_filenames.append(
                os.path.join(new_gene_path, "insertion_location.tsv")
            )
            self.list_of_dict_filenames.append(
                os.path.join(new_gene_path, "gene_sequences.tsv")
            )
            # These files need to be joined to existing files
            new_gene_shared_files = [
                "genes",
                "rnas",
                "proteins",
                "rna_half_lives",
                "protein_half_lives_measured",
            ]
            for f in new_gene_shared_files:
                file_path = os.path.join(new_gene_path, f + ".tsv")
                # If these files are empty, fill in with default values at a
                # later point
                assert os.path.isfile(os.path.join(FLAT_DIR, file_path)), (
                    f"File {f}.tsv must be present in the new_genes_data"
                    f" subdirectory {new_gene_subdir}."
                )
                self.list_of_dict_filenames.append(file_path)
                self.new_gene_added_data.update({f: nested_attr + f})
            # Optional per-variant RNA-seq file; joined only when present.
            rnaseq_path = os.path.join(new_gene_path, "rnaseq_rsem_tpm_mean.tsv")
            if os.path.isfile(os.path.join(FLAT_DIR, rnaseq_path)):
                self.list_of_dict_filenames.append(rnaseq_path)
                self.new_gene_added_data.update(
                    {
                        "rna_seq_data.rnaseq_rsem_tpm_mean": nested_attr
                        + "rnaseq_rsem_tpm_mean"
                    }
                )

        # Load raw data from TSV files
        for filename in self.list_of_dict_filenames:
            self._load_tsv(FLAT_DIR, os.path.join(FLAT_DIR, filename))
        for filename in self.list_of_parameter_filenames:
            self._load_parameters(FLAT_DIR, os.path.join(FLAT_DIR, filename))
        self.genome_sequence = self._load_sequence(
            os.path.join(FLAT_DIR, SEQUENCE_FILE)
        )

        # Apply removals, additions, and modifications (in that order).
        self._prune_data()
        self._join_data()
        self._modify_data()

        if self.new_genes_option != "off":
            # Validate new-gene IDs, resolve a conflict-free insertion
            # position, splice the new sequence into the genome, and join
            # the new-gene tables into the main tables.
            self._check_new_gene_ids(nested_attr)
            insert_pos = self._update_gene_insertion_location(nested_attr)
            insertion_sequence = self._get_new_gene_sequence(nested_attr)
            insert_end = self._update_gene_locations(nested_attr, insert_pos)
            self.new_gene_added_data.update({"genes": nested_attr + "genes"})
            self.genome_sequence = (
                self.genome_sequence[:insert_pos]
                + insertion_sequence
                + self.genome_sequence[insert_pos:]
            )
            # Sanity check: the spliced-in region must equal the insertion.
            assert (
                self.genome_sequence[insert_pos : (insert_end + 1)]
                == insertion_sequence
            )
            self.added_data = self.new_gene_added_data
            self._join_data()
[docs]
def _load_tsv(self, dir_name, file_name):
path = self
for sub_path in file_name[len(dir_name) + 1 :].split(os.path.sep)[:-1]:
if not hasattr(path, sub_path):
setattr(path, sub_path, DataStore())
path = getattr(path, sub_path)
attr_name = file_name.split(os.path.sep)[-1].split(".")[0]
setattr(path, attr_name, [])
rows = read_tsv(file_name)
setattr(path, attr_name, rows)
[docs]
def _load_sequence(self, file_path):
from Bio import SeqIO
with open(file_path, "r") as handle:
for record in SeqIO.parse(handle, "fasta"):
return record.seq
    def _load_parameters(self, dir_name, file_name):
        """Load one parameters TSV file as a ``{name: value}`` dict attribute.

        Each row has ``name``, ``value`` (JSON-encoded), and ``units``
        columns. Subdirectory components become nested ``DataStore``
        namespaces, as in ``_load_tsv``.

        Args:
            dir_name: root flat-file directory (prefix to strip).
            file_name: absolute path of the TSV file to read.
        """
        path = self
        for sub_path in file_name[len(dir_name) + 1 :].split(os.path.sep)[:-1]:
            if not hasattr(path, sub_path):
                setattr(path, sub_path, DataStore())
            path = getattr(path, sub_path)
        attr_name = file_name.split(os.path.sep)[-1].split(".")[0]
        param_dict = {}
        with io.open(file_name, "rb") as csvfile:
            reader = tsv.dict_reader(csvfile)
            for row in reader:
                value = json.loads(row["value"])
                if row["units"] != "":
                    # `eval()` the units [risky!] then strip it to just a unit
                    # since `a_list * a_float` (like `1.0 [1/s]`) fails, and
                    # `a_list * an_int` repeats the list, which is also broken.
                    # SECURITY: eval of flat-file contents — only safe because
                    # these TSVs ship with the repository; never point this at
                    # untrusted input.
                    unit = eval(row["units"])  # risky!
                    unit = units.getUnit(unit)  # strip
                    value = value * unit
                param_dict[row["name"]] = value
        setattr(path, attr_name, param_dict)
[docs]
def _prune_data(self):
"""
Remove rows that are specified to be removed. Data will only be removed
if all data in a row in the file specifying rows to be removed matches
the same columns in the raw data file.
"""
# Check each pair of files to be removed
for data_attr, attr_to_remove in self.removed_data.items():
# Build the set of data to identify rows to be removed
data_to_remove = getattr(self, attr_to_remove.split(".")[0])
for attr in attr_to_remove.split(".")[1:]:
data_to_remove = getattr(data_to_remove, attr)
removed_cols = list(data_to_remove[0].keys())
ids_to_remove = set()
for row in data_to_remove:
ids_to_remove.add(tuple([row[col] for col in removed_cols]))
# Remove any matching rows
data = getattr(self, data_attr)
n_entries = len(data)
removed_ids = set()
for i, row in enumerate(data[::-1]):
checked_id = tuple([row[col] for col in removed_cols])
if checked_id in ids_to_remove:
data.pop(n_entries - i - 1)
removed_ids.add(checked_id)
# Print warnings for entries that were marked to be removed that
# does not exist in the original data file. Fold changes are
# excluded since the original entries are split between two files.
if not data_attr.startswith("fold_changes"):
for unremoved_id in ids_to_remove - removed_ids:
print(
f"Warning: Could not remove row {unremoved_id} "
f"in flat file {data_attr} because the row does not "
f"exist."
)
[docs]
def _join_data(self):
"""
Add rows that are specified in additional files. Data will only be added
if all the loaded columns from both datasets match.
"""
# Join data for each file with data to be added
for data_attr, attr_to_add in self.added_data.items():
# Get datasets to join
data = getattr(self, data_attr.split(".")[0])
for attr in data_attr.split(".")[1:]:
data = getattr(data, attr)
added_data = getattr(self, attr_to_add.split(".")[0])
for attr in attr_to_add.split(".")[1:]:
added_data = getattr(added_data, attr)
if added_data: # Some new gene additional files may be empty
# Check columns are the same for each dataset
col_diff = set(data[0].keys()).symmetric_difference(
added_data[0].keys()
)
if col_diff:
raise ValueError(
f"Could not join datasets {data_attr} and {attr_to_add} "
f"because columns do not match (different columns: {col_diff})."
)
# Join datasets
for row in added_data:
data.append(row)
[docs]
def _modify_data(self):
"""
Modify entires in rows that are specified to be modified. Rows must be
identified by their entries in the first column (usually the ID column).
"""
# Check each pair of files to be modified
for data_attr, modify_attr in self.modified_data.items():
# Build the set of data to identify rows to be modified
data_to_modify = getattr(self, modify_attr.split(".")[0])
for attr in modify_attr.split(".")[1:]:
data_to_modify = getattr(data_to_modify, attr)
data = getattr(self, data_attr)
# If modifying a parameter file, replace values in dictionary
if data_attr in self.parameter_file_attribute_names:
for key, value in data_to_modify.items():
if key not in data:
raise ValueError(
f"Could not modify data {data_attr}"
f"with {modify_attr} because the name {key} does "
f"not exist in {data_attr}."
)
data[key] = value
# If modifying a table file, replace rows
else:
id_col_name = list(data_to_modify[0].keys())[0]
id_to_modified_cols = {}
for row in data_to_modify:
id_to_modified_cols[row[id_col_name]] = row
# Modify any matching rows with identical IDs
if list(data[0].keys())[0] != id_col_name:
raise ValueError(
f"Could not modify data {data_attr} with "
f"{modify_attr} because the names of the first columns "
f"do not match."
)
modified_entry_ids = set()
for i, row in enumerate(data):
if row[id_col_name] in id_to_modified_cols:
modified_cols = id_to_modified_cols[row[id_col_name]]
for col_name in data[i]:
if col_name in modified_cols:
data[i][col_name] = modified_cols[col_name]
modified_entry_ids.add(row[id_col_name])
# Check for entries in modification data that do not exist in
# original data
id_diff = set(id_to_modified_cols.keys()).symmetric_difference(
modified_entry_ids
)
if id_diff:
raise ValueError(
f"Could not modify data {data_attr} with "
f"{modify_attr} because of one or more entries in "
f"{modify_attr} that do not exist in {data_attr} "
f"(nonexistent entries: {id_diff})."
)
[docs]
def _check_new_gene_ids(self, nested_attr):
"""
Check to ensure each new gene, RNA, and protein id starts with NG.
"""
nested_data = getattr(self, nested_attr[:-1].split(".")[0])
for attr in nested_attr[:-1].split(".")[1:]:
nested_data = getattr(nested_data, attr)
new_genes_data = getattr(nested_data, "genes")
new_RNA_data = getattr(nested_data, "rnas")
new_protein_data = getattr(nested_data, "proteins")
for row in new_genes_data:
assert row["id"].startswith("NG"), "ids of new genes must start with NG"
for row in new_RNA_data:
assert row["id"].startswith("NG"), "ids of new gene rnas must start with NG"
for row in new_protein_data:
assert row["id"].startswith(
"NG"
), "ids of new gene proteins must start with NG"
return
[docs]
def _update_gene_insertion_location(self, nested_attr):
"""
Update insertion location of new genes to prevent conflicts.
"""
genes_data = getattr(self, "genes")
tu_data = getattr(self, "transcription_units")
dna_sites_data = getattr(self, "dna_sites")
nested_data = getattr(self, nested_attr[:-1].split(".")[0])
for attr in nested_attr[:-1].split(".")[1:]:
nested_data = getattr(nested_data, attr)
insert_loc_data = getattr(nested_data, "insertion_location")
assert (
len(insert_loc_data) == 1
), "each noncontiguous insertion should be in its own directory"
insert_pos = insert_loc_data[0]["insertion_pos"]
if not tu_data:
# Check if specified insertion location is in another gene
data_to_check = genes_data.copy()
else:
# Check if specified insertion location is in a transcription unit
data_to_check = tu_data.copy()
# Add important DNA sites to the list of locations to check
# TODO: Check for other DNA sites if we include any in the future
sites_data_to_check = [
site
for site in dna_sites_data
if site["common_name"] == "oriC" or site["common_name"] == "TerC"
]
data_to_check += sites_data_to_check
conflicts = [
row
for row in data_to_check
if ((row["left_end_pos"] is not None) and (row["left_end_pos"] != ""))
and ((row["right_end_pos"] is not None) and (row["left_end_pos"] != ""))
and (row["left_end_pos"] < insert_pos)
and (row["right_end_pos"] >= insert_pos)
]
# Change insertion location to after conflicts
if conflicts:
shift = max([sub["right_end_pos"] for sub in conflicts]) - insert_pos + 1
insert_pos = insert_pos + shift
return insert_pos
[docs]
def _update_global_coordinates(self, data, insert_pos, insert_len):
"""
Updates the left and right end positions for all elements in data if
their positions will be impacted by the new gene insertion.
Args:
data: Data attribute to update
insert_pos: Location of new gene insertion
insert_len: Length of new gene insertion
"""
for row in data:
left = row["left_end_pos"]
right = row["right_end_pos"]
if (
(left is not None and left != "")
and (right is not None and right != "")
and left >= insert_pos
):
row.update({"left_end_pos": left + insert_len})
row.update({"right_end_pos": right + insert_len})
[docs]
def _update_gene_locations(self, nested_attr, insert_pos):
"""
Modify positions of original genes based upon the insertion location
of new genes. Returns end position of the gene insertion.
"""
genes_data = getattr(self, "genes")
tu_data = getattr(self, "transcription_units")
dna_sites_data = getattr(self, "dna_sites")
nested_data = getattr(self, nested_attr[:-1].split(".")[0])
for attr in nested_attr[:-1].split(".")[1:]:
nested_data = getattr(nested_data, attr)
new_genes_data = getattr(nested_data, "genes")
new_genes_data = sorted(new_genes_data, key=lambda d: d["left_end_pos"])
for i in range(len(new_genes_data) - 1):
assert (
new_genes_data[i + 1]["left_end_pos"]
== new_genes_data[i]["right_end_pos"] + 1
), "gaps in new gene insertions are not supported at this time"
insert_end = new_genes_data[-1]["right_end_pos"] + insert_pos
insert_len = insert_end - insert_pos + 1
# Update global positions of original genes
self._update_global_coordinates(genes_data, insert_pos, insert_len)
# Update global positions of transcription units
if tu_data:
self._update_global_coordinates(tu_data, insert_pos, insert_len)
# Update DNA site positions
# (including the origin and terminus of replication)
self._update_global_coordinates(dna_sites_data, insert_pos, insert_len)
# Change relative insertion positions to global in reference genome
for row in new_genes_data:
left = row["left_end_pos"]
right = row["right_end_pos"]
row.update({"left_end_pos": left + insert_pos})
row.update({"right_end_pos": right + insert_pos})
return insert_end
[docs]
def _get_new_gene_sequence(self, nested_attr):
"""
Determine genome sequnce for insertion using the sequences and
relative locations of the new genes.
"""
from Bio import Seq
nested_data = getattr(self, nested_attr[:-1].split(".")[0])
for attr in nested_attr[:-1].split(".")[1:]:
nested_data = getattr(nested_data, attr)
new_genes_data = getattr(nested_data, "genes")
seq_data = getattr(nested_data, "gene_sequences")
insertion_seq = Seq.Seq("")
new_genes_data = sorted(new_genes_data, key=lambda d: d["left_end_pos"])
assert (
new_genes_data[0]["left_end_pos"] == 0
), "first gene in new sequence must start at relative coordinate 0"
for gene in new_genes_data:
if gene["direction"] == "+":
seq_row = next(
(row for row in seq_data if row["id"] == gene["id"]), None
)
seq_string = seq_row["gene_seq"]
seq_addition = Seq.Seq(seq_string)
insertion_seq += seq_addition
else:
seq_row = next(
(row for row in seq_data if row["id"] == gene["id"]), None
)
seq_string = seq_row["gene_seq"]
seq_addition = Seq.Seq(seq_string)
insertion_seq += seq_addition.reverse_complement()
assert len(seq_addition) == (
gene["right_end_pos"] - gene["left_end_pos"] + 1
), (
"left and right end positions must agree with actual "
"sequence length for " + gene["id"]
)
return insertion_seq