diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bfa641a..755d4c5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -35,7 +35,7 @@ repos: # Ruff for linting and formatting Python files - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.15.4 + rev: v0.15.5 hooks: - id: ruff-check args: ["--fix"] diff --git a/notebooks/0.download-data/1.download-data.ipynb b/notebooks/0.download-data/1.download-data.ipynb index 6a6d223..e7ea279 100644 --- a/notebooks/0.download-data/1.download-data.ipynb +++ b/notebooks/0.download-data/1.download-data.ipynb @@ -7,7 +7,7 @@ "source": [ "# Downloading Single-Cell Profiles\n", "\n", - "This notebook focuses on downloading metadata and single-cell profiles from three key datasets:\n", + "This notebook downloads metadata and single-cell profiles from three key datasets:\n", "\n", "1. **CPJUMP1 Pilot Dataset** ([link](https://github.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1)): Metadata is downloaded and processed to identify and organize plates containing wells treated with compound perturbations for downstream analysis.\n", "2. 
**MitoCheck Dataset**: Normalized and feature-selected single-cell profiles are downloaded for further analysis.\n", @@ -74,7 +74,7 @@ "outputs": [], "source": [ "# setting config path\n", - "config_path = pathlib.Path(\"../nb-configs.yaml\").resolve(strict=True)\n", + "config_path = pathlib.Path(\"dl-configs.yaml\").resolve(strict=True)\n", "\n", "# setting results setting a data directory\n", "data_dir = pathlib.Path(\"./data\").resolve()\n", @@ -114,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "5b8bfe5f", "metadata": {}, "outputs": [ @@ -126,15 +126,15 @@ "Series: 'Assay_Plate_Barcode' [str]\n", "[\n", "\t\"BR00117054\"\n", - "\t\"BR00117012\"\n", - "\t\"BR00117008\"\n", - "\t\"BR00117016\"\n", "\t\"BR00117055\"\n", - "\t…\n", + "\t\"BR00117010\"\n", + "\t\"BR00117009\"\n", "\t\"BR00117011\"\n", + "\t…\n", "\t\"BR00117013\"\n", - "\t\"BR00117010\"\n", - "\t\"BR00117017\"\n", + "\t\"BR00117008\"\n", + "\t\"BR00117012\"\n", + "\t\"BR00117015\"\n", "\t\"BR00117019\"\n", "]\n", "shape: (12, 13)\n" @@ -241,6 +241,80 @@ "exp_metadata" ] }, + { + "cell_type": "markdown", + "id": "a4665c17", + "metadata": {}, + "source": [ + "\n", + "In this section, we download:\n", + "\n", + "1. **Compound metadata** from the CPJUMP1 repository \n", + "2. **Mechanism of action (MOA) metadata** from the Broad Repurposing Hub\n", + "\n", + "We then merge both datasets into a single compound metadata table.\n", + "\n", + "If a compound has missing MOA information, the value in `Metadata_moa` is replaced with `\"unknown\"`. This indicates that no MOA annotation is currently available for that compound." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "22e417e3", + "metadata": {}, + "outputs": [], + "source": [ + "# downloading compound metadata from cpjump1 repo\n", + "CPJUMP_compound_metadata = pl.read_csv(\n", + " nb_configs[\"links\"][\"CPJUMP1-compound-metadata-source\"],\n", + " separator=\"\\t\",\n", + " has_header=True,\n", + " encoding=\"utf-8\",\n", + ")\n", + "\n", + "# downloading compound moa metadata from broad institute drug repurposing hub\n", + "broad_compound_moa_metadata = pl.read_csv(\n", + " nb_configs[\"links\"][\"Broad-compounds-moa-source\"],\n", + " separator=\"\\t\",\n", + " skip_rows=9,\n", + " encoding=\"utf8-lossy\",\n", + ")\n", + "\n", + "# for both dataframes make sure that all columns have \"Metadata_\" in the column name\n", + "CPJUMP_compound_metadata = CPJUMP_compound_metadata.rename(\n", + " {col: f\"Metadata_{col}\" for col in CPJUMP_compound_metadata.columns}\n", + ")\n", + "broad_compound_moa_metadata = broad_compound_moa_metadata.rename(\n", + " {col: f\"Metadata_{col}\" for col in broad_compound_moa_metadata.columns}\n", + ")\n", + "\n", + "# replace null values in the broad compound moa with \"unknown\"\n", + "broad_compound_moa_metadata = broad_compound_moa_metadata.with_columns(\n", + " pl.col(\"Metadata_moa\").fill_null(\"unknown\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "01db7db8", + "metadata": {}, + "outputs": [], + "source": [ + "complete_compound_metadata = CPJUMP_compound_metadata.join(\n", + " broad_compound_moa_metadata,\n", + " left_on=\"Metadata_pert_iname\",\n", + " right_on=\"Metadata_pert_iname\",\n", + " how=\"left\",\n", + ")\n", + "\n", + "\n", + "# save the complete compound metadata as a tsv file\n", + "complete_compound_metadata.write_csv(\n", + " cpjump1_dir / f\"cpjump1_{pert_type}_compound-metadata.tsv\", separator=\"\\t\"\n", + ")" + ] + }, { "cell_type": "markdown", "id": "7021b414", @@ -255,7 +329,7 @@ }, { "cell_type": "code", - 
"execution_count": 5, + "execution_count": 7, "id": "06783224", "metadata": {}, "outputs": [ @@ -284,16 +358,16 @@ "source": [ "## Downloading CFReT Data\n", "\n", - "In this section, we download feature-selected single-cell profiles from the CFReT plate `localhost230405150001`. This plate contains three treatments: DMSO (control), drug_x, and TGFRi. The dataset consists of high-content imaging data that has already undergone feature selection, making it suitable for downstream analysis.\n", + "This section downloads and saves feature-selected single-cell profiles from the CFReT plate `localhost230405150001`.\n", "\n", - "**Key Points:**\n", - "- Only the processed single-cell profiles are downloaded [here](https://github.com/WayScience/cellpainting_predicts_cardiac_fibrosis/tree/main/3.process_cfret_features/data/single_cell_profiles)\n", - "- The CFReT dataset was used and published in [this study](https://doi.org/10.1161/CIRCULATIONAHA.124.071956)." + "- Only processed single-cell profiles are downloaded (no raw data).\n", + "- Data is saved as a Parquet file for fast access.\n", + "- Used in published cardiac fibrosis research ([study link](https://doi.org/10.1161/CIRCULATIONAHA.124.071956))." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "4d9fd47c", "metadata": {}, "outputs": [ @@ -344,7 +418,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.11" + "version": "3.12.9" } }, "nbformat": 4, diff --git a/notebooks/0.download-data/2.preprocessing.ipynb b/notebooks/0.download-data/2.preprocessing.ipynb index d1178ff..62405bc 100644 --- a/notebooks/0.download-data/2.preprocessing.ipynb +++ b/notebooks/0.download-data/2.preprocessing.ipynb @@ -31,13 +31,12 @@ "import sys\n", "import json\n", "import pathlib\n", - "from typing import Optional\n", "\n", "import polars as pl\n", "\n", "sys.path.append(\"../../\")\n", "from utils.data_utils import split_meta_and_features, add_cell_id_hash\n", - "from utils.io_utils import load_profiles" + "from utils.io_utils import load_and_concat_profiles" ] }, { @@ -57,64 +56,6 @@ "metadata": {}, "outputs": [], "source": [ - "def load_and_concat_profiles(\n", - " profile_dir: str | pathlib.Path,\n", - " shared_features: Optional[list[str]] = None,\n", - " specific_plates: Optional[list[pathlib.Path]] = None,\n", - ") -> pl.DataFrame:\n", - " \"\"\"\n", - " Load all profile files from a directory and concatenate them into a single Polars DataFrame.\n", - "\n", - " Parameters\n", - " ----------\n", - " profile_dir : str or pathlib.Path\n", - " Directory containing the profile files (.parquet).\n", - " shared_features : Optional[list[str]], optional\n", - " List of shared feature names to filter the profiles. If None, all features are loaded.\n", - " specific_plates : Optional[list[pathlib.Path]], optional\n", - " List of specific plate file paths to load. 
If None, all profiles in the directory are loaded.\n", - "\n", - " Returns\n", - " -------\n", - " pl.DataFrame\n", - " Concatenated Polars DataFrame containing all loaded profiles.\n", - " \"\"\"\n", - " # Ensure profile_dir is a pathlib.Path\n", - " if isinstance(profile_dir, str):\n", - " profile_dir = pathlib.Path(profile_dir)\n", - " elif not isinstance(profile_dir, pathlib.Path):\n", - " raise TypeError(\"profile_dir must be a string or a pathlib.Path object\")\n", - "\n", - " # Validate specific_plates\n", - " if specific_plates is not None:\n", - " if not isinstance(specific_plates, list):\n", - " raise TypeError(\"specific_plates must be a list of pathlib.Path objects\")\n", - " if not all(isinstance(path, pathlib.Path) for path in specific_plates):\n", - " raise TypeError(\n", - " \"All elements in specific_plates must be pathlib.Path objects\"\n", - " )\n", - "\n", - " # Use specific_plates if provided, otherwise gather all .parquet files\n", - " if specific_plates is not None:\n", - " # Validate that all specific plate files exist\n", - " for plate_path in specific_plates:\n", - " if not plate_path.exists():\n", - " raise FileNotFoundError(f\"Profile file not found: {plate_path}\")\n", - " files_to_load = specific_plates\n", - " else:\n", - " files_to_load = list(profile_dir.glob(\"*.parquet\"))\n", - " if not files_to_load:\n", - " raise FileNotFoundError(f\"No profile files found in {profile_dir}\")\n", - "\n", - " # Load and concatenate profiles\n", - " loaded_profiles = [\n", - " load_profiles(f, shared_features=shared_features) for f in files_to_load\n", - " ]\n", - "\n", - " # Concatenate all loaded profiles\n", - " return pl.concat(loaded_profiles, rechunk=True)\n", - "\n", - "\n", "def split_data(\n", " pycytominer_output: pl.DataFrame, dataset: str = \"CP_and_DP\"\n", ") -> pl.DataFrame:\n", @@ -199,16 +140,17 @@ "# Setting profiles directory\n", "profiles_dir = (data_dir / \"sc-profiles\").resolve(strict=True)\n", "\n", - "# setting 
connectivity map drug repurposing config\n", - "drug_repurposing_config_path = (data_dir / \"repurposing_drugs_20180907.txt\").resolve(\n", - " strict=True\n", - ")\n", "\n", "# Experimental metadata\n", "exp_metadata_path = (\n", " profiles_dir / \"cpjump1\" / \"cpjump1_compound_experimental-metadata.csv\"\n", ").resolve(strict=True)\n", "\n", + "# cpjump1 compound metadata\n", + "cmp_metadata_path = (\n", + " profiles_dir / \"cpjump1\" / \"cpjump1_compound_compound-metadata.tsv\"\n", + ").resolve(strict=True)\n", + "\n", "# Setting CFReT profiles directory\n", "cfret_profiles_dir = (profiles_dir / \"cfret\").resolve(strict=True)\n", "cfret_profiles_path = (\n", @@ -321,7 +263,7 @@ "id": "3df9bbf5", "metadata": {}, "source": [ - "Next we annotate the compound treatments in the CPJUMP1 dataset, we annotate each cell with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.\n" + "Next, we annotate the compound treatments in the CPJUMP1 dataset: each row is annotated with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP) and cell type. 
This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.\n" ] }, { @@ -333,14 +275,24 @@ "source": [ "# load drug repurposing moa file and add prefix to metadata columns\n", "rep_moa_df = pl.read_csv(\n", - " drug_repurposing_config_path, separator=\"\\t\", skip_rows=9, encoding=\"utf8-lossy\"\n", - ").rename(lambda x: f\"Metadata_{x}\" if not x.startswith(\"Metadata_\") else x)\n", + " cmp_metadata_path,\n", + " separator=\"\\t\",\n", + " columns=[\"Metadata_pert_iname\", \"Metadata_target\", \"Metadata_moa\"],\n", + ").unique(subset=[\"Metadata_pert_iname\"])\n", "\n", "# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname\n", "cpjump1_profiles = cpjump1_profiles.join(\n", " rep_moa_df, on=\"Metadata_pert_iname\", how=\"left\"\n", ")\n", "\n", + "# merge cell type metadata with cpjump1_profiles on Metadata_Plate\n", + "cell_type_metadata = exp_metadata.select([\"Assay_Plate_Barcode\", \"Cell_type\"]).rename(\n", + " {\"Assay_Plate_Barcode\": \"Metadata_Plate\", \"Cell_type\": \"Metadata_cell_type\"}\n", + ")\n", + "cpjump1_profiles = cpjump1_profiles.join(\n", + " cell_type_metadata, on=\"Metadata_Plate\", how=\"left\"\n", + ")\n", + "\n", "# split meta and feature\n", "meta_cols, features_cols = split_meta_and_features(cpjump1_profiles)\n", "\n", @@ -364,7 +316,7 @@ }, { "cell_type": "markdown", - "id": "4a0ba6ad", + "id": "92bacbc9", "metadata": {}, "source": [ "## Preprocessing MitoCheck Dataset\n", @@ -582,6 +534,7 @@ "# load in cfret profiles and add a unique cell ID\n", "cfret_profiles = pl.read_parquet(cfret_profiles_path)\n", "\n", + "\n", "# adding a unique cell ID based on all features\n", "cfret_profiles = add_cell_id_hash(cfret_profiles, force=True)\n", "\n", @@ -623,7 +576,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.11" + "version": "3.12.9" } }, "nbformat": 4, diff --git 
a/notebooks/0.download-data/3.subset-jump-controls.ipynb b/notebooks/0.download-data/3.subset-jump-controls.ipynb index c736f80..39e1f46 100644 --- a/notebooks/0.download-data/3.subset-jump-controls.ipynb +++ b/notebooks/0.download-data/3.subset-jump-controls.ipynb @@ -142,13 +142,8 @@ "metadata": {}, "outputs": [], "source": [ - "# setting data path\n", - "data_dir = pathlib.Path(\"../0.download-data/data\").resolve(strict=True)\n", - "download_module_results_dir = pathlib.Path(\"../0.download-data/results\").resolve(\n", - " strict=True\n", - ")\n", - "\n", "# setting directory where all the single-cell profiles are stored\n", + "data_dir = pathlib.Path.cwd() / \"data\"\n", "profiles_dir = (data_dir / \"sc-profiles\").resolve(strict=True)\n", "\n", "cpjump1_data_path = (\n", @@ -161,11 +156,6 @@ " profiles_dir / \"cpjump1\" / \"feature_selected_sc_qc_features.json\"\n", ").resolve(strict=True)\n", "\n", - "# setting cpjump1 data dir\n", - "cpjump_crispr_data_dir = (data_dir / \"sc-profiles\" / \"cpjump1-crispr-negcon\").resolve()\n", - "cpjump_crispr_data_dir.mkdir(exist_ok=True)\n", - "\n", - "\n", "# setting negative control\n", "negcon_data_dir = (profiles_dir / \"cpjump1\" / \"negcon\").resolve()\n", "negcon_data_dir.mkdir(exist_ok=True)\n", @@ -224,7 +214,7 @@ "\n", " # save the file\n", " subsampled_df.write_parquet(\n", - " negcon_data_dir / f\"cpjump1_crispr_negcon_seed{seed_val}.parquet\"\n", + " negcon_data_dir / f\"cpjump1_compound_negcon_seed{seed_val}.parquet\"\n", " )" ] }, @@ -268,7 +258,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.11" + "version": "3.12.9" } }, "nbformat": 4, diff --git a/notebooks/0.download-data/dl-configs.yaml b/notebooks/0.download-data/dl-configs.yaml new file mode 100644 index 0000000..9a698f5 --- /dev/null +++ b/notebooks/0.download-data/dl-configs.yaml @@ -0,0 +1,8 @@ +links: + MitoCheck-profiles-source: 
https://zenodo.org/records/7967386/files/3.normalize_data__normalized_data.zip?download=1 + CFReT-profiles-source: https://github.com/WayScience/cellpainting_predicts_cardiac_fibrosis/raw/refs/heads/main/3.process_cfret_features/data/single_cell_profiles/localhost230405150001_sc_feature_selected.parquet?download= + CPJUMP1-experimental-metadata-source: https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/raw/refs/heads/main/benchmark/output/experiment-metadata.tsv + CPJUMP1-compound-metadata-source: https://raw.githubusercontent.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/main/metadata/external_metadata/JUMP-Target-1_compound_metadata_targets.tsv + Broad-compounds-moa-source: https://s3.amazonaws.com/data.clue.io/repurposing/downloads/repurposing_drugs_20180907.txt + CPJUMP-plate-maps-source: https://raw.githubusercontent.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1/refs/heads/main/metadata/platemaps/2020_11_04_CPJUMP1/platemap/JUMP-Target-1_crispr_platemap.txt + CPJUMP1-profiles-source: https://cellpainting-gallery.s3.amazonaws.com/cpg0000-jump-pilot/source_4/workspace/profiles diff --git a/notebooks/0.download-data/nbconverted/1.download-data.py b/notebooks/0.download-data/nbconverted/1.download-data.py index 092cc3c..a49bb30 100644 --- a/notebooks/0.download-data/nbconverted/1.download-data.py +++ b/notebooks/0.download-data/nbconverted/1.download-data.py @@ -2,7 +2,7 @@ # # Downloading Single-Cell Profiles # -# This notebook focuses on downloading metadata and single-cell profiles from three key datasets: +# This notebook downloads metadata and single-cell profiles from three key datasets: # # 1. **CPJUMP1 Pilot Dataset** ([link](https://github.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1)): Metadata is downloaded and processed to identify and organize plates containing wells treated with compound perturbations for downstream analysis. # 2. 
**MitoCheck Dataset**: Normalized and feature-selected single-cell profiles are downloaded for further analysis. @@ -37,7 +37,7 @@ # setting config path -config_path = pathlib.Path("../nb-configs.yaml").resolve(strict=True) +config_path = pathlib.Path("dl-configs.yaml").resolve(strict=True) # setting results setting a data directory data_dir = pathlib.Path("./data").resolve() @@ -69,7 +69,7 @@ # # For this notebook, we focus on plates containing both U2OS and A549 parental cell lines that have been treated with compounds for 48 hours. More information about the batch and plate metadata can be found in the [CPJUMP1 documentation](https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/blob/main/README.md#batch-and-plate-metadata). -# In[ ]: +# In[4]: # loading config file and setting experimental metadata URL @@ -102,13 +102,73 @@ exp_metadata +# +# In this section, we download: +# +# 1. **Compound metadata** from the CPJUMP1 repository +# 2. **Mechanism of action (MOA) metadata** from the Broad Repurposing Hub +# +# We then merge both datasets into a single compound metadata table. +# +# If a compound has missing MOA information, the value in `Metadata_moa` is replaced with `"unknown"`. This indicates that no MOA annotation is currently available for that compound. 
+ +# In[5]: + + +# downloading compound metadata from cpjump1 repo +CPJUMP_compound_metadata = pl.read_csv( + nb_configs["links"]["CPJUMP1-compound-metadata-source"], + separator="\t", + has_header=True, + encoding="utf-8", +) + +# downloading compound moa metadata from broad institute drug repurposing hub +broad_compound_moa_metadata = pl.read_csv( + nb_configs["links"]["Broad-compounds-moa-source"], + separator="\t", + skip_rows=9, + encoding="utf8-lossy", +) + +# for both dataframes make sure that all columns have "Metadata_" in the column name +CPJUMP_compound_metadata = CPJUMP_compound_metadata.rename( + {col: f"Metadata_{col}" for col in CPJUMP_compound_metadata.columns} +) +broad_compound_moa_metadata = broad_compound_moa_metadata.rename( + {col: f"Metadata_{col}" for col in broad_compound_moa_metadata.columns} +) + +# replace null values in the broad compound moa with "unknown" +broad_compound_moa_metadata = broad_compound_moa_metadata.with_columns( + pl.col("Metadata_moa").fill_null("unknown") +) + + +# In[6]: + + +complete_compound_metadata = CPJUMP_compound_metadata.join( + broad_compound_moa_metadata, + left_on="Metadata_pert_iname", + right_on="Metadata_pert_iname", + how="left", +) + + +# save the complete compound metadata as a tsv file +complete_compound_metadata.write_csv( + cpjump1_dir / f"cpjump1_{pert_type}_compound-metadata.tsv", separator="\t" +) + + # ## Downloading MitoCheck Data # # In this section, we download the MitoCheck data generated in [this study](https://pmc.ncbi.nlm.nih.gov/articles/PMC3108885/). # # Specifically, we are downloading data that has already been normalized and feature-selected. The normalization and feature selection pipeline is available [here](https://github.com/WayScience/mitocheck_data/tree/main/3.normalize_data). 
-# In[5]: +# In[7]: # url source for the MitoCheck data @@ -122,13 +182,13 @@ # ## Downloading CFReT Data # -# In this section, we download feature-selected single-cell profiles from the CFReT plate `localhost230405150001`. This plate contains three treatments: DMSO (control), drug_x, and TGFRi. The dataset consists of high-content imaging data that has already undergone feature selection, making it suitable for downstream analysis. +# This section downloads and saves feature-selected single-cell profiles from the CFReT plate `localhost230405150001`. # -# **Key Points:** -# - Only the processed single-cell profiles are downloaded [here](https://github.com/WayScience/cellpainting_predicts_cardiac_fibrosis/tree/main/3.process_cfret_features/data/single_cell_profiles) -# - The CFReT dataset was used and published in [this study](https://doi.org/10.1161/CIRCULATIONAHA.124.071956). +# - Only processed single-cell profiles are downloaded (no raw data). +# - Data is saved as a Parquet file for fast access. +# - Used in published cardiac fibrosis research ([study link](https://doi.org/10.1161/CIRCULATIONAHA.124.071956)). 
-# In[ ]: +# In[8]: # setting the source for the CFReT data diff --git a/notebooks/0.download-data/nbconverted/2.preprocessing.py b/notebooks/0.download-data/nbconverted/2.preprocessing.py index 5004e89..5c20011 100644 --- a/notebooks/0.download-data/nbconverted/2.preprocessing.py +++ b/notebooks/0.download-data/nbconverted/2.preprocessing.py @@ -26,7 +26,7 @@ sys.path.append("../../") from utils.data_utils import add_cell_id_hash, split_meta_and_features -from utils.io_utils import load_profiles +from utils.io_utils import load_and_concat_profiles # ## Helper functions # @@ -35,64 +35,6 @@ # In[2]: -def load_and_concat_profiles( - profile_dir: str | pathlib.Path, - shared_features: list[str] | None = None, - specific_plates: list[pathlib.Path] | None = None, -) -> pl.DataFrame: - """ - Load all profile files from a directory and concatenate them into a single Polars DataFrame. - - Parameters - ---------- - profile_dir : str or pathlib.Path - Directory containing the profile files (.parquet). - shared_features : Optional[list[str]], optional - List of shared feature names to filter the profiles. If None, all features are loaded. - specific_plates : Optional[list[pathlib.Path]], optional - List of specific plate file paths to load. If None, all profiles in the directory are loaded. - - Returns - ------- - pl.DataFrame - Concatenated Polars DataFrame containing all loaded profiles. 
- """ - # Ensure profile_dir is a pathlib.Path - if isinstance(profile_dir, str): - profile_dir = pathlib.Path(profile_dir) - elif not isinstance(profile_dir, pathlib.Path): - raise TypeError("profile_dir must be a string or a pathlib.Path object") - - # Validate specific_plates - if specific_plates is not None: - if not isinstance(specific_plates, list): - raise TypeError("specific_plates must be a list of pathlib.Path objects") - if not all(isinstance(path, pathlib.Path) for path in specific_plates): - raise TypeError( - "All elements in specific_plates must be pathlib.Path objects" - ) - - # Use specific_plates if provided, otherwise gather all .parquet files - if specific_plates is not None: - # Validate that all specific plate files exist - for plate_path in specific_plates: - if not plate_path.exists(): - raise FileNotFoundError(f"Profile file not found: {plate_path}") - files_to_load = specific_plates - else: - files_to_load = list(profile_dir.glob("*.parquet")) - if not files_to_load: - raise FileNotFoundError(f"No profile files found in {profile_dir}") - - # Load and concatenate profiles - loaded_profiles = [ - load_profiles(f, shared_features=shared_features) for f in files_to_load - ] - - # Concatenate all loaded profiles - return pl.concat(loaded_profiles, rechunk=True) - - def split_data( pycytominer_output: pl.DataFrame, dataset: str = "CP_and_DP" ) -> pl.DataFrame: @@ -167,16 +109,17 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Setting profiles directory profiles_dir = (data_dir / "sc-profiles").resolve(strict=True) -# setting connectivity map drug repurposing config -drug_repurposing_config_path = (data_dir / "repurposing_drugs_20180907.txt").resolve( - strict=True -) # Experimental metadata exp_metadata_path = ( profiles_dir / "cpjump1" / "cpjump1_compound_experimental-metadata.csv" ).resolve(strict=True) +# cpjump1 compound metadta +cmp_metadata_path = ( + profiles_dir / "cpjump1" / 
"cpjump1_compound_compound-metadata.tsv" +).resolve(strict=True) + # Setting CFReT profiles directory cfret_profiles_dir = (profiles_dir / "cfret").resolve(strict=True) cfret_profiles_path = ( @@ -258,7 +201,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr cpjump1_profiles = add_cell_id_hash(cpjump1_profiles) -# Next, we annotate the compound treatments in the CPJUMP1 dataset. We annotate each cell with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status. +# Next, we annotate the compound treatments in the CPJUMP1 dataset: each row is annotated with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP) and cell type. This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status. 
# # In[6]: @@ -266,14 +209,24 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # load drug repurposing moa file and add prefix to metadata columns rep_moa_df = pl.read_csv( - drug_repurposing_config_path, separator="\t", skip_rows=9, encoding="utf8-lossy" -).rename(lambda x: f"Metadata_{x}" if not x.startswith("Metadata_") else x) + cmp_metadata_path, + separator="\t", + columns=["Metadata_pert_iname", "Metadata_target", "Metadata_moa"], +).unique(subset=["Metadata_pert_iname"]) # merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname cpjump1_profiles = cpjump1_profiles.join( rep_moa_df, on="Metadata_pert_iname", how="left" ) +# merge cell type metadata with cpjump1_profiles on Metadata_Plate +cell_type_metadata = exp_metadata.select(["Assay_Plate_Barcode", "Cell_type"]).rename( + {"Assay_Plate_Barcode": "Metadata_Plate", "Cell_type": "Metadata_cell_type"} +) +cpjump1_profiles = cpjump1_profiles.join( + cell_type_metadata, on="Metadata_Plate", how="left" +) + # split meta and feature meta_cols, features_cols = split_meta_and_features(cpjump1_profiles) @@ -472,6 +425,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # load in cfret profiles and add a unique cell ID cfret_profiles = pl.read_parquet(cfret_profiles_path) + # adding a unique cell ID based on all features cfret_profiles = add_cell_id_hash(cfret_profiles, force=True) diff --git a/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py b/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py index 0be0975..31283ad 100644 --- a/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py +++ b/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py @@ -113,13 +113,8 @@ def load_group_stratified_data( # In[3]: -# setting data path -data_dir = pathlib.Path("../0.download-data/data").resolve(strict=True) -download_module_results_dir = pathlib.Path("../0.download-data/results").resolve( - 
strict=True -) - # setting directory where all the single-cell profiles are stored +data_dir = pathlib.Path.cwd() / "data" profiles_dir = (data_dir / "sc-profiles").resolve(strict=True) cpjump1_data_path = ( @@ -132,11 +127,6 @@ def load_group_stratified_data( profiles_dir / "cpjump1" / "feature_selected_sc_qc_features.json" ).resolve(strict=True) -# setting cpjump1 data dir -cpjump_crispr_data_dir = (data_dir / "sc-profiles" / "cpjump1-crispr-negcon").resolve() -cpjump_crispr_data_dir.mkdir(exist_ok=True) - - # setting negative control negcon_data_dir = (profiles_dir / "cpjump1" / "negcon").resolve() negcon_data_dir.mkdir(exist_ok=True) @@ -175,7 +165,7 @@ def load_group_stratified_data( # save the file subsampled_df.write_parquet( - negcon_data_dir / f"cpjump1_crispr_negcon_seed{seed_val}.parquet" + negcon_data_dir / f"cpjump1_compound_negcon_seed{seed_val}.parquet" ) diff --git a/utils/data_utils.py b/utils/data_utils.py index 64f24a5..c8ef769 100644 --- a/utils/data_utils.py +++ b/utils/data_utils.py @@ -1,8 +1,9 @@ """ -Module: utils.py +Module: data_utils.py -A collection of common utility functions for data processing, -as well as for saving, loading, and writing files. +Utility functions for processing image-based single-cell profiles, including +feature/metadata splitting, hash-based cell ID generation, data shuffling, +consensus signature generation, and profile loading/concatenation. """ import hashlib @@ -15,139 +16,17 @@ from pycytominer.cyto_utils import infer_cp_features -def _sort_features_by_compartment_organelles( - features: list[str], - compartment_pos: int = 0, - organelle_pos: int = 3, - organelles: list[str] = ["DNA", "RNA", "ER", "Mito", "AGP"], -) -> dict: - """Sort features by compartment and organelle. - - This function takes a list of feature names and organizes them into a nested dictionary - structure where the first level is compartments and the second level is organelles. 
- It filters out features that do not match the specified organelle list. - - Parameters - ---------- - features : list[str] - list of morpholgy features - compartment_pos : int, optional - position where the compartment name resides with the feature name - , by default 0 - organelle_pos : int, optional - position where the organelle name resides within the feature name - , by default 3 - organelles : list[str], optional - List of organelles that are measured in the feature space, - by default ["DNA", "RNA", "ER", "Mito", "AGP"] - - Returns - ------- - dict - Nested dictionary: compartment -> organelle -> features - """ - - result = defaultdict(list) - for feature in features: - # Skip AreaShape features as they don't contain organelle information - if "AreaShape" in feature: - continue - - # Split feature name and validate structure - split_feature = feature.split("_") - if len(split_feature) < 4: - continue - - # Extract compartment and organelle from feature name - compartment = split_feature[compartment_pos] - organelle = split_feature[organelle_pos] - - # Only include features with valid organelles - if organelle in organelles: - result[compartment].append(feature) - - # Create nested dictionary: compartment -> organelle -> features - compartment_organelle_dict = defaultdict(dict) - for compartment, features_list in result.items(): - organelle_dict = defaultdict(list) - - # Group features by organelle within each compartment - for feature in features_list: - organelle = feature.split("_")[organelle_pos] - organelle_dict[organelle].append(feature) - - compartment_organelle_dict[compartment] = organelle_dict - - return compartment_organelle_dict - - -def _generate_organelle_counts(compartment_organelle_dict: dict) -> dict: - """Generate a count of organelles per compartment for each gene. 
- - This function processes a nested dictionary containing gene signatures organized - by compartment and organelle, and returns the count of features for each - organelle within each compartment for every gene. - - Parameters - ---------- - compartment_organelle_dict : dict - Nested dictionary structure: - gene -> signature_type -> compartment -> organelle -> list of features - Where signature_type is 'on_morph_sig' or 'off_morph_sig' - - Returns - ------- - dict - Dictionary structure: gene -> signature_type -> compartment -> organelle -> count - Where count is the number of features for each organelle in each compartment - - Raises - ------ - TypeError - If the organelle_dict for any gene is not a dictionary - """ - # Initialize a nested dictionary to hold the counts - # This will be structured as: gene -> signature_type -> compartment -> organelle -> count - feature_count_per_organelle = defaultdict(lambda: defaultdict(dict)) - - # Iterate through every gene's on and off morphology signatures that are sorted by - # compartment and organelle - for gene, signature_dict in compartment_organelle_dict.items(): - if not isinstance(signature_dict, dict): - raise TypeError( - f"Expected signature_dict to be a dict for gene {gene}, got {type(signature_dict)}" - ) - - # Process each signature type (on_morph_sig, off_morph_sig) - counted_organelle_per_signature = defaultdict(dict) - for sig_type, compartment_dict in signature_dict.items(): - # For each compartment-organelle combination, count the number of features - counted_organelle_dict = defaultdict(dict) - for compartment, organelle_dict in compartment_dict.items(): - for organelle, features in organelle_dict.items(): - counted_organelle_dict[compartment][organelle] = len(features) - counted_organelle_per_signature[sig_type] = counted_organelle_dict - - # Store the counted organelle dictionary per gene and signature type - feature_count_per_organelle[gene] = counted_organelle_per_signature - - return 
feature_count_per_organelle - - def split_meta_and_features( profile: pd.DataFrame | pl.DataFrame, compartments: list[str] = ["Nuclei", "Cells", "Cytoplasm"], metadata_tag: bool | None = False, ) -> tuple[list[str], list[str]]: - """Splits metadata and feature column names - - This function takes a DataFrame containing image-based profiles and splits - the column names into metadata and feature columns. It uses the Pycytominer's - `infer_cp_features` function to identify feature columns based on the specified compartments. - If the `metadata_tag` is set to False, it assumes that metadata columns do not have a specific tag - and identifies them by excluding feature columns. If `metadata_tag` is True, it uses - the `infer_cp_features` function with the `metadata` argument set to True. + """Split column names of an image-based profile into metadata and feature lists. + Uses pycytominer's `infer_cp_features` to identify CellProfiler feature columns + based on the specified compartments. Metadata columns are identified as all + remaining columns not in the feature set (when `metadata_tag=False`), or via + `infer_cp_features(metadata=True)` when `metadata_tag=True`. Parameters ---------- @@ -167,8 +46,8 @@ def split_meta_and_features( Notes ----- - - If a polars DataFrame is provided, it will be converted to a pandas DataFrame in order - to maintain compatibility with the `infer_cp_features` function. + - If a polars DataFrame is provided, it will be converted to a pandas DataFrame in + order to maintain compatibility with the `infer_cp_features` function. 
""" # type checking @@ -183,7 +62,8 @@ def split_meta_and_features( # identify features names features_cols = infer_cp_features(profile, compartments=compartments) - # iteratively search metadata features and retain order if the Metadata tag is not added + # iteratively search metadata features and retain order if the Metadata tag is not + # added if metadata_tag is False: meta_cols = [ colname @@ -196,256 +76,22 @@ def split_meta_and_features( return (meta_cols, features_cols) -def group_signature_by_compartment(signatures: dict, compartment_pos: int = 0): - """Group gene features in each signature by their compartment. - - This function takes a dictionary of gene signatures and groups the features - by their compartment. The compartment is determined by the position in the - feature string, which is specified by the `compartment_pos` parameter. - - Parameters - ---------- - signatures : dict - A dictionary containing gene signatures. - compartment_pos : int, optional - The position of the compartment in the feature string, by default 0 - - Returns - ------- - dict - A dictionary with genes as keys and their grouped features as values. 
- The structure is: gene --> signature_type -> compartment -> features - """ - # Type validation - if not isinstance(signatures, dict): - raise TypeError("signatures must be a dictionary") - if not isinstance(compartment_pos, int): - raise TypeError("compartment_pos must be an integer") - - # Initialize the result dictionary - gene_signature_grouped_by_compartment = defaultdict(lambda: defaultdict(dict)) - - # Process each gene and its signatures - for gene, signature_dict in signatures.items(): - # get features from each signature type - for sig_type, features in signature_dict.items(): - # Group features by compartment for this signature type - compartment_groups = defaultdict(list) - for feature in features: - try: - compartment = feature.split("_")[compartment_pos] - compartment_groups[compartment].append(feature) - - # Handle features that don't have enough parts when split - except IndexError: - continue - - # Store the grouped features - gene_signature_grouped_by_compartment[gene][sig_type] = dict( - compartment_groups - ) - - return gene_signature_grouped_by_compartment - - -def group_features_by_compartment_organelle( - signatures: dict, - compartments: list[str] = ["Nuclei", "Cytoplasm", "Cells"], - organelles: list[str] = ["DNA", "RNA", "ER", "Mito", "AGP"], - compartment_pos: int = 0, - organelle_pos: int = 3, -) -> dict: - """Group features by compartment and organelle from gene on- and off-morphology - signatures. - - This function processes on- off- signatures of each gene to organize morphological - features into nested dictionaries based on compartment and organelle groupings. - It applies validation checks and uses the helper function `_sort_compartment_organelles` - to structure the data. - - Keep note that some features are removed since this function is solely looking - for features that contain organelle information. For example, features that have AreaShape - measurements do not contain organelle information and therefore are excluded. 
- - Parameters - ---------- - signatures : dict - Dictionary where keys are gene names and values are dictionaries containing - 'on_morph_sig' and 'off_morph_sig' lists of morphological features - compartments : list[str], optional - List of valid compartment names, by default ["Nuclei", "Cytoplasm", "Cells"] - organelles : list[str], optional - List of valid organelle names, by default ["DNA", "RNA", "ER", "Mito", "AGP"] - compartment_pos : int, optional - Position index for compartment name in feature string, by default 0 - organelle_pos : int, optional - Position index for organelle name in feature string, by default 3 - - Returns - ------- - dict - Nested dictionary structure: - gene -> {'on_morph_sig': {compartment: {organelle: [features]}}, - 'off_morph_sig': {compartment: {organelle: [features]}}} - - Raises - ------ - TypeError - If signatures is not a dict with proper structure, or if compartments/organelles - are not lists of strings, or if position parameters are not integers - ValueError - If position parameters are negative or equal to each other - """ - - # type checking for compartments and organelles - if not isinstance(signatures, dict): - raise TypeError("Signatures must be a dictionary.") - if not isinstance(compartments, list) or not isinstance(organelles, list): - raise TypeError("Compartments and organelles must be lists.") - if not all(isinstance(compartment, str) for compartment in compartments): - raise TypeError("All compartments must be strings.") - if not all(isinstance(organelle, str) for organelle in organelles): - raise TypeError("All organelles must be strings.") - if not isinstance(compartment_pos, int) or not isinstance(organelle_pos, int): - raise TypeError("Compartment and organelle positions must be integers.") - if compartment_pos < 0 or organelle_pos < 0: - raise ValueError("Compartment and organelle positions must be non-negative.") - if compartment_pos == organelle_pos: - raise ValueError("Compartment and organelle positions 
must be different.") - - # Group features by compartment that contain organelle information - sorted_compartment_and_organelle_per_gene = defaultdict(dict) - for gene, signature_dict in signatures.items(): - # extracting features from signatures - on_sig_features = _sort_features_by_compartment_organelles( - signature_dict["on_morph_sig"] - ) - off_sig_features = _sort_features_by_compartment_organelles( - signature_dict["off_morph_sig"] - ) - - # Combine the sorted features for the gene - sorted_compartment_and_organelle_per_gene[gene] = { - "on_morph_sig": on_sig_features, - "off_morph_sig": off_sig_features, - } - - return sorted_compartment_and_organelle_per_gene - - -def organelle_count_table_per_gene( - sorted_signatures: dict, stratify_by_compartment: bool = False -) -> pd.DataFrame: - """Generate a count table of organelles per gene from morphological signatures. - - This function processes gene signatures that have been organized by compartment - and organelle to create a summary table showing the count of features for each - organelle within each gene's on- and off-morphology signatures. - - Parameters - ---------- - sorted_signatures : dict - Nested dictionary structure containing gene signatures organized by compartment - and organelle. Expected format: - gene -> signature_type -> compartment -> organelle -> list of features - where signature_type is 'on_morph_sig' or 'off_morph_sig' - stratify_by_compartment : bool, optional - If True, creates separate columns for each compartment-organelle combination - (e.g., "Cyto_DNA", "Nuc_RNA"). If False, sums counts across all compartments - for each organelle, by default False - - Returns - ------- - pd.DataFrame - DataFrame with organelle counts per gene and signature type. 
Structure depends - on stratify_by_compartment parameter: - - If True: columns are compartment_organelle combinations (e.g., "Cyto_DNA") - - If False: columns are organelle names with counts summed across compartments - Index contains gene names, with 'sig_type' column indicating 'on' or 'off' - - Notes - ----- - - Each gene will have two rows in the output: one for 'on' signatures and one for 'off' - - Compartment names are abbreviated: "Cytoplasm" -> "Cyto", "Nuclei" -> "Nuc" - - Missing organelle counts are filled with 0 - - The function uses the helper function `_generate_organelle_counts` to process - the input data structure - - - """ - # count organelles per compartment - organelle_counts = _generate_organelle_counts(sorted_signatures) - - # initialize an empty DataFrame to hold the counts - organelle_counted_per_gene = pd.DataFrame() - - # iterate through each gene and its morphological signatures - for gene, morph_signatures in organelle_counts.items(): - # iterate through each signature type (on_morph_sig, off_morph_sig) - for sig_type, compartment_organelle_counts in morph_signatures.items(): - # convert nested dict to DataFrame with compartments as index and organelles as columns - count_table = ( - pd.DataFrame.from_dict(compartment_organelle_counts, orient="index") - .fillna(0) - .astype(int) - ) - - if stratify_by_compartment: - # create compartment-organelle combinations as columns - flattened_data = [] - column_names = [] - - for compartment in count_table.index: - # abbreviate compartment names - compartment_abbrev = ( - "Cyto" - if compartment == "Cytoplasm" - else "Nuc" - if compartment == "Nuclei" - else compartment - ) - - # add compartment-organelle combinations - for organelle in count_table.columns: - column_names.append(f"{compartment_abbrev}_{organelle}") - flattened_data.append(count_table.loc[compartment, organelle]) - - # create DataFrame with flattened structure - gene_row = pd.DataFrame( - [flattened_data], columns=column_names, 
index=[gene] - ) - else: - # sum counts across all compartments for each organelle - gene_row = count_table.sum().to_frame().T - gene_row.index = [gene] - - # add signature type column - gene_row.insert(0, "sig_type", sig_type.split("_")[0]) - - # concatenate to main DataFrame - organelle_counted_per_gene = pd.concat( - [organelle_counted_per_gene, gene_row] - ).fillna(0) - - return organelle_counted_per_gene - - def generate_consensus_signatures( signatures_dict, features: list[str], min_consensus_threshold=0.5 ) -> dict: """ - Generate consensus morphological signatures from multiple comparisons. + Generate consensus on/off morphological signatures across multiple comparisons. - This function aggregates on-morphology signatures across different negative control samples - for each positive control, finding features that consistently appear across multiple comparisons. - The off-morphology signatures are then defined as the complement of on-morphology features - from the full feature set. + For each positive control, aggregates on-morphology features across all + comparisons and retains only those features that appear in at least + `min_consensus_threshold` fraction of comparisons. Off-morphology features are + defined as all features NOT in the consensus on-set. 
Parameters ---------- signatures_dict : dict Dictionary containing signature results with structure: - {comparison_id: {"controls": {"positive": gene, "negative": seed}, + {comparison_id: {"controls": {"positive": label, "negative": seed}, "signatures": {"on": [...], "off": [...]}}} features : list[str] Complete list of all available morphological features @@ -457,8 +103,9 @@ def generate_consensus_signatures( ------- dict Dictionary with structure: - {gene: {"on": [feature1, feature2, ...], "off": [feature1, feature2, ...]}} - where "off" features are the complement of "on" features from the full feature set + {label: {"on": [feature1, feature2, ...], "off": [feature1, feature2, ...]}} + where "off" features are the complement of "on" features from the full feature + set Raises ------ @@ -471,29 +118,30 @@ def generate_consensus_signatures( # Input validation if not 0.0 <= min_consensus_threshold <= 1.0: raise ValueError( - f"min_consensus_threshold must be between 0.0 and 1.0, got {min_consensus_threshold}" + "min_consensus_threshold must be between 0.0 and 1.0, " + f"got {min_consensus_threshold}" ) if not signatures_dict: return {} - # Group on-morphology signatures by positive control gene - on_signatures_by_gene = defaultdict(list) + # Group on-morphology signatures by positive control label + on_signatures_by_label = defaultdict(list) try: for _, sig_results in signatures_dict.items(): positive_control = sig_results["controls"]["positive"] on_signature_features = sig_results["signatures"]["on"] - on_signatures_by_gene[positive_control].append(on_signature_features) + on_signatures_by_label[positive_control].append(on_signature_features) except KeyError as e: raise KeyError(f"Missing required key in signatures_dict: {e}") - # Generate consensus signatures for each gene + # Generate consensus signatures for each label consensus_signatures = {} full_features_set = set(features) - for gene, feature_lists in on_signatures_by_gene.items(): + for label, 
feature_lists in on_signatures_by_label.items(): # Calculate consensus on-features if not feature_lists: consensus_on_features = [] @@ -529,7 +177,7 @@ def generate_consensus_signatures( ) # Store results - consensus_signatures[gene] = { + consensus_signatures[label] = { "on": consensus_on_features, "off": consensus_off_features, } @@ -566,10 +214,6 @@ def add_cell_id_hash( If True, overwrites existing 'Metadata_cell_id' column. If False and the column exists, returns the DataFrame unchanged with a warning message, by default False. - null_replacement : str, optional - String to represent null values in the hash (does not modify original data), - by default "NULL". - Returns ------- pl.DataFrame @@ -603,7 +247,7 @@ def add_cell_id_hash( else: profiles = profiles.drop("Metadata_cell_id") - # Create hash column using temporary null-filled versionx + # Create hash column using a null-filled version of each column (nulls → "NULL") hash_column = ( pl.concat_str( [pl.col(col).cast(pl.Utf8).fill_null("NULL") for col in profiles.columns] @@ -627,11 +271,12 @@ def shuffle_feature_profiles( seed: int = 42, ) -> pl.DataFrame: """ - Create a shuffled version of the dataset where each morphological feature - column is independently shuffled (values permuted within each column). + Return a shuffled copy of the profiles DataFrame for use as a null baseline. - This breaks the correlation structure between features while preserving - the marginal distributions, creating a null baseline for comparison. + - ``method="row"``: shuffles entire rows, preserving feature correlations within + cells. + - ``method="column"``: shuffles each feature column independently, breaking + inter-feature correlations while preserving each feature's marginal distribution. 
def split_data(
    pycytominer_output: pl.DataFrame, dataset: str = "CP_and_DP"
) -> pl.DataFrame:
    """
    Filter a pycytominer output DataFrame to retain only metadata and the
    selected feature modality columns.

    Parameters
    ----------
    pycytominer_output : pl.DataFrame
        Polars DataFrame from pycytominer containing both metadata and feature columns.
    dataset : str, optional
        Feature modality to retain. One of:
        - ``"CP"`` — CellProfiler features only (columns containing ``"CP__"``)
        - ``"DP"`` — DeepProfiler features only (columns containing ``"DP__"``)
        - ``"CP_and_DP"`` — both modalities (default)

    Returns
    -------
    pl.DataFrame
        Polars DataFrame with metadata and selected feature columns.

    Raises
    ------
    ValueError
        If `dataset` is not one of "CP", "DP", or "CP_and_DP".
    """
    all_cols = pycytominer_output.columns

    # Map each dataset choice to the substring marking its feature columns.
    # "P__" matches both "CP__" and "DP__", i.e. both modalities at once.
    modality_markers = {"CP": "CP__", "DP": "DP__", "CP_and_DP": "P__"}
    if dataset not in modality_markers:
        raise ValueError(
            f"Invalid dataset '{dataset}'. Choose from 'CP', 'DP', or 'CP_and_DP'."
        )
    marker = modality_markers[dataset]
    feature_cols = [col for col in all_cols if marker in col]

    # Metadata columns are all columns belonging to neither feature modality
    metadata_cols = [col for col in all_cols if "P__" not in col]

    # Select metadata first, then the requested feature columns
    return pycytominer_output.select(metadata_cols + feature_cols)


def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
    """
    Strip a feature-modality prefix from all matching column names.

    For example, ``"CP__Cells_AreaShape_Area"`` becomes ``"Cells_AreaShape_Area"``
    when ``prefix="CP__"``.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame whose column names may start with the prefix.
    prefix : str, default ``"CP__"``
        Prefix string to strip from matching column names.

    Returns
    -------
    pl.DataFrame
        DataFrame with the prefix removed from all matching column names.

    Notes
    -----
    Only a *leading* prefix is removed (via ``str.removeprefix``). A blanket
    ``str.replace(prefix, "")`` would also delete occurrences of the prefix
    appearing mid-name and could corrupt column names; columns without the
    leading prefix are returned unchanged.
    """
    return df.rename(lambda name: name.removeprefix(prefix))
""" # type checking @@ -86,10 +96,12 @@ def load_profiles( if verbose: print(f"Loading profiles from {fpath}...") print( - f"Loaded profiles shape: rows: {loaded_profiles.shape[0]}, columns: {loaded_profiles.shape[1]}" + f"Loaded profiles shape: rows: {loaded_profiles.shape[0]}, " + f"columns: {loaded_profiles.shape[1]}" ) print( - f"Estimated loaded dataframe size: {round(loaded_profiles.estimated_size('mb'), 2)} MB" + "Estimated loaded dataframe size:", + f"{round(loaded_profiles.estimated_size('mb'), 2)} MB", ) return loaded_profiles @@ -97,6 +109,7 @@ def load_profiles( def load_configs(fpath: str | pathlib.Path) -> dict: """Load a configuration file and return its contents as a dictionary. + Parameters ---------- fpath : str or pathlib.Path @@ -143,7 +156,8 @@ def load_configs(fpath: str | pathlib.Path) -> dict: raise ValueError(f"Error parsing pickle file {fpath}: {e}") else: raise ValueError( - f"Unsupported file format: {fpath.suffix}. Expected .yaml, .json, .pkl, or .pickle" + f"Unsupported file format: {fpath.suffix}. Expected .yaml, .json, .pkl, or " + ".pickle" ) return config @@ -153,11 +167,9 @@ def download_file( output_path: pathlib.Path | str, chunk_size: int = 8192, ) -> pathlib.Path: - """Downloads a file from a URL with progress tracking. + """Download a file from a URL and save it to disk with progress tracking. - Downloads a file from the specified URL and saves it to the given output path. - The download is performed in chunks to handle large files efficiently, and the progress is displayed using - the `tqdm` library. + Downloads in chunks for memory efficiency. Progress is displayed via `tqdm`. 
def download_compressed_file(
    source_url: str,
    output_path: pathlib.Path | str,
    chunk_size: int = 8192,
    extract: bool = True,
) -> pathlib.Path:
    """
    Download a file from a URL and optionally extract it.

    Parameters
    ----------
    source_url : str
        URL of the file to download.
    output_path : pathlib.Path | str
        Local path where the downloaded file will be saved.
        Must include the correct file extension (e.g. ``.zip``) so the archive
        format can be inferred during extraction.
    chunk_size : int, optional
        Download chunk size in bytes, by default 8192.
    extract : bool, optional
        If True, extracts the archive after downloading, by default True.

    Returns
    -------
    pathlib.Path
        Path to the downloaded file (not the extraction directory).
    """
    # delegate the streaming download (with progress bar) to download_file
    downloaded_path = download_file(source_url, output_path, chunk_size)

    if extract:
        # extracts next to the archive; archive format inferred from suffix
        extract_file(downloaded_path)

    return downloaded_path


def load_and_concat_profiles(
    profile_dir: str | pathlib.Path,
    shared_features: list[str] | None = None,
    specific_plates: list[pathlib.Path] | None = None,
) -> pl.DataFrame:
    """
    Load all profile files from a directory and concatenate them into a single
    Polars DataFrame.

    Parameters
    ----------
    profile_dir : str or pathlib.Path
        Directory containing the profile files (.parquet).
    shared_features : list[str] | None, optional
        List of shared feature names to filter the profiles. If None, all
        features are loaded.
    specific_plates : list[pathlib.Path] | None, optional
        List of specific plate file paths to load. If None, all .parquet
        profiles in `profile_dir` are loaded.

    Returns
    -------
    pl.DataFrame
        Concatenated Polars DataFrame containing all loaded profiles.

    Raises
    ------
    TypeError
        If `profile_dir` is not a str/Path, or `specific_plates` is not a list
        of pathlib.Path objects.
    ValueError
        If `specific_plates` is an empty list.
    FileNotFoundError
        If a requested plate file does not exist, or no .parquet files are
        found in `profile_dir`.
    """
    # normalize profile_dir to a pathlib.Path
    if isinstance(profile_dir, str):
        profile_dir = pathlib.Path(profile_dir)
    elif not isinstance(profile_dir, pathlib.Path):
        raise TypeError("profile_dir must be a string or a pathlib.Path object")

    if specific_plates is not None:
        # validate the explicit plate list before touching the filesystem
        if not isinstance(specific_plates, list):
            raise TypeError("specific_plates must be a list of pathlib.Path objects")
        if not all(isinstance(path, pathlib.Path) for path in specific_plates):
            raise TypeError(
                "All elements in specific_plates must be pathlib.Path objects"
            )
        # fail early with a clear message instead of an opaque pl.concat error
        if not specific_plates:
            raise ValueError("specific_plates must contain at least one file path")
        for plate_path in specific_plates:
            if not plate_path.exists():
                raise FileNotFoundError(f"Profile file not found: {plate_path}")
        files_to_load = specific_plates
    else:
        # sort so concatenation order is deterministic across filesystems
        # (Path.glob yields entries in arbitrary, OS-dependent order)
        files_to_load = sorted(profile_dir.glob("*.parquet"))
        if not files_to_load:
            raise FileNotFoundError(f"No profile files found in {profile_dir}")

    # load each plate, optionally restricted to the shared feature set
    loaded_profiles = [
        load_profiles(f, shared_features=shared_features) for f in files_to_load
    ]

    # concatenate and rechunk for contiguous memory layout
    return pl.concat(loaded_profiles, rechunk=True)
-""" - -from typing import Any - -# Global static variables for valid parameter names and types -VALID_CLUSTER_PARAMS: set[str] = { - "cluster_method", - "cluster_resolution", - "dim_reduction", - "n_neighbors", - "neighbor_distance_metric", - "pca_variance_explained", - "pca_n_components_to_capture_variance", - "pca_svd_solver", -} - -VALID_PARAM_TYPES: set[str] = {"float", "int", "categorical"} - - -def _validate_param_grid(param_grid: dict[str, Any]) -> None: - """Validate the parameter grid for optimized_clustering function. - - This function checks that the provided param_grid contains valid parameter names - and types for the cluster_profiles function. It raises a ValueError if any invalid - parameters are found. - - Parameters - ---------- - param_grid : dict[str, Any] - Dictionary defining the parameter search space. Each key should be a parameter - name from cluster_profiles, and each value should be a dictionary with 'type' - and range info. - - Raises - ------ - ValueError - If param_grid contains unsupported parameter types or invalid parameter names. - TypeError - If param_grid structure is invalid (missing required keys, wrong value types). - """ - - for param_name, param_config in param_grid.items(): - # 1. Check if parameter name is valid - if param_name not in VALID_CLUSTER_PARAMS: - raise ValueError( - f"Invalid parameter name: '{param_name}'. " - f"Valid parameters are: {sorted(VALID_CLUSTER_PARAMS)}" - ) - - # 2. Check if param_config is a dictionary - if not isinstance(param_config, dict): - raise TypeError( - f"Parameter config for '{param_name}' must be a dictionary, " - f"got {type(param_config).__name__}" - ) - - # 3. Check if 'type' key exists - if "type" not in param_config: - raise TypeError( - f"Parameter config for '{param_name}' must contain a 'type' key" - ) - - param_type = param_config["type"] - - # 4. 
Check if type is valid - if param_type not in VALID_PARAM_TYPES: - raise ValueError( - f"Invalid parameter type '{param_type}' for '{param_name}'. " - f"Valid types are: {sorted(VALID_PARAM_TYPES)}" - ) - - # 5. Validate type-specific requirements - if param_type in ["float", "int"]: - # Check for 'low' and 'high' keys - if "low" not in param_config: - raise TypeError( - f"Parameter config for '{param_name}' with type '{param_type}' " - f"must contain a 'low' key" - ) - if "high" not in param_config: - raise TypeError( - f"Parameter config for '{param_name}' with type '{param_type}' " - f"must contain a 'high' key" - ) - - # Check that low and high are numbers - if not isinstance(param_config["low"], (int, float)): - raise TypeError( - f"'low' value for '{param_name}' must be a number, " - f"got {type(param_config['low']).__name__}" - ) - if not isinstance(param_config["high"], (int, float)): - raise TypeError( - f"'high' value for '{param_name}' must be a number, " - f"got {type(param_config['high']).__name__}" - ) - - # Check that low < high - if param_config["low"] >= param_config["high"]: - raise ValueError( - f"'low' must be less than 'high' for '{param_name}'. 
" - f"Got low={param_config['low']}, high={param_config['high']}" - ) - - # If 'log' is present, check it's a boolean - if "log" in param_config and not isinstance(param_config["log"], bool): - raise TypeError( - f"'log' value for '{param_name}' must be a boolean, " - f"got {type(param_config['log']).__name__}" - ) - - elif param_type == "categorical": - # Check for 'choices' key - if "choices" not in param_config: - raise TypeError( - f"Parameter config for '{param_name}' with type 'categorical' " - f"must contain a 'choices' key" - ) - - # Check that choices is a list or tuple - if not isinstance(param_config["choices"], (list, tuple)): - raise TypeError( - f"'choices' for '{param_name}' must be a list or tuple, " - f"got {type(param_config['choices']).__name__}" - ) - - # Check that choices is not empty - if len(param_config["choices"]) == 0: - raise ValueError( - f"'choices' for '{param_name}' must contain at least one option" - )