2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -35,7 +35,7 @@ repos:

# Ruff for linting and formatting Python files
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.15.4
rev: v0.15.5
hooks:
- id: ruff-check
args: ["--fix"]
106 changes: 90 additions & 16 deletions notebooks/0.download-data/1.download-data.ipynb
@@ -7,7 +7,7 @@
"source": [
"# Downloading Single-Cell Profiles\n",
"\n",
"This notebook focuses on downloading metadata and single-cell profiles from three key datasets:\n",
"This notebook downloads metadata and single-cell profiles from three key datasets:\n",
"\n",
"1. **CPJUMP1 Pilot Dataset** ([link](https://github.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1)): Metadata is downloaded and processed to identify and organize plates containing wells treated with compound perturbations for downstream analysis.\n",
"2. **MitoCheck Dataset**: Normalized and feature-selected single-cell profiles are downloaded for further analysis.\n",
@@ -74,7 +74,7 @@
"outputs": [],
"source": [
"# setting config path\n",
"config_path = pathlib.Path(\"../nb-configs.yaml\").resolve(strict=True)\n",
"config_path = pathlib.Path(\"dl-configs.yaml\").resolve(strict=True)\n",
"\n",
"# setting a data directory\n",
"data_dir = pathlib.Path(\"./data\").resolve()\n",
@@ -114,7 +114,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "5b8bfe5f",
"metadata": {},
"outputs": [
@@ -126,15 +126,15 @@
"Series: 'Assay_Plate_Barcode' [str]\n",
"[\n",
"\t\"BR00117054\"\n",
"\t\"BR00117012\"\n",
"\t\"BR00117008\"\n",
"\t\"BR00117016\"\n",
"\t\"BR00117055\"\n",
"\t…\n",
"\t\"BR00117010\"\n",
"\t\"BR00117009\"\n",
"\t\"BR00117011\"\n",
"\t…\n",
"\t\"BR00117013\"\n",
"\t\"BR00117010\"\n",
"\t\"BR00117017\"\n",
"\t\"BR00117008\"\n",
"\t\"BR00117012\"\n",
"\t\"BR00117015\"\n",
"\t\"BR00117019\"\n",
"]\n",
"shape: (12, 13)\n"
@@ -241,6 +241,80 @@
"exp_metadata"
]
},
{
"cell_type": "markdown",
"id": "a4665c17",
"metadata": {},
"source": [
"\n",
"In this section, we download:\n",
"\n",
"1. **Compound metadata** from the CPJUMP1 repository \n",
"2. **Mechanism of action (MOA) metadata** from the Broad Repurposing Hub\n",
"\n",
"We then merge both datasets into a single compound metadata table.\n",
"\n",
"If a compound has missing MOA information, the value in `Metadata_moa` is replaced with `\"unknown\"`. This indicates that no MOA annotation is currently available for that compound."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "22e417e3",
"metadata": {},
"outputs": [],
"source": [
"# downloading compound metadata from cpjump1 repo\n",
"CPJUMP_compound_metadata = pl.read_csv(\n",
" nb_configs[\"links\"][\"CPJUMP1-compound-metadata-source\"],\n",
" separator=\"\\t\",\n",
" has_header=True,\n",
" encoding=\"utf-8\",\n",
")\n",
"\n",
"# downloading compound moa metadata from broad institute drug repurposing hub\n",
"broad_compound_moa_metadata = pl.read_csv(\n",
" nb_configs[\"links\"][\"Broad-compounds-moa-source\"],\n",
" separator=\"\\t\",\n",
" skip_rows=9,\n",
" encoding=\"utf8-lossy\",\n",
")\n",
"\n",
"# for both dataframes make sure that all columns have \"Metadata_\" in the column name\n",
"CPJUMP_compound_metadata = CPJUMP_compound_metadata.rename(\n",
" {col: f\"Metadata_{col}\" for col in CPJUMP_compound_metadata.columns}\n",
")\n",
"broad_compound_moa_metadata = broad_compound_moa_metadata.rename(\n",
" {col: f\"Metadata_{col}\" for col in broad_compound_moa_metadata.columns}\n",
")\n",
"\n",
"# replace null values in the broad compound moa with \"unknown\"\n",
"broad_compound_moa_metadata = broad_compound_moa_metadata.with_columns(\n",
" pl.col(\"Metadata_moa\").fill_null(\"unknown\")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "01db7db8",
"metadata": {},
"outputs": [],
"source": [
"complete_compound_metadata = CPJUMP_compound_metadata.join(\n",
" broad_compound_moa_metadata,\n",
" left_on=\"Metadata_pert_iname\",\n",
" right_on=\"Metadata_pert_iname\",\n",
" how=\"left\",\n",
")\n",
"\n",
"\n",
"# save the complete compound metadata as a tsv file\n",
"complete_compound_metadata.write_csv(\n",
" cpjump1_dir / f\"cpjump1_{pert_type}_compound-metadata.tsv\", separator=\"\\t\"\n",
")"
]
},
{
"cell_type": "markdown",
"id": "7021b414",
@@ -255,7 +329,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"id": "06783224",
"metadata": {},
"outputs": [
@@ -284,16 +358,16 @@
"source": [
"## Downloading CFReT Data\n",
"\n",
"In this section, we download feature-selected single-cell profiles from the CFReT plate `localhost230405150001`. This plate contains three treatments: DMSO (control), drug_x, and TGFRi. The dataset consists of high-content imaging data that has already undergone feature selection, making it suitable for downstream analysis.\n",
"This section downloads and saves feature-selected single-cell profiles from the CFReT plate `localhost230405150001`.\n",
"\n",
"**Key Points:**\n",
"- Only the processed single-cell profiles are downloaded [here](https://github.com/WayScience/cellpainting_predicts_cardiac_fibrosis/tree/main/3.process_cfret_features/data/single_cell_profiles)\n",
"- The CFReT dataset was used and published in [this study](https://doi.org/10.1161/CIRCULATIONAHA.124.071956)."
"- Only processed single-cell profiles are downloaded (no raw data).\n",
"- Data is saved as a Parquet file for fast access.\n",
"- Used in published cardiac fibrosis research ([study link](https://doi.org/10.1161/CIRCULATIONAHA.124.071956))."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"id": "4d9fd47c",
"metadata": {},
"outputs": [
@@ -344,7 +418,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
"version": "3.12.9"
}
},
"nbformat": 4,
91 changes: 22 additions & 69 deletions notebooks/0.download-data/2.preprocessing.ipynb
@@ -31,13 +31,12 @@
"import sys\n",
"import json\n",
"import pathlib\n",
"from typing import Optional\n",
"\n",
"import polars as pl\n",
"\n",
"sys.path.append(\"../../\")\n",
"from utils.data_utils import split_meta_and_features, add_cell_id_hash\n",
"from utils.io_utils import load_profiles"
"from utils.io_utils import load_and_concat_profiles"
]
},
{
@@ -57,64 +56,6 @@
"metadata": {},
"outputs": [],
"source": [
"def load_and_concat_profiles(\n",
" profile_dir: str | pathlib.Path,\n",
" shared_features: Optional[list[str]] = None,\n",
" specific_plates: Optional[list[pathlib.Path]] = None,\n",
") -> pl.DataFrame:\n",
" \"\"\"\n",
" Load all profile files from a directory and concatenate them into a single Polars DataFrame.\n",
"\n",
" Parameters\n",
" ----------\n",
" profile_dir : str or pathlib.Path\n",
" Directory containing the profile files (.parquet).\n",
" shared_features : Optional[list[str]], optional\n",
" List of shared feature names to filter the profiles. If None, all features are loaded.\n",
" specific_plates : Optional[list[pathlib.Path]], optional\n",
" List of specific plate file paths to load. If None, all profiles in the directory are loaded.\n",
"\n",
" Returns\n",
" -------\n",
" pl.DataFrame\n",
" Concatenated Polars DataFrame containing all loaded profiles.\n",
" \"\"\"\n",
" # Ensure profile_dir is a pathlib.Path\n",
" if isinstance(profile_dir, str):\n",
" profile_dir = pathlib.Path(profile_dir)\n",
" elif not isinstance(profile_dir, pathlib.Path):\n",
" raise TypeError(\"profile_dir must be a string or a pathlib.Path object\")\n",
"\n",
" # Validate specific_plates\n",
" if specific_plates is not None:\n",
" if not isinstance(specific_plates, list):\n",
" raise TypeError(\"specific_plates must be a list of pathlib.Path objects\")\n",
" if not all(isinstance(path, pathlib.Path) for path in specific_plates):\n",
" raise TypeError(\n",
" \"All elements in specific_plates must be pathlib.Path objects\"\n",
" )\n",
"\n",
" # Use specific_plates if provided, otherwise gather all .parquet files\n",
" if specific_plates is not None:\n",
" # Validate that all specific plate files exist\n",
" for plate_path in specific_plates:\n",
" if not plate_path.exists():\n",
" raise FileNotFoundError(f\"Profile file not found: {plate_path}\")\n",
" files_to_load = specific_plates\n",
" else:\n",
" files_to_load = list(profile_dir.glob(\"*.parquet\"))\n",
" if not files_to_load:\n",
" raise FileNotFoundError(f\"No profile files found in {profile_dir}\")\n",
"\n",
" # Load and concatenate profiles\n",
" loaded_profiles = [\n",
" load_profiles(f, shared_features=shared_features) for f in files_to_load\n",
" ]\n",
"\n",
" # Concatenate all loaded profiles\n",
" return pl.concat(loaded_profiles, rechunk=True)\n",
"\n",
"\n",
"def split_data(\n",
" pycytominer_output: pl.DataFrame, dataset: str = \"CP_and_DP\"\n",
") -> pl.DataFrame:\n",
@@ -199,16 +140,17 @@
"# Setting profiles directory\n",
"profiles_dir = (data_dir / \"sc-profiles\").resolve(strict=True)\n",
"\n",
"# setting connectivity map drug repurposing config\n",
"drug_repurposing_config_path = (data_dir / \"repurposing_drugs_20180907.txt\").resolve(\n",
" strict=True\n",
")\n",
"\n",
"# Experimental metadata\n",
"exp_metadata_path = (\n",
" profiles_dir / \"cpjump1\" / \"cpjump1_compound_experimental-metadata.csv\"\n",
").resolve(strict=True)\n",
"\n",
"# cpjump1 compound metadata\n",
"cmp_metadata_path = (\n",
" profiles_dir / \"cpjump1\" / \"cpjump1_compound_compound-metadata.tsv\"\n",
").resolve(strict=True)\n",
"\n",
"# Setting CFReT profiles directory\n",
"cfret_profiles_dir = (profiles_dir / \"cfret\").resolve(strict=True)\n",
"cfret_profiles_path = (\n",
@@ -321,7 +263,7 @@
"id": "3df9bbf5",
"metadata": {},
"source": [
"Next we annotate the compound treatments in the CPJUMP1 dataset, we annotate each cell with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.\n"
"Next, we annotate the compound treatments in the CPJUMP1 dataset: each row is annotated with Mechanism of Action (MoA) information from the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP), as well as its cell type. This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.\n"
]
},
{
@@ -333,14 +275,24 @@
"source": [
"# load drug repurposing moa file and add prefix to metadata columns\n",
"rep_moa_df = pl.read_csv(\n",
" drug_repurposing_config_path, separator=\"\\t\", skip_rows=9, encoding=\"utf8-lossy\"\n",
").rename(lambda x: f\"Metadata_{x}\" if not x.startswith(\"Metadata_\") else x)\n",
" cmp_metadata_path,\n",
" separator=\"\\t\",\n",
" columns=[\"Metadata_pert_iname\", \"Metadata_target\", \"Metadata_moa\"],\n",
").unique(subset=[\"Metadata_pert_iname\"])\n",
"\n",
"# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname\n",
"cpjump1_profiles = cpjump1_profiles.join(\n",
" rep_moa_df, on=\"Metadata_pert_iname\", how=\"left\"\n",
")\n",
"\n",
"# merge cell type metadata with cpjump1_profiles on Metadata_Plate\n",
"cell_type_metadata = exp_metadata.select([\"Assay_Plate_Barcode\", \"Cell_type\"]).rename(\n",
" {\"Assay_Plate_Barcode\": \"Metadata_Plate\", \"Cell_type\": \"Metadata_cell_type\"}\n",
")\n",
"cpjump1_profiles = cpjump1_profiles.join(\n",
" cell_type_metadata, on=\"Metadata_Plate\", how=\"left\"\n",
")\n",
"\n",
"# split meta and feature\n",
"meta_cols, features_cols = split_meta_and_features(cpjump1_profiles)\n",
"\n",
Expand All @@ -364,7 +316,7 @@
},
{
"cell_type": "markdown",
"id": "4a0ba6ad",
"id": "92bacbc9",
"metadata": {},
"source": [
"## Preprocessing MitoCheck Dataset\n",
@@ -582,6 +534,7 @@
"# load in cfret profiles and add a unique cell ID\n",
"cfret_profiles = pl.read_parquet(cfret_profiles_path)\n",
"\n",
"\n",
"# adding a unique cell ID based on all features\n",
"cfret_profiles = add_cell_id_hash(cfret_profiles, force=True)\n",
"\n",
@@ -623,7 +576,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
"version": "3.12.9"
}
},
"nbformat": 4,
16 changes: 3 additions & 13 deletions notebooks/0.download-data/3.subset-jump-controls.ipynb
@@ -142,13 +142,8 @@
"metadata": {},
"outputs": [],
"source": [
"# setting data path\n",
"data_dir = pathlib.Path(\"../0.download-data/data\").resolve(strict=True)\n",
"download_module_results_dir = pathlib.Path(\"../0.download-data/results\").resolve(\n",
" strict=True\n",
")\n",
"\n",
"# setting directory where all the single-cell profiles are stored\n",
"data_dir = pathlib.Path.cwd() / \"data\"\n",
"profiles_dir = (data_dir / \"sc-profiles\").resolve(strict=True)\n",
"\n",
"cpjump1_data_path = (\n",
@@ -161,11 +156,6 @@
" profiles_dir / \"cpjump1\" / \"feature_selected_sc_qc_features.json\"\n",
").resolve(strict=True)\n",
"\n",
"# setting cpjump1 data dir\n",
"cpjump_crispr_data_dir = (data_dir / \"sc-profiles\" / \"cpjump1-crispr-negcon\").resolve()\n",
"cpjump_crispr_data_dir.mkdir(exist_ok=True)\n",
"\n",
"\n",
"# setting negative control\n",
"negcon_data_dir = (profiles_dir / \"cpjump1\" / \"negcon\").resolve()\n",
"negcon_data_dir.mkdir(exist_ok=True)\n",
@@ -224,7 +214,7 @@
"\n",
" # save the file\n",
" subsampled_df.write_parquet(\n",
" negcon_data_dir / f\"cpjump1_crispr_negcon_seed{seed_val}.parquet\"\n",
" negcon_data_dir / f\"cpjump1_compound_negcon_seed{seed_val}.parquet\"\n",
" )"
]
},
@@ -268,7 +258,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
"version": "3.12.9"
}
},
"nbformat": 4,
8 changes: 8 additions & 0 deletions notebooks/0.download-data/dl-configs.yaml
@@ -0,0 +1,8 @@
links:
MitoCheck-profiles-source: https://zenodo.org/records/7967386/files/3.normalize_data__normalized_data.zip?download=1
CFReT-profiles-source: https://github.com/WayScience/cellpainting_predicts_cardiac_fibrosis/raw/refs/heads/main/3.process_cfret_features/data/single_cell_profiles/localhost230405150001_sc_feature_selected.parquet?download=
CPJUMP1-experimental-metadata-source: https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/raw/refs/heads/main/benchmark/output/experiment-metadata.tsv
CPJUMP1-compound-metadata-source: https://raw.githubusercontent.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/main/metadata/external_metadata/JUMP-Target-1_compound_metadata_targets.tsv
Broad-compounds-moa-source: https://s3.amazonaws.com/data.clue.io/repurposing/downloads/repurposing_drugs_20180907.txt
CPJUMP-plate-maps-source: https://raw.githubusercontent.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1/refs/heads/main/metadata/platemaps/2020_11_04_CPJUMP1/platemap/JUMP-Target-1_crispr_platemap.txt
CPJUMP1-profiles-source: https://cellpainting-gallery.s3.amazonaws.com/cpg0000-jump-pilot/source_4/workspace/profiles
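The notebooks presumably read this file with a YAML parser and index URLs under the top-level `links` key; a sketch using an inline string in place of the file:

```python
import yaml

# inline stand-in for dl-configs.yaml; the key mirrors the config above
config_text = """\
links:
  CPJUMP1-profiles-source: https://cellpainting-gallery.s3.amazonaws.com/cpg0000-jump-pilot/source_4/workspace/profiles
"""
nb_configs = yaml.safe_load(config_text)

# individual download URLs are then looked up by name
profiles_url = nb_configs["links"]["CPJUMP1-profiles-source"]
```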