Merge pull request #69 from bigbio/dev

ypriverol · web-flow · commit 3a1d02077e25 · 2026-03-21T11:07:53.000Z
New version: updates SDRF validation and removes validation of plain-text (expdesign) files
diff --git a/Dockerfile.dev b/Dockerfile.dev
@@ -0,0 +1,5 @@
+FROM python:3.11-slim
+RUN apt-get update && apt-get install -y --no-install-recommends git procps libglib2.0-0t64 && rm -rf /var/lib/apt/lists/*
+WORKDIR /src
+COPY . .
+RUN pip install --no-cache-dir .
diff --git a/pyproject.toml b/pyproject.toml
@@ -31,7 +31,7 @@ packages = [
 [tool.poetry.dependencies]
 python = "*"
 click = "*"
-sdrf-pipelines = "==0.0.33"
+sdrf-pipelines = ">=0.1.1"
 pyopenms = ">=3.3.0"
 pandas = "*"
 pyarrow = ">=16.1.0"
diff --git a/quantmsutils/diann/dianncfg.py b/quantmsutils/diann/dianncfg.py
@@ -9,7 +9,7 @@
 from typing import List, Tuple
 from collections import defaultdict
 import click
-from sdrf_pipelines.openms.unimod import UnimodDatabase
+from sdrf_pipelines.converters.openms.unimod import UnimodDatabase
 
 logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG)
 logger = logging.getLogger(__name__)
diff --git a/quantmsutils/sdrf/check_samplesheet.py b/quantmsutils/sdrf/check_samplesheet.py
@@ -1,192 +1,134 @@
-# nf-core: Update the script to check the sdrf
-# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
-
-import errno
 import logging
-import os
 import sys
 
 import click
 import pandas as pd
-from sdrf_pipelines.sdrf.sdrf import SdrfDataFrame
-from sdrf_pipelines.sdrf.sdrf_schema import DEFAULT_TEMPLATE, MASS_SPECTROMETRY
+
+from sdrf_pipelines.sdrf.sdrf import read_sdrf
 
 logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 
-
-def make_dir(path):
-    if len(path) > 0:
-        try:
-            os.makedirs(path)
-        except OSError as exception:
-            if exception.errno != errno.EEXIST:
-                raise exception
-
-
-def print_error(error, context="Line", context_str=""):
-    error_str = "ERROR: Please check samplesheet -> {}".format(error)
-    if context != "" and context_str != "":
-        error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
-            error, context.strip(), context_str.strip()
-        )
-    print(error_str)
-    sys.exit(1)
+# Minimal columns required to run quantms/quantmsdiann pipelines.
+# These are checked in --minimal mode instead of full schema validation.
+MINIMAL_REQUIRED_COLUMNS = [
+    "source name",
+    "assay name",
+    "comment[data file]",
+    "comment[label]",
+    "comment[cleavage agent details]",
+    "comment[instrument]",
+    "comment[proteomics data acquisition method]",
+    "technology type",
+]
+
+# Recommended columns: warn if missing but don't fail
+MINIMAL_RECOMMENDED_COLUMNS = [
+    "comment[precursor mass tolerance]",
+    "comment[fragment mass tolerance]",
+    "comment[dissociation method]",
+    "comment[technical replicate]",
+    "comment[fraction identifier]",
+]
 
 
 def check_sdrf(
     input_sdrf: str,
-    skip_ms_validation: bool = False,
-    skip_factor_validation: bool = False,
-    skip_experimental_design_validation: bool = False,
+    template: str = "ms-proteomics",
+    minimal: bool = False,
     use_ols_cache_only: bool = False,
-    skip_sdrf_validation: bool = False,
 ):
     """
-    Check the SDRF file for errors. If any errors are found, print them and exit with a non-zero status code.
-    @param input_sdrf: Path to the SDRF file to check
-    @param skip_ms_validation: Disable the validation of mass spectrometry fields in SDRF (e.g. posttranslational modifications)
-    @param skip_factor_validation: Disable the validation of factor values in SDRF
-    @param skip_experimental_design_validation: Disable the validation of experimental design
-    @param use_ols_cache_only: Use ols cache for validation of the terms and not OLS internet service
-    @param skip_sdrf_validation: Disable the validation of SDRF
-    """
-    if skip_sdrf_validation:
-        print("No SDRF validation was performed.")
-        sys.exit(0)
-
-    df = SdrfDataFrame.parse(input_sdrf)
-    errors = df.validate(DEFAULT_TEMPLATE, use_ols_cache_only)
-
-    if not skip_ms_validation:
-        errors = errors + df.validate(MASS_SPECTROMETRY, use_ols_cache_only)
+    Check the SDRF file for errors.
 
-    if not skip_factor_validation:
-        errors = errors + df.validate_factor_values()
-
-    if not skip_experimental_design_validation:
-        errors = errors + df.validate_experimental_design()
+    :param input_sdrf: Path to the SDRF file to check
+    :param template: Schema template for full validation (e.g. 'ms-proteomics', 'dia-acquisition')
+    :param minimal: Only validate columns required to run the pipeline (skip organism, etc.)
+    :param use_ols_cache_only: Use OLS cache instead of live OLS service
+    """
+    if minimal:
+        errors = _validate_minimal(input_sdrf)
+    else:
+        df = read_sdrf(input_sdrf)
+        errors = df.validate_sdrf(
+            template=template,
+            use_ols_cache_only=use_ols_cache_only,
+        )
 
     for error in errors:
         print(error)
 
     sys.exit(bool(errors))
 
 
-def check_expdesign(expdesign):
-    """
-    Check the expdesign file for errors. If any errors are found, print them and exit with a non-zero status code.
-    @param expdesign: Path to the expdesign file to check
-    """
-    data = pd.read_csv(expdesign, sep="\t", header=0, dtype=str)
-    data = data.dropna()
-    schema_file = ["Fraction_Group", "Fraction", "Spectra_Filepath", "Label", "Sample"]
-    schema_sample = ["Sample", "MSstats_Condition", "MSstats_BioReplicate"]
-
-    # check table format: two table
-    with open(expdesign, "r") as f:
-        lines = f.readlines()
-        try:
-            empty_row = lines.index("\n")
-        except ValueError:
-            print(
-                "the one-table format parser is broken in OpenMS2.5, please use one-table or sdrf"
-            )
-            sys.exit(1)
-
-        s_table = [i.replace("\n", "").split("\t") for i in lines[empty_row + 1 :]][1:]
-        s_header = lines[empty_row + 1].replace("\n", "").split("\t")
-        s_data_frame = pd.DataFrame(s_table, columns=s_header)
-
-    # check missed mandatory column
-    missed_columns = set(schema_file) - set(data.columns)
-    if len(missed_columns) != 0:
-        print("{0} column missed".format(" ".join(missed_columns)))
-        sys.exit(1)
-
-    missed_columns = set(schema_sample) - set(s_data_frame.columns)
-    if len(missed_columns) != 0:
-        print("{0} column missed".format(" ".join(missed_columns)))
-        sys.exit(1)
+def _validate_minimal(input_sdrf: str) -> list[str]:
+    """Validate only the columns required to run the pipeline.
 
-    if len(set(data.Label)) != 1 and "MSstats_Mixture" not in s_data_frame.columns:
-        print("MSstats_Mixture column missed in ISO experiments")
-        sys.exit(1)
-
-    # check logical problem: may be improved
-    check_expdesign_logic(data, s_data_frame)
+    Returns a list of error strings. A header-only file, a missing required column, or no
+    modification parameters column is an error; missing recommended columns only log warnings.
+    """
+    df_header = pd.read_csv(input_sdrf, sep="\t", nrows=0)
+    columns_lower = [c.lower() for c in df_header.columns]
+    errors = []
+
+    # Reject header-only files
+    df_rows = pd.read_csv(input_sdrf, sep="\t", nrows=1)
+    if len(df_rows) == 0:
+        errors.append("ERROR: SDRF file contains a header but no data rows.")
+        return errors
+
+    # Check required columns (case-insensitive)
+    for col in MINIMAL_REQUIRED_COLUMNS:
+        if col.lower() not in columns_lower:
+            errors.append(f"ERROR: Required column '{col}' is missing from the SDRF file.")
+
+    # Check at least one modification parameters column exists
+    has_mod_col = any(c.startswith("comment[modification parameters") for c in columns_lower)
+    if not has_mod_col:
+        errors.append(
+            "ERROR: At least one 'comment[modification parameters]' column is required."
+        )
 
+    # Warn about recommended columns (non-blocking)
+    for col in MINIMAL_RECOMMENDED_COLUMNS:
+        if col.lower() not in columns_lower:
+            logger.warning(
+                f"Recommended column '{col}' is missing. Pipeline will use default parameters."
+            )
 
-def check_expdesign_logic(f_table, s_table):
-    fg_ints = f_table["Fraction_Group"].astype(int)
-    if fg_ints.max() > fg_ints.nunique():
-        print("Fraction_Group discontinuous!")
-        sys.exit(1)
-    f_table_d = f_table.drop_duplicates(["Fraction_Group", "Fraction", "Label", "Sample"])
-    if f_table_d.shape[0] < f_table.shape[0]:
-        print("Existing duplicate entries in Fraction_Group, Fraction, Label and Sample")
-        sys.exit(1)
-    if len(set(s_table.Sample)) < s_table.shape[0]:
-        print("Existing duplicate Sample in sample table!")
-        sys.exit(1)
+    return errors
 
 
 @click.command(
     "checksamplesheet",
-    short_help="Reformat nf-core/quantms sdrf file and check its contents.",
+    short_help="Validate an SDRF file for quantms pipelines.",
 )
-@click.option("--exp_design", help="SDRF/Expdesign file to be validated")
-@click.option("--is_sdrf", help="SDRF file or Expdesign file", is_flag=True)
-@click.option("--skip_sdrf_validation", help="Disable the validation of SDRF", is_flag=True)
+@click.option("--exp_design", help="SDRF file to be validated", required=True)
 @click.option(
-    "--skip_ms_validation",
-    help="Disable the validation of mass spectrometry fields in SDRF (e.g. posttranslational modifications)",
-    is_flag=True,
+    "--template", "-t",
+    help="Schema template for full validation (e.g. ms-proteomics, dia-acquisition)",
+    default="ms-proteomics",
 )
 @click.option(
-    "--skip_factor_validation",
-    help="Disable the validation of factor values in SDRF",
-    is_flag=True,
-)
-@click.option(
-    "--skip_experimental_design_validation",
-    help="Disable the validation of experimental design",
+    "--minimal",
+    help="Only validate columns required to run the pipeline (skip organism, metadata, etc.)",
     is_flag=True,
 )
 @click.option(
     "--use_ols_cache_only",
-    help="Use ols cache for validation of the terms and not OLS internet service",
+    help="Use OLS cache for ontology validation instead of the live OLS service",
     is_flag=True,
 )
 def checksamplesheet(
     exp_design: str,
-    is_sdrf: bool = False,
-    skip_sdrf_validation: bool = False,
-    skip_ms_validation: bool = False,
-    skip_factor_validation: bool = False,
-    skip_experimental_design_validation: bool = False,
+    template: str = "ms-proteomics",
+    minimal: bool = False,
     use_ols_cache_only: bool = False,
 ):
-    """
-    Reformat nf-core/quantms sdrf file and check its contents.
-    @param exp_design: SDRF/Expdesign file to be validated
-    @param is_sdrf: SDRF file or Expdesign file
-    @param skip_sdrf_validation: Disable the validation of SDRF
-    @param skip_ms_validation: Disable the validation of mass spectrometry fields in SDRF (e.g. posttranslational modifications)
-    @param skip_factor_validation: Disable the validation of factor values in SDRF
-    @param skip_experimental_design_validation: Disable the validation of experimental design
-    @param use_ols_cache_only: Use ols cache for validation of the terms and not OLS internet service
-
-    """
-    # TODO validate expdesign file
-    if is_sdrf:
-        check_sdrf(
-            input_sdrf=exp_design,
-            skip_sdrf_validation=skip_sdrf_validation,
-            skip_ms_validation=skip_ms_validation,
-            skip_factor_validation=skip_factor_validation,
-            skip_experimental_design_validation=skip_experimental_design_validation,
-            use_ols_cache_only=use_ols_cache_only,
-        )
-    else:
-        check_expdesign(exp_design)
+    """Validate an SDRF file for quantms pipelines."""
+    check_sdrf(
+        input_sdrf=exp_design,
+        template=template,
+        minimal=minimal,
+        use_ols_cache_only=use_ols_cache_only,
+    )
diff --git a/recipe/meta.yaml b/recipe/meta.yaml
@@ -1,7 +1,7 @@
 # recipe/meta.yaml
 package:
   name: quantms-utils
-  version: "0.0.25"
+  version: "0.0.26"
 
 source:
   path: ../
@@ -20,19 +20,16 @@ requirements:
     - python
     - pip
     - poetry-core >=1.2.0
-    - setuptools <78
+
   run:
     - python >=3.9,<3.13
     - click
-    - setuptools <78
-    - sdrf-pipelines >=0.0.33,<0.1.0
+    - sdrf-pipelines >=0.1.1
     - pyopenms>=3.3.0
     - pandas
     - pyarrow>=16.1.0
     - scipy
 test:
-  requires:
-    - setuptools <78
   imports:
     - quantmsutils
   commands:
diff --git a/tests/test_commands.py b/tests/test_commands.py
diff --git a/tests/test_data/diann2msstats/PXD026600_diann_design.tsv b/tests/test_data/diann2msstats/PXD026600_diann_design.tsv