Skip to content

Commit 3a1d020

Browse files
authored
Merge pull request #69 from bigbio/dev
New version including changes in SDRF validation, remove validation of simple txt files
2 parents cd0b93f + 45f3cff commit 3a1d020

7 files changed

Lines changed: 225 additions & 158 deletions

File tree

Dockerfile.dev

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
FROM python:3.11-slim
2+
RUN apt-get update && apt-get install -y --no-install-recommends git procps libglib2.0-0t64 && rm -rf /var/lib/apt/lists/*
3+
WORKDIR /src
4+
COPY . .
5+
RUN pip install --no-cache-dir .

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ packages = [
3131
[tool.poetry.dependencies]
3232
python = "*"
3333
click = "*"
34-
sdrf-pipelines = "==0.0.33"
34+
sdrf-pipelines = ">=0.1.1"
3535
pyopenms = ">=3.3.0"
3636
pandas = "*"
3737
pyarrow = ">=16.1.0"

quantmsutils/diann/dianncfg.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from typing import List, Tuple
1010
from collections import defaultdict
1111
import click
12-
from sdrf_pipelines.openms.unimod import UnimodDatabase
12+
from sdrf_pipelines.converters.openms.unimod import UnimodDatabase
1313

1414
logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG)
1515
logger = logging.getLogger(__name__)
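
quantmsutils/sdrf/check_samplesheet.py (file path missing from this extract; presumed, confirm against the commit)

Lines changed: 89 additions & 147 deletions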
Original file line numberDiff line numberDiff line change
@@ -1,192 +1,134 @@
1-
# nf-core: Update the script to check the sdrf
2-
# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
3-
4-
import errno
51
import logging
6-
import os
72
import sys
83

94
import click
105
import pandas as pd
11-
from sdrf_pipelines.sdrf.sdrf import SdrfDataFrame
12-
from sdrf_pipelines.sdrf.sdrf_schema import DEFAULT_TEMPLATE, MASS_SPECTROMETRY
6+
7+
from sdrf_pipelines.sdrf.sdrf import read_sdrf
138

149
logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG)
1510
logger = logging.getLogger(__name__)
1611

17-
18-
def make_dir(path):
19-
if len(path) > 0:
20-
try:
21-
os.makedirs(path)
22-
except OSError as exception:
23-
if exception.errno != errno.EEXIST:
24-
raise exception
25-
26-
27-
def print_error(error, context="Line", context_str=""):
28-
error_str = "ERROR: Please check samplesheet -> {}".format(error)
29-
if context != "" and context_str != "":
30-
error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
31-
error, context.strip(), context_str.strip()
32-
)
33-
print(error_str)
34-
sys.exit(1)
12+
# Minimal columns required to run quantms/quantmsdiann pipelines.
13+
# These are checked in --minimal mode instead of full schema validation.
14+
MINIMAL_REQUIRED_COLUMNS = [
15+
"source name",
16+
"assay name",
17+
"comment[data file]",
18+
"comment[label]",
19+
"comment[cleavage agent details]",
20+
"comment[instrument]",
21+
"comment[proteomics data acquisition method]",
22+
"technology type",
23+
]
24+
25+
# Recommended columns: warn if missing but don't fail
26+
MINIMAL_RECOMMENDED_COLUMNS = [
27+
"comment[precursor mass tolerance]",
28+
"comment[fragment mass tolerance]",
29+
"comment[dissociation method]",
30+
"comment[technical replicate]",
31+
"comment[fraction identifier]",
32+
]
3533

3634

3735
def check_sdrf(
3836
input_sdrf: str,
39-
skip_ms_validation: bool = False,
40-
skip_factor_validation: bool = False,
41-
skip_experimental_design_validation: bool = False,
37+
template: str = "ms-proteomics",
38+
minimal: bool = False,
4239
use_ols_cache_only: bool = False,
43-
skip_sdrf_validation: bool = False,
4440
):
4541
"""
46-
Check the SDRF file for errors. If any errors are found, print them and exit with a non-zero status code.
47-
@param input_sdrf: Path to the SDRF file to check
48-
@param skip_ms_validation: Disable the validation of mass spectrometry fields in SDRF (e.g. posttranslational modifications)
49-
@param skip_factor_validation: Disable the validation of factor values in SDRF
50-
@param skip_experimental_design_validation: Disable the validation of experimental design
51-
@param use_ols_cache_only: Use ols cache for validation of the terms and not OLS internet service
52-
@param skip_sdrf_validation: Disable the validation of SDRF
53-
"""
54-
if skip_sdrf_validation:
55-
print("No SDRF validation was performed.")
56-
sys.exit(0)
57-
58-
df = SdrfDataFrame.parse(input_sdrf)
59-
errors = df.validate(DEFAULT_TEMPLATE, use_ols_cache_only)
60-
61-
if not skip_ms_validation:
62-
errors = errors + df.validate(MASS_SPECTROMETRY, use_ols_cache_only)
42+
Check the SDRF file for errors.
6343
64-
if not skip_factor_validation:
65-
errors = errors + df.validate_factor_values()
66-
67-
if not skip_experimental_design_validation:
68-
errors = errors + df.validate_experimental_design()
44+
:param input_sdrf: Path to the SDRF file to check
45+
:param template: Schema template for full validation (e.g. 'ms-proteomics', 'dia-acquisition')
46+
:param minimal: Only validate columns required to run the pipeline (skip organism, etc.)
47+
:param use_ols_cache_only: Use OLS cache instead of live OLS service
48+
"""
49+
if minimal:
50+
errors = _validate_minimal(input_sdrf)
51+
else:
52+
df = read_sdrf(input_sdrf)
53+
errors = df.validate_sdrf(
54+
template=template,
55+
use_ols_cache_only=use_ols_cache_only,
56+
)
6957

7058
for error in errors:
7159
print(error)
7260

7361
sys.exit(bool(errors))
7462

7563

76-
def check_expdesign(expdesign):
77-
"""
78-
Check the expdesign file for errors. If any errors are found, print them and exit with a non-zero status code.
79-
@param expdesign: Path to the expdesign file to check
80-
"""
81-
data = pd.read_csv(expdesign, sep="\t", header=0, dtype=str)
82-
data = data.dropna()
83-
schema_file = ["Fraction_Group", "Fraction", "Spectra_Filepath", "Label", "Sample"]
84-
schema_sample = ["Sample", "MSstats_Condition", "MSstats_BioReplicate"]
85-
86-
# check table format: two table
87-
with open(expdesign, "r") as f:
88-
lines = f.readlines()
89-
try:
90-
empty_row = lines.index("\n")
91-
except ValueError:
92-
print(
93-
"the one-table format parser is broken in OpenMS2.5, please use one-table or sdrf"
94-
)
95-
sys.exit(1)
96-
97-
s_table = [i.replace("\n", "").split("\t") for i in lines[empty_row + 1 :]][1:]
98-
s_header = lines[empty_row + 1].replace("\n", "").split("\t")
99-
s_data_frame = pd.DataFrame(s_table, columns=s_header)
100-
101-
# check missed mandatory column
102-
missed_columns = set(schema_file) - set(data.columns)
103-
if len(missed_columns) != 0:
104-
print("{0} column missed".format(" ".join(missed_columns)))
105-
sys.exit(1)
106-
107-
missed_columns = set(schema_sample) - set(s_data_frame.columns)
108-
if len(missed_columns) != 0:
109-
print("{0} column missed".format(" ".join(missed_columns)))
110-
sys.exit(1)
64+
def _validate_minimal(input_sdrf: str) -> list[str]:
65+
"""Validate only the columns required to run the pipeline.
11166
112-
if len(set(data.Label)) != 1 and "MSstats_Mixture" not in s_data_frame.columns:
113-
print("MSstats_Mixture column missed in ISO experiments")
114-
sys.exit(1)
115-
116-
# check logical problem: may be improved
117-
check_expdesign_logic(data, s_data_frame)
67+
Returns a list of error strings. Errors are produced for a header-only file,
68+
a missing required column, or no 'comment[modification parameters...]' column; missing recommended columns only log warnings (non-blocking).
69+
"""
70+
df_header = pd.read_csv(input_sdrf, sep="\t", nrows=0)
71+
columns_lower = [c.lower() for c in df_header.columns]
72+
errors = []
73+
74+
# Reject header-only files
75+
df_rows = pd.read_csv(input_sdrf, sep="\t", nrows=1)
76+
if len(df_rows) == 0:
77+
errors.append("ERROR: SDRF file contains a header but no data rows.")
78+
return errors
79+
80+
# Check required columns (case-insensitive)
81+
for col in MINIMAL_REQUIRED_COLUMNS:
82+
if col.lower() not in columns_lower:
83+
errors.append(f"ERROR: Required column '{col}' is missing from the SDRF file.")
84+
85+
# Check at least one modification parameters column exists
86+
has_mod_col = any(c.startswith("comment[modification parameters") for c in columns_lower)
87+
if not has_mod_col:
88+
errors.append(
89+
"ERROR: At least one 'comment[modification parameters]' column is required."
90+
)
11891

92+
# Warn about recommended columns (non-blocking)
93+
for col in MINIMAL_RECOMMENDED_COLUMNS:
94+
if col.lower() not in columns_lower:
95+
logger.warning(
96+
f"Recommended column '{col}' is missing. Pipeline will use default parameters."
97+
)
11998

120-
def check_expdesign_logic(f_table, s_table):
121-
fg_ints = f_table["Fraction_Group"].astype(int)
122-
if fg_ints.max() > fg_ints.nunique():
123-
print("Fraction_Group discontinuous!")
124-
sys.exit(1)
125-
f_table_d = f_table.drop_duplicates(["Fraction_Group", "Fraction", "Label", "Sample"])
126-
if f_table_d.shape[0] < f_table.shape[0]:
127-
print("Existing duplicate entries in Fraction_Group, Fraction, Label and Sample")
128-
sys.exit(1)
129-
if len(set(s_table.Sample)) < s_table.shape[0]:
130-
print("Existing duplicate Sample in sample table!")
131-
sys.exit(1)
99+
return errors
132100

133101

134102
@click.command(
135103
"checksamplesheet",
136-
short_help="Reformat nf-core/quantms sdrf file and check its contents.",
104+
short_help="Validate an SDRF file for quantms pipelines.",
137105
)
138-
@click.option("--exp_design", help="SDRF/Expdesign file to be validated")
139-
@click.option("--is_sdrf", help="SDRF file or Expdesign file", is_flag=True)
140-
@click.option("--skip_sdrf_validation", help="Disable the validation of SDRF", is_flag=True)
106+
@click.option("--exp_design", help="SDRF file to be validated", required=True)
141107
@click.option(
142-
"--skip_ms_validation",
143-
help="Disable the validation of mass spectrometry fields in SDRF (e.g. posttranslational modifications)",
144-
is_flag=True,
108+
"--template", "-t",
109+
help="Schema template for full validation (e.g. ms-proteomics, dia-acquisition)",
110+
default="ms-proteomics",
145111
)
146112
@click.option(
147-
"--skip_factor_validation",
148-
help="Disable the validation of factor values in SDRF",
149-
is_flag=True,
150-
)
151-
@click.option(
152-
"--skip_experimental_design_validation",
153-
help="Disable the validation of experimental design",
113+
"--minimal",
114+
help="Only validate columns required to run the pipeline (skip organism, metadata, etc.)",
154115
is_flag=True,
155116
)
156117
@click.option(
157118
"--use_ols_cache_only",
158-
help="Use ols cache for validation of the terms and not OLS internet service",
119+
help="Use OLS cache for ontology validation instead of the live OLS service",
159120
is_flag=True,
160121
)
161122
def checksamplesheet(
162123
exp_design: str,
163-
is_sdrf: bool = False,
164-
skip_sdrf_validation: bool = False,
165-
skip_ms_validation: bool = False,
166-
skip_factor_validation: bool = False,
167-
skip_experimental_design_validation: bool = False,
124+
template: str = "ms-proteomics",
125+
minimal: bool = False,
168126
use_ols_cache_only: bool = False,
169127
):
170-
"""
171-
Reformat nf-core/quantms sdrf file and check its contents.
172-
@param exp_design: SDRF/Expdesign file to be validated
173-
@param is_sdrf: SDRF file or Expdesign file
174-
@param skip_sdrf_validation: Disable the validation of SDRF
175-
@param skip_ms_validation: Disable the validation of mass spectrometry fields in SDRF (e.g. posttranslational modifications)
176-
@param skip_factor_validation: Disable the validation of factor values in SDRF
177-
@param skip_experimental_design_validation: Disable the validation of experimental design
178-
@param use_ols_cache_only: Use ols cache for validation of the terms and not OLS internet service
179-
180-
"""
181-
# TODO validate expdesign file
182-
if is_sdrf:
183-
check_sdrf(
184-
input_sdrf=exp_design,
185-
skip_sdrf_validation=skip_sdrf_validation,
186-
skip_ms_validation=skip_ms_validation,
187-
skip_factor_validation=skip_factor_validation,
188-
skip_experimental_design_validation=skip_experimental_design_validation,
189-
use_ols_cache_only=use_ols_cache_only,
190-
)
191-
else:
192-
check_expdesign(exp_design)
128+
"""Validate an SDRF file for quantms pipelines."""
129+
check_sdrf(
130+
input_sdrf=exp_design,
131+
template=template,
132+
minimal=minimal,
133+
use_ols_cache_only=use_ols_cache_only,
134+
)

recipe/meta.yaml

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# recipe/meta.yaml
22
package:
33
name: quantms-utils
4-
version: "0.0.25"
4+
version: "0.0.26"
55

66
source:
77
path: ../
@@ -20,19 +20,16 @@ requirements:
2020
- python
2121
- pip
2222
- poetry-core >=1.2.0
23-
- setuptools <78
23+
2424
run:
2525
- python >=3.9,<3.13
2626
- click
27-
- setuptools <78
28-
- sdrf-pipelines >=0.0.33,<0.1.0
27+
- sdrf-pipelines >=0.1.1
2928
- pyopenms>=3.3.0
3029
- pandas
3130
- pyarrow>=16.1.0
3231
- scipy
3332
test:
34-
requires:
35-
- setuptools <78
3633
imports:
3734
- quantmsutils
3835
commands:

0 commit comments

Comments
 (0)