|
1 | | -# nf-core: Update the script to check the sdrf |
2 | | -# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv |
3 | | - |
4 | | -import errno |
5 | 1 | import logging |
6 | | -import os |
7 | 2 | import sys |
8 | 3 |
|
9 | 4 | import click |
10 | 5 | import pandas as pd |
11 | | -from sdrf_pipelines.sdrf.sdrf import SdrfDataFrame |
12 | | -from sdrf_pipelines.sdrf.sdrf_schema import DEFAULT_TEMPLATE, MASS_SPECTROMETRY |
| 6 | + |
| 7 | +from sdrf_pipelines.sdrf.sdrf import read_sdrf |
13 | 8 |
|
# Configure root logging once at import time: timestamped records tagged with
# the name of the function that emitted them, DEBUG level and above.
logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG)
# Module-level logger; used below for non-blocking validation warnings.
logger = logging.getLogger(__name__)
16 | 11 |
|
17 | | - |
# Minimal columns required to run quantms/quantmsdiann pipelines.
# These are checked in --minimal mode instead of full schema validation.
# A missing entry is reported as a blocking error.
MINIMAL_REQUIRED_COLUMNS = [
    "source name",
    "assay name",
    "comment[data file]",
    "comment[label]",
    "comment[cleavage agent details]",
    "comment[instrument]",
    "comment[proteomics data acquisition method]",
    "technology type",
]

# Recommended columns: warn if missing but don't fail
MINIMAL_RECOMMENDED_COLUMNS = [
    "comment[precursor mass tolerance]",
    "comment[fragment mass tolerance]",
    "comment[dissociation method]",
    "comment[technical replicate]",
    "comment[fraction identifier]",
]
35 | 33 |
|
36 | 34 |
|
def check_sdrf(
    input_sdrf: str,
    template: str = "ms-proteomics",
    minimal: bool = False,
    use_ols_cache_only: bool = False,
):
    """
    Check the SDRF file for errors.

    Each error found is printed on its own line. The process then exits with
    status 1 when at least one error was reported and 0 otherwise, so this
    function never returns normally.

    :param input_sdrf: Path to the SDRF file to check
    :param template: Schema template for full validation (e.g. 'ms-proteomics', 'dia-acquisition');
        not used in minimal mode
    :param minimal: Only validate columns required to run the pipeline (skip organism, etc.)
    :param use_ols_cache_only: Use OLS cache instead of live OLS service; not used in minimal mode
    """
    if minimal:
        # Header-level check only: no schema template and no ontology lookups.
        errors = _validate_minimal(input_sdrf)
    else:
        df = read_sdrf(input_sdrf)
        errors = df.validate_sdrf(
            template=template,
            use_ols_cache_only=use_ols_cache_only,
        )

    for error in errors:
        print(error)

    # Explicit integer status: non-zero signals validation failure to the caller.
    sys.exit(1 if errors else 0)
74 | 62 |
|
75 | 63 |
|
def _validate_minimal(input_sdrf: str) -> list[str]:
    """Validate only the columns required to run the pipeline.

    Column names are compared case-insensitively. Only the header and the
    first data row of the tab-separated file are read.

    :param input_sdrf: Path to the tab-separated SDRF file
    :return: A list of error strings. An empty file, a header-only file, a
        missing required column, or the absence of any
        'comment[modification parameters...' column produce errors; missing
        recommended columns only produce logged warnings (non-blocking).
    """
    try:
        # One read yields both the header and whether any data row exists.
        preview = pd.read_csv(input_sdrf, sep="\t", nrows=1, dtype=str)
    except pd.errors.EmptyDataError:
        # A file with no header at all would otherwise surface as a traceback.
        return ["ERROR: SDRF file is empty."]

    # Reject header-only files
    if preview.empty:
        return ["ERROR: SDRF file contains a header but no data rows."]

    columns_lower = [c.lower() for c in preview.columns]
    present = set(columns_lower)

    # Check required columns (case-insensitive)
    errors = [
        f"ERROR: Required column '{col}' is missing from the SDRF file."
        for col in MINIMAL_REQUIRED_COLUMNS
        if col.lower() not in present
    ]

    # Check at least one modification parameters column exists. Prefix match,
    # because pandas renames repeated headers with a '.N' suffix.
    if not any(c.startswith("comment[modification parameters") for c in columns_lower):
        errors.append(
            "ERROR: At least one 'comment[modification parameters]' column is required."
        )

    # Warn about recommended columns (non-blocking)
    for col in MINIMAL_RECOMMENDED_COLUMNS:
        if col.lower() not in present:
            logger.warning(
                "Recommended column '%s' is missing. Pipeline will use default parameters.",
                col,
            )

    return errors
132 | 100 |
|
133 | 101 |
|
@click.command(
    "checksamplesheet",
    short_help="Validate an SDRF file for quantms pipelines.",
)
@click.option("--exp_design", help="SDRF file to be validated", required=True)
@click.option(
    "--template",
    "-t",
    help="Schema template for full validation (e.g. ms-proteomics, dia-acquisition)",
    default="ms-proteomics",
)
@click.option(
    "--minimal",
    help="Only validate columns required to run the pipeline (skip organism, metadata, etc.)",
    is_flag=True,
)
@click.option(
    "--use_ols_cache_only",
    help="Use OLS cache for ontology validation instead of the live OLS service",
    is_flag=True,
)
def checksamplesheet(
    exp_design: str,
    template: str = "ms-proteomics",
    minimal: bool = False,
    use_ols_cache_only: bool = False,
):
    """Validate an SDRF file for quantms pipelines."""
    # The docstring is kept to a single line because click displays it as the
    # command's help text.
    #
    # exp_design: path to the SDRF file (mandatory option).
    # template / minimal / use_ols_cache_only: forwarded unchanged.
    #
    # check_sdrf prints any errors and ends the process via sys.exit, so
    # nothing after this call would run.
    check_sdrf(
        input_sdrf=exp_design,
        template=template,
        minimal=minimal,
        use_ols_cache_only=use_ols_cache_only,
    )
0 commit comments