Skip to content

Commit ffd55fb

Browse files
authored
Merge pull request #111 from Vitek-Lab/feature-anomaly_model
Feature anomaly model
2 parents 8e38072 + 3decde3 commit ffd55fb

47 files changed

Lines changed: 4565 additions & 78 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

DESCRIPTION

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,16 @@ Imports:
2424
methods,
2525
checkmate,
2626
utils,
27-
stringi
27+
stringi,
28+
Rcpp,
29+
parallel
2830
Suggests:
2931
tinytest,
3032
covr,
3133
knitr,
32-
rmarkdown,
33-
arrow
34+
arrow,
35+
rmarkdown
36+
LinkingTo: Rcpp
3437
Collate:
3538
'clean_ProteinProspector.R'
3639
'clean_Metamorpheus.R'
@@ -46,6 +49,7 @@ Collate:
4649
'clean_MaxQuant.R'
4750
'clean_DIAUmpire.R'
4851
'MSstatsConvert_core_functions.R'
52+
'RcppExports.R'
4953
'converters_DIANNtoMSstatsFormat.R'
5054
'converters_DIAUmpiretoMSstatsFormat.R'
5155
'converters_FragPipetoMSstatsFormat.R'
@@ -60,10 +64,12 @@ Collate:
6064
'converters_SpectronauttoMSstatsFormat.R'
6165
'utils_MSstatsConvert.R'
6266
'utils_annotation.R'
67+
'utils_anomaly_score.R'
6368
'utils_balanced_design.R'
6469
'utils_checks.R'
6570
'utils_classes.R'
6671
'utils_clean_features.R'
72+
'utils_data_health.R'
6773
'utils_documentation.R'
6874
'utils_dt_operations.R'
6975
'utils_filtering.R'

NAMESPACE

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22

33
S3method(as.data.frame,MSstatsValidated)
44
S3method(as.data.table,MSstatsValidated)
5+
export(CheckDataHealth)
56
export(DIANNtoMSstatsFormat)
67
export(DIAUmpiretoMSstatsFormat)
78
export(FragPipetoMSstatsFormat)
9+
export(MSstatsAnomalyScores)
810
export(MSstatsBalancedDesign)
911
export(MSstatsClean)
1012
export(MSstatsImport)
@@ -25,7 +27,9 @@ export(getDataType)
2527
export(getInputFile)
2628
exportMethods(getDataType)
2729
exportMethods(getInputFile)
30+
import(Rcpp)
2831
import(data.table)
32+
import(parallel)
2933
importFrom(data.table,as.data.table)
3034
importFrom(data.table,fread)
3135
importFrom(data.table,melt)
@@ -37,3 +41,4 @@ importFrom(log4r,file_appender)
3741
importFrom(methods,new)
3842
importFrom(stats,na.omit)
3943
importFrom(utils,sessionInfo)
44+
useDynLib(MSstatsConvert, .registration = TRUE)

R/MSstatsConvert_core_functions.R

Lines changed: 93 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,7 @@ setMethod("MSstatsClean", signature = "MSstatsProteinProspectorFiles",
322322
#' names defined by the names of this list and values corresponding to its elements
323323
#' will be added to the output `data.frame`.
324324
#' @param aggregate_isotopic logical. If `TRUE`, isotopic peaks will by summed.
325+
#' @param anomaly_metrics character vector of names of columns with quality metrics. Default is missing and is not required if anomaly model not run.
325326
#' @param ... additional parameters to `data.table::fread`.
326327
#'
327328
#' @return data.table
@@ -363,7 +364,7 @@ MSstatsPreprocess = function(
363364
summarize_multiple_psms = max),
364365
score_filtering = list(), exact_filtering = list(),
365366
pattern_filtering = list(), columns_to_fill = list(),
366-
aggregate_isotopic = FALSE, ...
367+
aggregate_isotopic = FALSE, anomaly_metrics = c(), ...
367368
) {
368369
.checkMSstatsParams(input, annotation, feature_columns,
369370
remove_shared_peptides,
@@ -380,8 +381,10 @@ MSstatsPreprocess = function(
380381
input = .handleIsotopicPeaks(input, aggregate_isotopic)
381382
input = .filterFewMeasurements(input, 1, FALSE)
382383
input = .handleSharedPeptides(input, remove_shared_peptides)
383-
input = .cleanByFeature(input, feature_columns, feature_cleaning)
384-
input = .handleSingleFeaturePerProtein(input, remove_single_feature_proteins)
384+
input = .cleanByFeature(input, feature_columns,
385+
feature_cleaning, anomaly_metrics)
386+
input = .handleSingleFeaturePerProtein(input,
387+
remove_single_feature_proteins)
385388
input = .mergeAnnotation(input, annotation)
386389
.fillValues(input, columns_to_fill)
387390
.adjustIntensities(input)
@@ -406,6 +409,7 @@ MSstatsPreprocess = function(
406409
#' If "na_to_zero", missing values will be replaced by zeros.
407410
#' @param remove_few lgl, if TRUE, features with one or two measurements
408411
#' across runs will be removed.
412+
#' @param anomaly_metrics character vector of names of columns with quality metrics
409413
#'
410414
#' @export
411415
#' @return data.frame of class `MSstatsValidated`
@@ -422,7 +426,7 @@ MSstatsPreprocess = function(
422426
#'
423427
MSstatsBalancedDesign = function(input, feature_columns, fill_incomplete = TRUE,
424428
handle_fractions = TRUE, fix_missing = NULL,
425-
remove_few = TRUE) {
429+
remove_few = TRUE, anomaly_metrics = c()) {
426430
feature = NULL
427431

428432
input[, feature := do.call(".combine", .SD), .SDcols = feature_columns]
@@ -435,7 +439,7 @@ MSstatsBalancedDesign = function(input, feature_columns, fill_incomplete = TRUE,
435439
getOption("MSstatsLog")("INFO", msg_fractions)
436440
getOption("MSstatsMsg")("INFO", msg_fractions)
437441
}
438-
input = .makeBalancedDesign(input, fill_incomplete)
442+
input = .makeBalancedDesign(input, fill_incomplete, anomaly_metrics)
439443
msg_balanced = paste("** Updated quantification data to make balanced design.",
440444
"Missing values are marked by NA")
441445
getOption("MSstatsLog")("INFO", msg_balanced)
@@ -445,7 +449,7 @@ MSstatsBalancedDesign = function(input, feature_columns, fill_incomplete = TRUE,
445449
with = FALSE]
446450

447451
getOption("MSstatsLog")("INFO", "\n")
448-
.MSstatsFormat(input)
452+
.MSstatsFormat(input, anomaly_metrics)
449453
}
450454

451455

@@ -512,3 +516,86 @@ MSstatsMakeAnnotation = function(input, annotation, ...) {
512516
getOption("MSstatsMsg")("INFO", msg)
513517
annotation
514518
}
519+
520+
#' Run Anomaly Model
521+
#'
522+
#' @param input data.table preprocessed by the MSstatsBalancedDesign function
523+
#' @param quality_metrics character vector of quality metrics to use in the model
524+
#' @param temporal_direction character vector of same length as quality_metrics indicating temporal feature to create.
525+
#' @param missing_run_count numeric, maximum allowed fraction of missing runs per feature.
526+
#' @param n_feat numeric, maximum number of features per protein to use in the model.
527+
#' @param run_order data.frame with two columns: Run and Order. Order should be numeric and indicate the order of runs.
528+
#' @param n_trees numeric, number of trees to use in the isolation forest model. Default is 100.
529+
#' @param max_depth numeric or "auto", maximum depth of each tree. Default is "auto" which sets depth to log2(N) where N is the number of runs.
530+
#' @param cores numeric, number of cores to use for parallel processing. Default is 1.
531+
#' @useDynLib MSstatsConvert, .registration = TRUE
532+
#'
533+
#' @return data.table
534+
#' @export
535+
MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction,
536+
missing_run_count, n_feat, run_order, n_trees,
537+
max_depth, cores){
538+
539+
input = .prepareSpectronautAnomalyInput(input, quality_metrics,
540+
run_order, n_feat,
541+
missing_run_count)
542+
input$PSM = paste0(input$PeptideSequence, input$PrecursorCharge)
543+
544+
for (i in seq_along(quality_metrics)){
545+
if (temporal_direction[i] != FALSE){
546+
quality_metrics = c(quality_metrics,
547+
paste0(quality_metrics[i], ".",
548+
temporal_direction[i]))
549+
}
550+
}
551+
552+
input = .runAnomalyModel(input,
553+
n_trees=n_trees,
554+
max_depth=max_depth,
555+
cores=cores,
556+
split_column="PSM",
557+
quality_metrics=quality_metrics)
558+
559+
subset_cols = c("Run", "ProteinName", "PeptideSequence",
560+
"PrecursorCharge", "FragmentIon",
561+
"ProductCharge", "IsotopeLabelType",
562+
"Condition", "BioReplicate",
563+
"Fraction", "Intensity", "AnomalyScores",
564+
quality_metrics)
565+
566+
subset_cols = subset_cols[subset_cols %in% names(input)]
567+
input = input[, ..subset_cols]
568+
569+
return(input)
570+
571+
}
572+
573+
#' Takes as input the output of the SpectronauttoMSstatsFormat function and calculates various quality metrics to assess the health of the data. Requires Anomaly Detection model to be fit.
574+
#'
575+
#' @param input MSstats input which is the output of Spectronaut converter
576+
#' @return list of two data.tables
577+
#'
578+
#' @export
579+
CheckDataHealth = function(input){
580+
581+
input = as.data.table(input)
582+
583+
# All intensity characteristics
584+
missing_percent = .checkMissing(input)
585+
zero_truncated = .checkIntensityDistribution(input)
586+
587+
# Feature specific characteristics
588+
input$Feature = paste(input$PeptideSequence,
589+
input$PrecursorCharge,
590+
input$FragmentIon,
591+
input$ProductCharge, sep="_")
592+
feature_data = .checkFeatureSD(input)
593+
outlier_info = .checkFeatureOutliers(input, feature_data)
594+
feature_data = outlier_info[[1]]
595+
outlier_summary = outlier_info[[2]]
596+
feature_data = .checkFeatureCoverage(input, feature_data)
597+
598+
skew_results = .checkAnomalySkew(input)
599+
600+
return(list(feature_data, skew_results))
601+
}

R/RcppExports.R

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
2+
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3+
4+
calculate_anomaly_score <- function(df, n_trees, max_depth) {
5+
.Call(`_MSstatsConvert_calculate_anomaly_score`, df, n_trees, max_depth)
6+
}
7+

R/clean_Spectronaut.R

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,39 @@
11
#' Clean raw Spectronaut output.
22
#' @param msstats_object an object of class `MSstatsSpectronautFiles`.
33
#' @param intensity chr, specifies which column will be used for Intensity.
4+
#' @param calculateAnomalyScores logical, whether to calculate anomaly scores
5+
#' @param anomalyModelFeatures character vector, specifies which columns will be used for anomaly detection model. Can be NULL if calculateAnomalyScores=FALSE.
46
#' @return `data.table`
57
#' @keywords internal
6-
.cleanRawSpectronaut = function(msstats_object, intensity) {
8+
.cleanRawSpectronaut = function(msstats_object, intensity,
9+
calculateAnomalyScores,
10+
anomalyModelFeatures) {
711
FFrgLossType = FExcludedFromQuantification = NULL
812

913
spec_input = getInputFile(msstats_object, "input")
1014
.validateSpectronautInput(spec_input)
1115
spec_input = spec_input[FFrgLossType == "noloss", ]
12-
13-
if (is.character(spec_input$FExcludedFromQuantification)) {
14-
spec_input = spec_input[FExcludedFromQuantification == "False", ]
15-
} else {
16-
spec_input = spec_input[!(as.logical(FExcludedFromQuantification)), ]
17-
}
1816

1917
f_charge_col = .findAvailable(c("FCharge", "FFrgZ"), colnames(spec_input))
2018
pg_qval_col = .findAvailable(c("PGQvalue"), colnames(spec_input))
19+
interference_col = .findAvailable(c("FPossibleInterference"),
20+
colnames(spec_input))
21+
exclude_col = .findAvailable(c("FExcludedFromQuantification"),
22+
colnames(spec_input))
2123
cols = c("PGProteinGroups", "EGModifiedSequence", "FGCharge", "FFrgIon",
2224
f_charge_col, "RFileName", "RCondition", "RReplicate",
23-
"EGQvalue", pg_qval_col, paste0("F", intensity))
25+
"EGQvalue", pg_qval_col, interference_col, exclude_col,
26+
paste0("F", intensity))
27+
if (calculateAnomalyScores){
28+
cols = c(cols, anomalyModelFeatures)
29+
}
2430
cols = intersect(cols, colnames(spec_input))
2531
spec_input = spec_input[, cols, with = FALSE]
2632
data.table::setnames(
2733
spec_input,
2834
c("PGProteinGroups", "EGModifiedSequence", "FGCharge", "FFrgIon",
29-
f_charge_col, "RFileName", paste0("F", intensity), "RCondition", "RReplicate"),
35+
f_charge_col, "RFileName", paste0("F", intensity),
36+
"RCondition", "RReplicate"),
3037
c("ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon",
3138
"ProductCharge", "Run", "Intensity", "Condition", "BioReplicate"),
3239
skip_absent = TRUE)

0 commit comments

Comments
 (0)