@@ -322,6 +322,7 @@ setMethod("MSstatsClean", signature = "MSstatsProteinProspectorFiles",
322322# ' names defined by the names of this list and values corresponding to its elements
323323# ' will be added to the output `data.frame`.
324324# ' @param aggregate_isotopic logical. If `TRUE`, isotopic peaks will be summed.
325+ # ' @param anomaly_metrics character vector of names of columns with quality metrics. Defaults to an empty vector (`c()`); not required if the anomaly model is not run.
325326# ' @param ... additional parameters to `data.table::fread`.
326327# '
327328# ' @return data.table
@@ -363,7 +364,7 @@ MSstatsPreprocess = function(
363364 summarize_multiple_psms = max ),
364365 score_filtering = list (), exact_filtering = list (),
365366 pattern_filtering = list (), columns_to_fill = list (),
366- aggregate_isotopic = FALSE , ...
367+ aggregate_isotopic = FALSE , anomaly_metrics = c(), ...
367368) {
368369 .checkMSstatsParams(input , annotation , feature_columns ,
369370 remove_shared_peptides ,
@@ -380,8 +381,10 @@ MSstatsPreprocess = function(
380381 input = .handleIsotopicPeaks(input , aggregate_isotopic )
381382 input = .filterFewMeasurements(input , 1 , FALSE )
382383 input = .handleSharedPeptides(input , remove_shared_peptides )
383- input = .cleanByFeature(input , feature_columns , feature_cleaning )
384- input = .handleSingleFeaturePerProtein(input , remove_single_feature_proteins )
384+ input = .cleanByFeature(input , feature_columns ,
385+ feature_cleaning , anomaly_metrics )
386+ input = .handleSingleFeaturePerProtein(input ,
387+ remove_single_feature_proteins )
385388 input = .mergeAnnotation(input , annotation )
386389 .fillValues(input , columns_to_fill )
387390 .adjustIntensities(input )
@@ -406,6 +409,7 @@ MSstatsPreprocess = function(
406409# ' If "na_to_zero", missing values will be replaced by zeros.
407410# ' @param remove_few lgl, if TRUE, features with one or two measurements
408411# ' across runs will be removed.
412+ # ' @param anomaly_metrics character vector of names of columns with quality metrics
409413# '
410414# ' @export
411415# ' @return data.frame of class `MSstatsValidated`
@@ -422,7 +426,7 @@ MSstatsPreprocess = function(
422426# '
423427MSstatsBalancedDesign = function (input , feature_columns , fill_incomplete = TRUE ,
424428 handle_fractions = TRUE , fix_missing = NULL ,
425- remove_few = TRUE ) {
429+ remove_few = TRUE , anomaly_metrics = c() ) {
426430 feature = NULL
427431
428432 input [, feature : = do.call(" .combine" , .SD ), .SDcols = feature_columns ]
@@ -435,7 +439,7 @@ MSstatsBalancedDesign = function(input, feature_columns, fill_incomplete = TRUE,
435439 getOption(" MSstatsLog" )(" INFO" , msg_fractions )
436440 getOption(" MSstatsMsg" )(" INFO" , msg_fractions )
437441 }
438- input = .makeBalancedDesign(input , fill_incomplete )
442+ input = .makeBalancedDesign(input , fill_incomplete , anomaly_metrics )
439443 msg_balanced = paste(" ** Updated quantification data to make balanced design." ,
440444 " Missing values are marked by NA" )
441445 getOption(" MSstatsLog" )(" INFO" , msg_balanced )
@@ -445,7 +449,7 @@ MSstatsBalancedDesign = function(input, feature_columns, fill_incomplete = TRUE,
445449 with = FALSE ]
446450
447451 getOption(" MSstatsLog" )(" INFO" , " \n " )
448- .MSstatsFormat(input )
452+ .MSstatsFormat(input , anomaly_metrics )
449453}
450454
451455
@@ -512,3 +516,86 @@ MSstatsMakeAnnotation = function(input, annotation, ...) {
512516 getOption(" MSstatsMsg" )(" INFO" , msg )
513517 annotation
514518}
519+
#' Run Anomaly Model
#'
#' Prepares the input with `.prepareSpectronautAnomalyInput`, extends the
#' quality metrics with their temporal variants, scores the data with
#' `.runAnomalyModel` (split by PSM, i.e. peptide sequence pasted with
#' precursor charge), and keeps only the standard MSstats columns together
#' with the anomaly scores and the quality metrics.
#'
#' @param input data.table preprocessed by the MSstatsBalancedDesign function
#' @param quality_metrics character vector of quality metrics to use in the model
#' @param temporal_direction character vector of same length as quality_metrics
#' indicating temporal feature to create. An entry equal to `FALSE` creates no
#' temporal feature for that metric; any other entry adds the metric
#' `<quality_metric>.<temporal_direction>` to the model.
#' @param missing_run_count numeric, maximum allowed fraction of missing runs per feature.
#' @param n_feat numeric, maximum number of features per protein to use in the model.
#' @param run_order data.frame with two columns: Run and Order. Order should be numeric and indicate the order of runs.
#' @param n_trees numeric, number of trees to use in the isolation forest model. Default is 100.
#' @param max_depth numeric or "auto", maximum depth of each tree. Default is "auto" which sets depth to log2(N) where N is the number of runs.
#' @param cores numeric, number of cores to use for parallel processing. Default is 1.
#' @useDynLib MSstatsConvert, .registration = TRUE
#'
#' @return data.table
#' @export
MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction,
                                missing_run_count, n_feat, run_order,
                                n_trees = 100, max_depth = "auto", cores = 1) {
    # Only the entries paired with a quality metric are ever consulted.
    directions = temporal_direction[seq_along(quality_metrics)]
    # A short or NA-containing `temporal_direction` used to fail later with
    # "missing value where TRUE/FALSE needed"; fail early with a clear message.
    if (length(temporal_direction) < length(quality_metrics) ||
        anyNA(directions)) {
        stop("`temporal_direction` must provide a non-missing value for ",
             "every element of `quality_metrics`.", call. = FALSE)
    }

    input = .prepareSpectronautAnomalyInput(input, quality_metrics,
                                            run_order, n_feat,
                                            missing_run_count)
    input$PSM = paste0(input$PeptideSequence, input$PrecursorCharge)

    # Append "<metric>.<direction>" for every metric whose direction is not
    # FALSE, preserving the order of `quality_metrics`.
    has_temporal = directions != FALSE
    if (any(has_temporal)) {
        quality_metrics = c(quality_metrics,
                            paste0(quality_metrics[has_temporal], ".",
                                   directions[has_temporal]))
    }

    input = .runAnomalyModel(input,
                             n_trees = n_trees,
                             max_depth = max_depth,
                             cores = cores,
                             split_column = "PSM",
                             quality_metrics = quality_metrics)

    subset_cols = c("Run", "ProteinName", "PeptideSequence",
                    "PrecursorCharge", "FragmentIon",
                    "ProductCharge", "IsotopeLabelType",
                    "Condition", "BioReplicate",
                    "Fraction", "Intensity", "AnomalyScores",
                    quality_metrics)

    # Keep only the requested columns that are actually present.
    subset_cols = subset_cols[subset_cols %in% names(input)]
    input[, subset_cols, with = FALSE]
}
572+
#' Check the health of converted Spectronaut data
#'
#' Takes as input the output of the SpectronauttoMSstatsFormat function and
#' calculates various quality metrics to assess the health of the data.
#' Requires Anomaly Detection model to be fit.
#'
#' @param input MSstats input which is the output of Spectronaut converter
#' @return list of two data.tables: the feature-level data (first element)
#' and the result of the anomaly skew check (second element).
#'
#' @export
CheckDataHealth = function(input) {
    input = as.data.table(input)

    # All intensity characteristics. The results are not part of the return
    # value; the calls are kept in case the helpers report their findings as
    # a side effect. NOTE(review): confirm, or return these results as well.
    .checkMissing(input)
    .checkIntensityDistribution(input)

    # Feature specific characteristics
    input$Feature = paste(input$PeptideSequence,
                          input$PrecursorCharge,
                          input$FragmentIon,
                          input$ProductCharge, sep = "_")
    feature_data = .checkFeatureSD(input)
    # Only the first element of the outlier check (updated feature data) is
    # used; the second element is currently discarded.
    feature_data = .checkFeatureOutliers(input, feature_data)[[1]]
    feature_data = .checkFeatureCoverage(input, feature_data)

    skew_results = .checkAnomalySkew(input)

    list(feature_data, skew_results)
}
0 commit comments