-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathFastaValidator.cs
More file actions
6345 lines (5364 loc) · 270 KB
/
Copy pathFastaValidator.cs
File metadata and controls
6345 lines (5364 loc) · 270 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using PRISM;
namespace ValidateFastaFile
{
// ReSharper disable UnusedMember.Global
/// <summary>
/// This class will read a protein FASTA file and validate its contents
/// </summary>
/// <remarks>
/// <para>
/// Written by Matthew Monroe for the Department of Energy (PNNL, Richland, WA)
/// Program started March 21, 2005
/// </para>
/// <para>
/// E-mail: matthew.monroe@pnnl.gov or proteomics@pnnl.gov
/// Website: https://github.com/PNNL-Comp-Mass-Spec/ or https://panomics.pnnl.gov/ or https://www.pnnl.gov/integrative-omics
/// </para>
/// <para>
/// Licensed under the Apache License, Version 2.0; you may not use this file except
/// in compliance with the License. You may obtain a copy of the License at
/// http://www.apache.org/licenses/LICENSE-2.0
/// </para>
/// </remarks>
public class FastaValidator : PRISM.FileProcessor.ProcessFilesBase
{
// ReSharper disable once CommentTypo
// Ignore Spelling: A-Za-z, Diff, Dups, xfffd, gi, gzipped, jgi, Lf, Mem, ornithine
// Ignore Spelling: pre, selenocysteine, Sep, seqs, Validator, varchar, yyyy-MM-dd
/// <summary>
/// Default constructor: records the file date, creates the list of temporary files to delete,
/// then calls InitializeLocalVariables()
/// </summary>
/// <remarks>
/// mFileDate is not declared in this part of the file; presumably it is inherited from ProcessFilesBase (TODO confirm)
/// </remarks>
public FastaValidator()
{
mFileDate = "December 1, 2021";
// mTempFilesToDelete is a readonly field, so it must be assigned here;
// it is created before InitializeLocalVariables() runs in case that method uses the list
mTempFilesToDelete = new List<string>();
InitializeLocalVariables();
}
/// <summary>
/// Constructor that takes a parameter file
/// </summary>
/// <remarks>
/// Runs the default constructor first, then passes the path to LoadParameterFileSettings()
/// </remarks>
/// <param name="parameterFilePath">Path to the parameter file to load</param>
// ReSharper disable once UnusedMember.Global
public FastaValidator(string parameterFilePath) : this()
{
LoadParameterFileSettings(parameterFilePath);
}
private const int DEFAULT_MINIMUM_PROTEIN_NAME_LENGTH = 3;
/// <summary>
/// The maximum suggested value when using SEQUEST is 34 characters
/// In contrast, MS-GF+ supports long protein names
/// </summary>
public const int DEFAULT_MAXIMUM_PROTEIN_NAME_LENGTH = 60;
private const int DEFAULT_MAXIMUM_RESIDUES_PER_LINE = 120;
/// <summary>
/// Default protein line start character
/// </summary>
public const char DEFAULT_PROTEIN_LINE_START_CHAR = '>';
/// <summary>
/// Default long protein name split char
/// </summary>
public const char DEFAULT_LONG_PROTEIN_NAME_SPLIT_CHAR = '|';
/// <summary>
/// Default protein name first reference split chars
/// </summary>
public const string DEFAULT_PROTEIN_NAME_FIRST_REF_SEP_CHARS = ":|";
/// <summary>
/// Default protein name subsequent reference separation chars
/// </summary>
public const string DEFAULT_PROTEIN_NAME_SUBSEQUENT_REF_SEP_CHARS = ":|;";
/// <summary>
/// Replacement character for invalid protein name characters
/// </summary>
/// <remarks>
/// NOTE(review): the code that uses this constant is outside this part of the file;
/// the description is based on the constant's name - confirm
/// </remarks>
private const char INVALID_PROTEIN_NAME_CHAR_REPLACEMENT = '_';
/// <summary>
/// First message code used for custom rules; mMasterCustomRuleID is initialized to this value
/// </summary>
/// <remarks>
/// Values in enum MessageCodeConstants should all be less than this value
/// </remarks>
private const int CUSTOM_RULE_ID_START = 1000;
/// <summary>
/// Default context length
/// </summary>
/// <remarks>
/// NOTE(review): presumably the number of characters stored as message context - confirm against usage
/// </remarks>
private const int DEFAULT_CONTEXT_LENGTH = 13;
/// <summary>
/// Protein description missing message
/// </summary>
public const string MESSAGE_TEXT_PROTEIN_DESCRIPTION_MISSING = "Line contains a protein name, but not a description";
/// <summary>
/// Protein description too long message
/// </summary>
public const string MESSAGE_TEXT_PROTEIN_DESCRIPTION_TOO_LONG = "Protein description is over 900 characters long";
/// <summary>
/// Asterisk found in the residues message
/// </summary>
public const string MESSAGE_TEXT_ASTERISK_IN_RESIDUES = "An asterisk was found in the residues";
/// <summary>
/// Dash found in the residues message
/// </summary>
public const string MESSAGE_TEXT_DASH_IN_RESIDUES = "A dash was found in the residues";
/// <summary>
/// Option section name in the XML parameter file
/// </summary>
public const string XML_SECTION_OPTIONS = "ValidateFastaFileOptions";
/// <summary>
/// Fixed FASTA file options section in the XML parameter file
/// </summary>
public const string XML_SECTION_FIXED_FASTA_FILE_OPTIONS = "ValidateFastaFixedFASTAFileOptions";
/// <summary>
/// Header line rules section in the XML parameter file
/// </summary>
public const string XML_SECTION_FASTA_HEADER_LINE_RULES = "ValidateFastaHeaderLineRules";
/// <summary>
/// Protein name rules section in the XML parameter file
/// </summary>
public const string XML_SECTION_FASTA_PROTEIN_NAME_RULES = "ValidateFastaProteinNameRules";
/// <summary>
/// Protein description rules section in the XML parameter file
/// </summary>
public const string XML_SECTION_FASTA_PROTEIN_DESCRIPTION_RULES = "ValidateFastaProteinDescriptionRules";
/// <summary>
/// Protein sequence rules section in the XML parameter file
/// </summary>
public const string XML_SECTION_FASTA_PROTEIN_SEQUENCE_RULES = "ValidateFastaProteinSequenceRules";
/// <summary>
/// RuleCount element name
/// </summary>
public const string XML_OPTION_ENTRY_RULE_COUNT = "RuleCount";
/// <summary>
/// Maximum protein description length
/// </summary>
/// <remarks>
/// The value of 7995 is chosen because the maximum varchar() value in SQL Server is varchar(8000)
/// and we want to prevent truncation errors when importing protein names and descriptions into SQL Server
/// </remarks>
public const int MAX_PROTEIN_DESCRIPTION_LENGTH = 7995;
private const string MEM_USAGE_PREFIX = "MemUsage: ";
private const bool REPORT_DETAILED_MEMORY_USAGE = false;
private const string PROTEIN_NAME_COLUMN = "Protein_Name";
private const string SEQUENCE_LENGTH_COLUMN = "Sequence_Length";
private const string SEQUENCE_HASH_COLUMN = "Sequence_Hash";
private const string PROTEIN_HASHES_FILENAME_SUFFIX = "_ProteinHashes.txt";
private const int DEFAULT_WARNING_SEVERITY = 3;
private const int DEFAULT_ERROR_SEVERITY = 7;
/// <summary>
/// Message code constants
/// </summary>
/// <remarks>
/// <para>
/// Custom rules start with message code CUSTOM_RULE_ID_START=1000, and therefore
/// the values in enum MessageCodeConstants should all be less than CUSTOM_RULE_ID_START
/// </para>
/// <para>
/// The commented-out members below are not part of the enum. Several of their numeric values
/// repeat each other or repeat the value of an active member (for example 14 and 21),
/// so uncommenting them as-is would create duplicate values
/// </para>
/// </remarks>
public enum MessageCodeConstants
{
#pragma warning disable 1591
UnspecifiedError = 0,
// Error messages
ProteinNameIsTooLong = 1,
LineStartsWithSpace = 2,
// RightArrowFollowedBySpace = 3,
// RightArrowFollowedByTab = 4,
// RightArrowButNoProteinName = 5,
BlankLineBetweenProteinNameAndResidues = 6,
BlankLineInMiddleOfResidues = 7,
ResiduesFoundWithoutProteinHeader = 8,
ProteinEntriesNotFound = 9,
FinalProteinEntryMissingResidues = 10,
FileDoesNotEndWithLinefeed = 11,
DuplicateProteinName = 12,
// Warning messages
ProteinNameIsTooShort = 13,
// ProteinNameContainsVerticalBars = 14,
// ProteinNameContainsWarningCharacters = 21,
// ProteinNameWithoutDescription = 14,
BlankLineBeforeProteinName = 15,
// ProteinNameAndDescriptionSeparatedByTab = 16,
// ProteinDescriptionWithTab = 25,
// ProteinDescriptionWithQuotationMark = 26,
// ProteinDescriptionWithEscapedSlash = 27,
// ProteinDescriptionWithUndesirableCharacter = 28,
ResiduesLineTooLong = 17,
// ResiduesLineContainsU = 30,
DuplicateProteinSequence = 18,
RenamedProtein = 19,
ProteinRemovedSinceDuplicateSequence = 20,
DuplicateProteinNameRetained = 21,
ResiduesAreLikelyDNA = 22
#pragma warning restore 1591
}
/// <summary>
/// Details of a single message (error or warning) reported for a FASTA file
/// </summary>
public class MsgInfo : IComparable<MsgInfo>
{
    /// <summary>
    /// Line number in the FASTA file that this message refers to
    /// </summary>
    public int LineNumber { get; }

    /// <summary>
    /// Column number in the FASTA file that this message refers to
    /// </summary>
    public int ColNumber { get; }

    /// <summary>
    /// Name of the protein that this message refers to
    /// </summary>
    public string ProteinName { get; }

    /// <summary>
    /// Message code
    /// </summary>
    public int MessageCode { get; }

    /// <summary>
    /// Additional information about this message
    /// </summary>
    public string ExtraInfo { get; }

    /// <summary>
    /// Message context
    /// </summary>
    public string Context { get; }

    /// <summary>
    /// Parameterless constructor: numeric values are 0 and text values are empty strings
    /// </summary>
    public MsgInfo() : this(0, 0, string.Empty, 0, string.Empty, string.Empty)
    {
    }

    /// <summary>
    /// Constructor that stores the location, protein name, message code, and descriptive text
    /// </summary>
    /// <param name="lineNumber">Line number</param>
    /// <param name="colNumber">Column number</param>
    /// <param name="proteinName">Protein name</param>
    /// <param name="messageCode">Message code</param>
    /// <param name="extraInfo">Extra info</param>
    /// <param name="context">Context</param>
    public MsgInfo(int lineNumber, int colNumber, string proteinName, int messageCode, string extraInfo, string context)
    {
        MessageCode = messageCode;
        ProteinName = proteinName;
        LineNumber = lineNumber;
        ColNumber = colNumber;
        ExtraInfo = extraInfo;
        Context = context;
    }

    /// <summary>
    /// Return the line number, protein name, message code, and extra info as a single string
    /// </summary>
    public override string ToString()
    {
        return $"Line {LineNumber}, protein {ProteinName}, code {MessageCode}: {ExtraInfo}";
    }

    /// <summary>
    /// Compare this instance to another one, ordering by message code, then line number, then column number
    /// </summary>
    /// <param name="other">Instance to compare against; a null instance sorts before this one</param>
    /// <returns>0 if the two instances match, otherwise -1 or 1 based on sort order</returns>
    public int CompareTo(MsgInfo other)
    {
        if (other is null)
            return 1;

        if (ReferenceEquals(this, other))
            return 0;

        // Fall through to the next key only when the previous key is tied
        var result = MessageCode.CompareTo(other.MessageCode);

        if (result == 0)
            result = LineNumber.CompareTo(other.LineNumber);

        if (result == 0)
            result = ColNumber.CompareTo(other.ColNumber);

        return result;
    }
}
/// <summary>
/// Settings that control how results are reported
/// </summary>
public class OutputOptions
{
    /// <summary>
    /// Filename of the FASTA file examined
    /// </summary>
    public string SourceFile { get; set; }

    /// <summary>
    /// When true, write message stats to a file
    /// </summary>
    /// <remarks>Set by the constructor; read-only afterwards</remarks>
    public bool OutputToStatsFile { get; }

    /// <summary>
    /// Writer for the output file
    /// </summary>
    public StreamWriter OutFile { get; set; }

    /// <summary>
    /// Column separation character
    /// </summary>
    public string SepChar { get; set; }

    /// <summary>
    /// Constructor
    /// </summary>
    /// <param name="outputToStatsFile">True to write message stats to a file</param>
    /// <param name="sepChar">Column separation character</param>
    public OutputOptions(bool outputToStatsFile, string sepChar)
    {
        SepChar = sepChar;
        OutputToStatsFile = outputToStatsFile;
    }

    /// <summary>
    /// Return the value of SourceFile (the name of the FASTA file being analyzed)
    /// </summary>
    public override string ToString() => SourceFile;
}
#pragma warning disable 1591
/// <summary>
/// Validation rule types
/// </summary>
/// <remarks>
/// The member names parallel the four rule lists declared in FastaValidator
/// (mHeaderLineRules, mProteinNameRules, mProteinDescriptionRules, mProteinSequenceRules)
/// and the rule sections named by the XML_SECTION_FASTA_*_RULES constants
/// </remarks>
public enum RuleTypes
{
HeaderLine,
ProteinName,
ProteinDescription,
ProteinSequence
}
/// <summary>
/// Option switches
/// </summary>
/// <remarks>
/// Passed to SetOptionSwitch() together with a boolean state to turn an option on or off.
/// No explicit numeric values are assigned, so members are numbered by declaration order
/// </remarks>
public enum SwitchOptions
{
AddMissingLineFeedAtEOF,
AllowAsteriskInResidues,
CheckForDuplicateProteinNames,
GenerateFixedFASTAFile,
SplitOutMultipleRefsInProteinName,
OutputToStatsFile,
WarnBlankLinesBetweenProteins,
WarnLineStartsWithSpace,
NormalizeFileLineEndCharacters,
CheckForDuplicateProteinSequences,
FixedFastaRenameDuplicateNameProteins,
SaveProteinSequenceHashInfoFiles,
FixedFastaConsolidateDuplicateProteinSeqs,
FixedFastaConsolidateDupsIgnoreILDiff,
FixedFastaTruncateLongProteinNames,
FixedFastaSplitOutMultipleRefsForKnownAccession,
FixedFastaWrapLongResidueLines,
FixedFastaRemoveInvalidResidues,
SaveBasicProteinHashInfoFile,
AllowDashInResidues,
FixedFastaKeepDuplicateNamedProteins, // Keep duplicate named proteins, unless the name and sequence match exactly, then they're removed
AllowAllSymbolsInProteinNames
}
/// <summary>
/// Fixed FASTA stat categories
/// </summary>
/// <remarks>
/// FixedFastaStats.GetStat() maps each member to one of its counters, for example
/// DuplicateProteinNamesSkippedCount returns DuplicateNameProteinsSkipped and
/// DuplicateProteinSeqsSkippedCount returns DuplicateSequenceProteinsSkipped
/// </remarks>
public enum FixedFASTAFileValues
{
DuplicateProteinNamesSkippedCount,
ProteinNamesInvalidCharsReplaced,
ProteinNamesMultipleRefsRemoved,
TruncatedProteinNameCount,
UpdatedResidueLines,
DuplicateProteinNamesRenamedCount,
DuplicateProteinSeqsSkippedCount
}
/// <summary>
/// Error warning count types
/// </summary>
/// <remarks>
/// NOTE(review): Specified and Unspecified presumably correspond to ErrorStats.CountSpecified
/// (details stored) and ErrorStats.CountUnspecified (details not stored), with Total being
/// their sum - confirm against the code that consumes this enum
/// </remarks>
public enum ErrorWarningCountTypes
{
Specified,
Unspecified,
Total
}
/// <summary>
/// Message type constants
/// </summary>
/// <remarks>
/// Numeric values are assigned explicitly (0, 1, 2)
/// </remarks>
public enum MsgTypeConstants
{
ErrorMsg = 0,
WarningMsg = 1,
StatusMsg = 2
}
/// <summary>
/// Validation error codes
/// </summary>
/// <remarks>
/// NOTE(review): the positive values are powers of two (1, 2, 4, 8), but the enum is not
/// marked with [Flags]; confirm whether values are ever combined before relying on that
/// </remarks>
public enum ValidateFastaFileErrorCodes
{
NoError = 0,
OptionsSectionNotFound = 1,
ErrorReadingInputFile = 2,
ErrorCreatingStatsFile = 4,
ErrorVerifyingLinefeedAtEOF = 8,
UnspecifiedError = -1
}
#pragma warning restore 1591
/// <summary>
/// Line ending characters
/// </summary>
/// <remarks>
/// Member names abbreviate the character sequence: CR = carriage return, LF = line feed
/// </remarks>
public enum LineEndingCharacters
{
/// <summary>
/// Windows
/// </summary>
CRLF,
/// <summary>
/// Old style Mac
/// </summary>
CR,
/// <summary>
/// Unix, Linux, OS X
/// </summary>
LF,
/// <summary>
/// Oddball (Just for completeness!)
/// </summary>
LFCR
}
/// <summary>
/// Occurrence counts for a single message code
/// </summary>
private class ErrorStats
{
    /// <summary>
    /// Message code that these counts apply to
    /// </summary>
    /// <remarks>Custom rules start with message code CUSTOM_RULE_ID_START</remarks>
    private int MessageCode { get; }

    /// <summary>
    /// Number of times detailed information about this error was stored in mFileErrors
    /// </summary>
    public int CountSpecified { get; set; }

    /// <summary>
    /// Number of additional occurrences of this error (where details were not stored in mFileErrors)
    /// </summary>
    public int CountUnspecified { get; set; }

    /// <summary>
    /// Constructor
    /// </summary>
    /// <param name="messageCode">Message code to track</param>
    public ErrorStats(int messageCode) => MessageCode = messageCode;

    /// <summary>
    /// Return the message code followed by the specified and unspecified counts
    /// </summary>
    public override string ToString() =>
        $"{MessageCode}: {CountSpecified} specified, {CountUnspecified} unspecified";
}
/// <summary>
/// Holds the stored messages plus per-message-code occurrence counts
/// </summary>
private class MsgInfosAndSummary
{
    /// <summary>
    /// Stored messages
    /// </summary>
    public List<MsgInfo> Messages { get; } = new();

    /// <summary>
    /// Occurrence counts, keyed by message code
    /// </summary>
    public Dictionary<int, ErrorStats> MessageCodeToErrorStats { get; } = new();

    /// <summary>
    /// Remove all stored messages and all occurrence counts
    /// </summary>
    public void Reset()
    {
        MessageCodeToErrorStats.Clear();
        Messages.Clear();
    }

    /// <summary>
    /// Return the sum of CountSpecified for all tracked messages
    /// </summary>
    // ReSharper disable once UnusedMember.Local
    public int ComputeTotalSpecifiedCount() =>
        MessageCodeToErrorStats.Values.Select(item => item.CountSpecified).Sum();

    /// <summary>
    /// Return the sum of CountUnspecified for all tracked messages
    /// </summary>
    public int ComputeTotalUnspecifiedCount() =>
        MessageCodeToErrorStats.Values.Select(item => item.CountUnspecified).Sum();
}
/// <summary>
/// Definition of a single validation rule
/// </summary>
private class RuleDefinition
{
    /// <summary>
    /// Regular expression text for this rule
    /// </summary>
    public string MatchRegEx { get; }

    /// <summary>
    /// When true, text that matches the RegEx indicates a problem;
    /// when false, text that does not match the RegEx indicates a problem
    /// </summary>
    public bool MatchIndicatesProblem { get; set; }

    /// <summary>
    /// Message to display if a problem is present
    /// </summary>
    public string MessageWhenProblem { get; set; }

    /// <summary>
    /// 0 is lowest severity, 9 is highest severity; value >= 5 means error
    /// </summary>
    public short Severity { get; set; }

    /// <summary>
    /// If true, the matching text is stored as the context info
    /// </summary>
    public bool DisplayMatchAsExtraInfo { get; set; }

    /// <summary>
    /// Custom Rule ID
    /// </summary>
    /// <remarks>This value is auto-assigned</remarks>
    public int CustomRuleID { get; set; }

    /// <summary>
    /// Constructor
    /// </summary>
    /// <param name="matchRegEx">Regular expression text for this rule</param>
    public RuleDefinition(string matchRegEx) => MatchRegEx = matchRegEx;

    /// <summary>
    /// Return the custom rule ID, then a colon and the message shown when a problem is present
    /// </summary>
    public override string ToString() => $"{CustomRuleID}: {MessageWhenProblem}";
}
/// <summary>
/// Pairs a rule definition with a Regex instance
/// </summary>
private class RuleDefinitionExtended
{
    /// <summary>
    /// Parent rule definition
    /// </summary>
    public RuleDefinition RuleDefinition { get; }

    /// <summary>
    /// Regex instance supplied to the constructor for this rule
    /// </summary>
    public Regex MatchRegEx { get; }

    /// <summary>
    /// True if the rule is valid, false if a problem
    /// </summary>
    // ReSharper disable once UnusedAutoPropertyAccessor.Local
    public bool Valid { get; set; }

    /// <summary>
    /// Constructor
    /// </summary>
    /// <param name="ruleDefinition">Parent rule definition</param>
    /// <param name="regexRule">Regex instance to store in MatchRegEx</param>
    public RuleDefinitionExtended(RuleDefinition ruleDefinition, Regex regexRule)
    {
        MatchRegEx = regexRule;
        RuleDefinition = ruleDefinition;
    }

    /// <summary>
    /// Return the parent rule's custom rule ID, then a colon and its problem message
    /// </summary>
    public override string ToString() =>
        $"{RuleDefinition.CustomRuleID}: {RuleDefinition.MessageWhenProblem}";
}
/// <summary>
/// Options that apply when creating a fixed FASTA file
/// </summary>
private class FixedFastaOptions
{
    /// <summary>
    /// Split out multiple refs in protein name
    /// </summary>
    public bool SplitOutMultipleRefsInProteinName { get; set; }

    /// <summary>
    /// Split out multiple refs for known accession
    /// </summary>
    public bool SplitOutMultipleRefsForKnownAccession { get; set; }

    /// <summary>
    /// Long protein name split chars
    /// </summary>
    /// <remarks>Starts as a single-item array holding DEFAULT_LONG_PROTEIN_NAME_SPLIT_CHAR</remarks>
    public char[] LongProteinNameSplitChars { get; set; } = new char[] { DEFAULT_LONG_PROTEIN_NAME_SPLIT_CHAR };

    /// <summary>
    /// Protein name invalid chars to remove
    /// </summary>
    /// <remarks>Starts as an empty array</remarks>
    public char[] ProteinNameInvalidCharsToRemove { get; set; } = new char[0];

    /// <summary>
    /// Rename proteins with duplicate names
    /// </summary>
    public bool RenameProteinsWithDuplicateNames { get; set; }

    /// <summary>
    /// Keep duplicate named proteins unless matching sequence
    /// </summary>
    /// <remarks>Ignored if RenameProteinsWithDuplicateNames=true or ConsolidateProteinsWithDuplicateSeqs=true</remarks>
    public bool KeepDuplicateNamedProteinsUnlessMatchingSequence { get; set; }

    /// <summary>
    /// Consolidate proteins with duplicate sequences
    /// </summary>
    public bool ConsolidateProteinsWithDuplicateSeqs { get; set; }

    /// <summary>
    /// Ignore I/L differences when consolidating duplicates
    /// </summary>
    public bool ConsolidateDupsIgnoreILDiff { get; set; }

    /// <summary>
    /// Truncate long protein names
    /// </summary>
    public bool TruncateLongProteinNames { get; set; }

    /// <summary>
    /// Wrap long residue lines
    /// </summary>
    public bool WrapLongResidueLines { get; set; }

    /// <summary>
    /// Remove invalid residues
    /// </summary>
    public bool RemoveInvalidResidues { get; set; }

    /// <summary>
    /// Constructor; the two array properties receive their starting values from their initializers
    /// </summary>
    public FixedFastaOptions()
    {
    }
}
/// <summary>
/// Counters describing the changes made while creating a fixed FASTA file
/// </summary>
private class FixedFastaStats
{
    /// <summary>
    /// Truncated protein name count
    /// </summary>
    public int TruncatedProteinNameCount { get; set; }

    /// <summary>
    /// Updated residue lines
    /// </summary>
    public int UpdatedResidueLines { get; set; }

    /// <summary>
    /// Protein names invalid chars replaced
    /// </summary>
    public int ProteinNamesInvalidCharsReplaced { get; set; }

    /// <summary>
    /// Protein names multiple refs removed
    /// </summary>
    public int ProteinNamesMultipleRefsRemoved { get; set; }

    /// <summary>
    /// Duplicate name proteins skipped
    /// </summary>
    public int DuplicateNameProteinsSkipped { get; set; }

    /// <summary>
    /// Duplicate name proteins renamed
    /// </summary>
    public int DuplicateNameProteinsRenamed { get; set; }

    /// <summary>
    /// Duplicate sequence proteins skipped
    /// </summary>
    public int DuplicateSequenceProteinsSkipped { get; set; }

    /// <summary>
    /// Constructor; starts with every counter at 0
    /// </summary>
    public FixedFastaStats()
    {
        Reset();
    }

    /// <summary>
    /// Set every counter back to 0
    /// </summary>
    public void Reset()
    {
        DuplicateSequenceProteinsSkipped = 0;
        DuplicateNameProteinsRenamed = 0;
        DuplicateNameProteinsSkipped = 0;
        ProteinNamesMultipleRefsRemoved = 0;
        ProteinNamesInvalidCharsReplaced = 0;
        UpdatedResidueLines = 0;
        TruncatedProteinNameCount = 0;
    }

    /// <summary>
    /// Look up the counter that corresponds to a stat category
    /// </summary>
    /// <param name="statCategory">Stat category to retrieve</param>
    /// <returns>The counter value, or 0 if the category is not recognized</returns>
    public int GetStat(FixedFASTAFileValues statCategory)
    {
        switch (statCategory)
        {
            case FixedFASTAFileValues.DuplicateProteinNamesSkippedCount:
                return DuplicateNameProteinsSkipped;

            case FixedFASTAFileValues.ProteinNamesInvalidCharsReplaced:
                return ProteinNamesInvalidCharsReplaced;

            case FixedFASTAFileValues.ProteinNamesMultipleRefsRemoved:
                return ProteinNamesMultipleRefsRemoved;

            case FixedFASTAFileValues.TruncatedProteinNameCount:
                return TruncatedProteinNameCount;

            case FixedFASTAFileValues.UpdatedResidueLines:
                return UpdatedResidueLines;

            case FixedFASTAFileValues.DuplicateProteinNamesRenamedCount:
                return DuplicateNameProteinsRenamed;

            case FixedFASTAFileValues.DuplicateProteinSeqsSkippedCount:
                return DuplicateSequenceProteinsSkipped;

            default:
                return 0;
        }
    }
}
/// <summary>
/// Compiled regular expressions used to pick apart protein names that contain multiple references
/// </summary>
/// <remarks>
/// The separator characters supplied to the constructor are placed inside regex character classes.
/// They are escaped first, so that characters with a special meaning inside a character class
/// (backslash, square brackets, caret, dash) are treated as literal separators
/// </remarks>
private class ProteinNameTruncationRegex
{
    /// <summary>
    /// Options shared by every Regex in this class
    /// </summary>
    private const RegexOptions REGEX_OPTIONS = RegexOptions.Singleline | RegexOptions.Compiled;

    /// <summary>
    /// Extracts IPI:IPI00048500.11 from IPI:IPI00048500.11|ref|23848934 <br />
    /// Second matching group contains everything after the first vertical bar
    /// </summary>
    public Regex MatchIPI { get; }

    /// <summary>
    /// Extracts gi|169602219 from gi|169602219|ref|XP_001794531.1| <br />
    /// Second matching group contains everything after the second vertical bar
    /// </summary>
    public Regex MatchGI { get; }

    /// <summary>
    /// Extracts jgi|Batde5|90624 from jgi|Batde5|90624|GP3.061830 <br />
    /// Second matching group contains everything after the third vertical bar
    /// </summary>
    public Regex MatchJGI { get; }

    /// <summary>
    /// Extracts bob|234384 from bob|234384|ref|483293, or bob|845832 from bob|845832;ref|384923 <br />
    /// Second matching group contains everything after the separator following the first matched group
    /// </summary>
    public Regex MatchGeneric { get; }

    /// <summary>
    /// Matches jgi|Batde5|23435 ; it requires that there be a number after the second bar <br />
    /// Contains no matching groups
    /// </summary>
    public Regex MatchJGIBaseAndID { get; }

    /// <summary>
    /// Extracts the separator set following the first separator in the string
    /// </summary>
    public Regex MatchDoubleBarOrColonAndBar { get; }

    /// <summary>
    /// Constructor
    /// </summary>
    /// <param name="proteinNameFirstRefSepChars">Characters that separate the first reference's prefix from its ID</param>
    /// <param name="proteinNameSubsequentRefSepChars">Characters that separate the first reference from subsequent references</param>
    public ProteinNameTruncationRegex(char[] proteinNameFirstRefSepChars, char[] proteinNameSubsequentRefSepChars)
    {
        // Escape the separator characters once, since they are spliced into several character classes below
        var firstRefSepClass = EscapeForCharacterClass(proteinNameFirstRefSepChars);
        var subsequentRefSepClass = EscapeForCharacterClass(proteinNameSubsequentRefSepChars);

        // Note that each of the first four RegEx tests contains two groups with captured text

        // The following will extract IPI:IPI00048500.11 from IPI:IPI00048500.11|ref|23848934
        MatchIPI = new Regex(@"^(IPI:IPI[\w.]{2,})\|(.+)", REGEX_OPTIONS);

        // The following will extract gi|169602219 from gi|169602219|ref|XP_001794531.1|
        MatchGI = new Regex(@"^(gi\|\d+)\|(.+)", REGEX_OPTIONS);

        // The following will extract jgi|Batde5|90624 from jgi|Batde5|90624|GP3.061830
        MatchJGI = new Regex(@"^(jgi\|[^|]+\|[^|]+)\|(.+)", REGEX_OPTIONS);

        // The following will extract bob|234384 from bob|234384|ref|483293
        // or bob|845832 from bob|845832;ref|384923
        MatchGeneric = new Regex(
            @"^(\w{2,}[" + firstRefSepClass + @"][\w\d._]{2,})[" + subsequentRefSepClass + "](.+)",
            REGEX_OPTIONS);

        // The following matches jgi|Batde5|23435 ; it requires that there be a number after the second bar
        MatchJGIBaseAndID = new Regex(@"^jgi\|[^|]+\|\d+", REGEX_OPTIONS);

        // Note that this RegEx contains a group with captured text:
        MatchDoubleBarOrColonAndBar = new Regex(
            "[" + firstRefSepClass + "][^" + subsequentRefSepClass + "]*([" + subsequentRefSepClass + "])",
            REGEX_OPTIONS);
    }

    /// <summary>
    /// Convert a list of characters into text that can be placed between [ and ] in a regular expression
    /// </summary>
    /// <remarks>
    /// Without escaping, a dash between two characters defines a range, a leading caret negates the class,
    /// a closing bracket ends the class early, and a backslash escapes the character that follows it
    /// </remarks>
    /// <param name="chars">Characters to include; null is treated as an empty list</param>
    /// <returns>The characters, with a backslash added before each one that is special inside a character class</returns>
    private static string EscapeForCharacterClass(char[] chars)
    {
        if (chars == null || chars.Length == 0)
            return string.Empty;

        var escaped = new StringBuilder(chars.Length * 2);

        foreach (var item in chars)
        {
            if (item == '\\' || item == ']' || item == '[' || item == '^' || item == '-')
                escaped.Append('\\');

            escaped.Append(item);
        }

        return escaped.ToString();
    }
}
/// <summary>
/// FASTA file path being examined
/// </summary>
/// <remarks>Used by CustomValidateFastaFiles</remarks>
protected string mFastaFilePath;
/// <summary>
/// Fixed FASTA stats (counts of truncated names, updated residue lines, skipped or renamed duplicates, etc.)
/// </summary>
private readonly FixedFastaStats mFixedFastaStats = new();
/// <summary>
/// Error messages, plus occurrence counts by message code
/// </summary>
private readonly MsgInfosAndSummary mFileErrors = new();
/// <summary>
/// Warning messages, plus occurrence counts by message code
/// </summary>
private readonly MsgInfosAndSummary mFileWarnings = new();
// The four lists below hold rule definitions; presumably one list per RuleTypes member (TODO confirm)
private readonly List<RuleDefinition> mHeaderLineRules = new();
private readonly List<RuleDefinition> mProteinNameRules = new();
private readonly List<RuleDefinition> mProteinDescriptionRules = new();
private readonly List<RuleDefinition> mProteinSequenceRules = new();
/// <summary>
/// Custom rule ID counter; starts at CUSTOM_RULE_ID_START
/// </summary>
/// <remarks>
/// NOTE(review): presumably the source of the auto-assigned RuleDefinition.CustomRuleID values - confirm
/// </remarks>
private int mMasterCustomRuleID = CUSTOM_RULE_ID_START;
// Separator characters; the related defaults are DEFAULT_PROTEIN_NAME_FIRST_REF_SEP_CHARS and
// DEFAULT_PROTEIN_NAME_SUBSEQUENT_REF_SEP_CHARS (these fields are assigned outside this part of the file)
private char[] mProteinNameFirstRefSepChars;
private char[] mProteinNameSubsequentRefSepChars;
/// <summary>
/// <para>
/// This array has a space and a non-breaking space (\x00a0)
/// </para>
/// <para>
/// \xfffd (the Unicode replacement character) is included to match a non-breaking space
/// when reading an input file that does not specify an encoding using a BOM at the start of the file
/// </para>
/// </summary>
/// <remarks>
/// It should not include a tab since we check for that separately
/// </remarks>
private readonly char[] mProteinAccessionSepChars = { ' ', '\x00a0', '\xfffd' };
private bool mAddMissingLinefeedAtEOF;
private bool mCheckForDuplicateProteinNames;
/// <summary>
/// Check for duplicate protein sequences
/// </summary>
/// <remarks>
/// This will be set to True if mSaveProteinSequenceHashInfoFiles is True
/// or mFixedFastaOptions.ConsolidateProteinsWithDuplicateSeqs is True
/// </remarks>
private bool mCheckForDuplicateProteinSequences;
/// <summary>
/// Maximum number of errors per type to track
/// </summary>
private int mMaximumFileErrorsToTrack;
private int mMinimumProteinNameLength;
private int mMaximumProteinNameLength;
private int mMaximumResiduesPerLine;
/// <summary>
/// Options used when mGenerateFixedFastaFile is True
/// </summary>
private readonly FixedFastaOptions mFixedFastaOptions = new();
private bool mOutputToStatsFile;
private string mStatsFilePath;
private bool mGenerateFixedFastaFile;
private bool mSaveProteinSequenceHashInfoFiles;
/// <summary>
/// When true, create a text file that will contain the protein name and sequence hash for each protein.
/// This option will not store protein names and/or hashes in memory, and is thus useful for processing
/// huge .Fasta files to determine duplicate proteins.
/// </summary>
private bool mSaveBasicProteinHashInfoFile;
private bool mAllowAsteriskInResidues;
private bool mAllowDashInResidues;
private bool mAllowAllSymbolsInProteinNames;
private bool mWarnBlankLinesBetweenProteins;
private bool mWarnLineStartsWithSpace;
private bool mNormalizeFileLineEndCharacters;
/// <summary>
/// The number of characters at the start of key strings to use when adding items to NestedStringDictionary instances
/// </summary>
/// <remarks>
/// If this value is too short, all of the items added to the NestedStringDictionary instance
/// will be tracked by the same dictionary, which could result in a dictionary surpassing the 2 GB boundary
/// </remarks>
private byte mProteinNameSpannerCharLength = 1;
/// <summary>
/// Error code specific to this class
/// </summary>
private ValidateFastaFileErrorCodes mLocalErrorCode;
// NOTE(review): the next three fields are assigned and used outside this part of the file;
// judging by their names they track memory usage logging and an error message from a sort utility - confirm
private MemoryUsageLogger mMemoryUsageLogger;
private float mProcessMemoryUsageMBAtStart;
private string mSortUtilityErrorMessage;
/// <summary>
/// List of temporary file names; the list is created in the default constructor
/// </summary>
private readonly List<string> mTempFilesToDelete;
// NOTE(review): presumably the rule ID of the rule that looks for the Unicode replacement character (\xfffd) - confirm
private int mUnicodeReplaceCharacterRuleId;
/// <summary>
/// Set a processing option
/// </summary>
/// <remarks>Be sure to call SetDefaultRules() after setting all of the options</remarks>
/// <param name="switchName"></param>
/// <param name="state"></param>
public void SetOptionSwitch(SwitchOptions switchName, bool state)
{
switch (switchName)
{
case SwitchOptions.AddMissingLineFeedAtEOF:
mAddMissingLinefeedAtEOF = state;