-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstatsFasta.py
More file actions
executable file
·65 lines (52 loc) · 1.94 KB
/
statsFasta.py
File metadata and controls
executable file
·65 lines (52 loc) · 1.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/python
# Jun-Hoe, Lee (2020)
# moidified from countGCperGenome.py to output more basic stats for a fasta file
# stats are numbe of seqs, average seq Length, longestSeq, shortestSeq, Gc%
# Usage: python countGCperGenome.py [input fasta file] [make output file]
import sys
import os
from Bio.SeqUtils import GC
from Bio import SeqIO
import numpy as np
##################################3
# function to turn on verbose mode
verbose = False
if verbose: # function turned on if verbose called
def verboseprint(arg, *args): # *args to output any/all given arguments
print("-v ", arg, ":", args)
else:
verboseprint = lambda *a: None # do-nothing function
####################################
# A. get input file
try:
fasta_file = sys.argv[1]
verboseprint("fasta_file", fasta_file)
except:
print("Incorrect input parameters")
print("python countGCperGenome.py [input fasta file] [yes/no make output file]")
sys.exit(1)
seqs = [seq for seq in SeqIO.parse(fasta_file, "fasta")]
lengths_list = [len(i.seq) for i in seqs]
# number of seqs
numSeqs = len(lengths_list)
# get average length
avgSeqLength = np.average(lengths_list)
# get longest and shortest seq
maxSeqLength = max(lengths_list)
minSeqLength = min(lengths_list)
## calculate GC%
gc_percent = np.average([GC(i.seq) for i in seqs])
# total sequnce length (not used for now)
total_size = np.sum(lengths_list)
print("fasta_file\tNumSeqs\tAvgSeqLength\tMaxSeqLength\tMinSeqLength\tGC_percent")
writeLine = "%s\t%d\t%0.2f\t%d\t%d\t%0.2f\n" % \
(fasta_file, numSeqs, avgSeqLength, maxSeqLength, minSeqLength, gc_percent)
print(writeLine)
# check whether to create output file
if len(sys.argv) > 2:
getInputFileName = os.path.basename(fasta_file)
outputFileName = "out_" + getInputFileName
verboseprint("make outputFileName", outputFileName)
with open(outputFileName, 'w') as outputFile:
outputFile.write(writeLine)
outputFile.close()