-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathrun_n_optimize_basta.sh
More file actions
executable file
·105 lines (101 loc) · 3.24 KB
/
run_n_optimize_basta.sh
File metadata and controls
executable file
·105 lines (101 loc) · 3.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env bash
# Run a grid search for the best parameters of a `basta sequence` call,
# using a mock community with a known list of species.
#
# Positional arguments:
#   $1  mapping handed to `basta sequence` (gb or a custom one)
#   $2  BLAST result file
#   $3  output prefix used in the command written to best.param
#   $4  basta config file
#   $5  file with the list of true labels
#   $6  path of the basta script (executed with python2)
#set -e
## Commands
# OLDIFS is kept because other parts of the script may still read it.
OLDIFS=${IFS}
basta_db=$1
blast=$2
out_prefix=$3
config=$4
true_file=$5
basta=$6
## Set the grid
evalues=( 1E-80 1E-40 1E-20 1E-10 1E-5 1E-2 )
p_ids=( 70 80 90 95 100 )
m_hits=( {1..10} )
p_hits=( 60 70 80 90 99 )
n_hits=( {0..10} )
# Fill the global array `perms` with every
# "evalue_pid_mhit_phit_nhit" combination of the grid above.
# The right-most field varies fastest, i.e. the same order a brace expansion
# {evalues}_{p_ids}_{m_hits}_{p_hits}_{n_hits} would produce.
build_perms() {
  local e i m p n
  perms=()
  for e in "${evalues[@]}"; do
    for i in "${p_ids[@]}"; do
      for m in "${m_hits[@]}"; do
        for p in "${p_hits[@]}"; do
          for n in "${n_hits[@]}"; do
            perms+=( "${e}_${i}_${m}_${p}_${n}" )
          done
        done
      done
    done
  done
}
build_perms
# Hash table for results; currently only referenced by commented-out code.
declare -A results
## Set functions
prog() {
# Progress bar, courtesy of ilkkachu (stackoverflow)
# Arguments:
#   $1     percentage done (integer)
#   $2...  optional message printed after the percentage
# Outputs:
#   Redraws one line on stdout: carriage return, clear-to-end-of-line, an
#   80-column bar filled with dots, the percentage and the message.
#   No trailing newline is printed, so successive calls overwrite each other.
local w=80 p=$1 dots
shift
# Build a run of p% of the bar width: pad an empty string with spaces,
# then turn every space into a dot.
printf -v dots "%*s" "$(( p * w / 100 ))" ""
dots=${dots// /.}
printf "\r\e[K|%-*s| %3d %% %s" "$w" "$dots" "$p" "$*"
}
run_basta(){
# Run basta once with one parameter combination, score the predicted labels
# against the true labels and append the score to results.basta.
# Arguments:
# 1) "_"-delimited string of basta parameters (evalue_pid_mhit_phit_nhit)
# 2) blast file
# 3) mapping (gb or the one you constructed)
# 4) config file
# 5) File with list of True labels
# 6) workerid (optional): suffix of the temporary basta output file
# Globals:
#   basta (read)    - path of the basta script run with python2
#   intF1 (written) - integer score (0-100) of this run
# Outputs:
#   Appends "<parameter string>\t<score>" to results.basta
# Returns:
#   1 when the true-label file is not readable, 0 otherwise
local -a params
# Split on "_" for this read only; the caller's IFS is never touched.
IFS=_ read -r -a params <<< "${1}"
local evalue="${params[0]}"
local p_id="${params[1]}"
local m_hit="${params[2]}"
local p_hit="${params[3]}"
local n_hit="${params[4]}"
local blast=${2}
local mapping=${3}
local config_file=${4}
local workerid=${6:-}
local out_file="basta${workerid}.out"
local nl=$'\n'
local true_labels pred=""
if [[ ! -r ${5} ]]; then
printf 'run_basta: cannot read true label file: %s\n' "${5}" >&2
return 1
fi
# Sort in the C locale so that comm below sees consistently ordered input.
true_labels=$(sed -e '/^$/d' -- "${5}" | LC_ALL=C sort -u)
# stderr of basta is left suppressed, exactly as before.
python2 "${basta}" sequence "${blast}" "${out_file}" "${mapping}" \
-e "${evalue}" -i "${p_id}" -m "${m_hit}" -n "${n_hit}" -p "${p_hit}" \
-c "${config_file}" 2>/dev/null
# Extract metrics: column 2 holds the ";"-separated lineage; everything from
# the 7th level on is taken as the predicted label, "_" is shown as a space.
# The file read here is the one basta was told to write (worker specific).
if [[ -f ${out_file} ]]; then
pred=$(cut -f 2 -- "${out_file}" | cut -d ';' -f 7- \
| sed -e '/^$/d' -e 's/;//' -e 's/_/ /g' | LC_ALL=C sort -u)
fi
# Feed comm newline-terminated lists; an empty list must yield zero lines,
# otherwise a lone blank line would be counted as a label.
local truth_in="${true_labels:+${true_labels}${nl}}"
local pred_in="${pred:+${pred}${nl}}"
local false_negatives false_positives true_positives
false_negatives=$(LC_ALL=C comm -23 <(printf '%s' "${truth_in}") <(printf '%s' "${pred_in}") | wc -l)
false_positives=$(LC_ALL=C comm -13 <(printf '%s' "${truth_in}") <(printf '%s' "${pred_in}") | wc -l)
true_positives=$(LC_ALL=C comm -12 <(printf '%s' "${truth_in}") <(printf '%s' "${pred_in}") | wc -l)
# NOTE(review): TP/(TP+FN+FP) is a Jaccard-style score, not the textbook
# F1 = 2TP/(2TP+FP+FN). The formula is kept unchanged on purpose; the two
# are monotonically related, so they order parameter sets the same way
# before integer truncation.
local denom=$(( true_positives + false_negatives + false_positives ))
if (( denom > 0 )); then
intF1=$(( (true_positives * 100) / denom ))
else
# Nothing expected and nothing predicted: avoid a division by zero.
intF1=0
fi
# Record the parameter string this call was given ($1), not a caller global.
printf '%s\t%s\n' "${1}" "${intF1}" >> results.basta
rm -f -- "${out_file}"
}
# Make run_basta available to child bash processes.
export -f run_basta
# Start from an empty results file (truncate it if it already exists)
# instead of seeding it with a blank line.
: > results.basta
total="${#perms[@]}"
counter=0
# Evaluate every parameter combination, redrawing the progress bar each time.
for param in "${perms[@]}"
do
counter=$(( counter + 1 ))
prog $(( (counter * 100) / total ))
run_basta "${param}" "${blast}" "${basta_db}" "${config}" "${true_file}"
# echo "out F1 ${intF1}"
# results["${param}"]="${intF1}"
# unset intF1
done
sort_it(){
# Pick the best-scoring parameter set from results.basta and write the
# matching basta command line to best.param.
# Globals:
#   blast, out_prefix, basta_db (read) - copied verbatim into the command
# Inputs:
#   results.basta - "<params>\t<score>" lines; blank lines are ignored
# Outputs:
#   best.param - two lines: the basta call ending in " \" and the options
#                "-e .. -i .. -m .. -p .. -n .." of the winning row
# Returns:
#   0 on success, 1 when no usable row could be read from results.basta
local best_params
local -a p
# Keep the first row holding the highest score (ties: earliest row wins).
best_params=$(awk -F '\t' '
NF >= 2 && $1 != "" {
if (!seen || $2 + 0 > top) { top = $2 + 0; winner = $1; seen = 1 }
}
END { if (seen) print winner }
' results.basta) || return 1
if [[ -z ${best_params} ]]; then
printf 'sort_it: no results found in results.basta\n' >&2
return 1
fi
# The parameter string is evalue_pid_mhit_phit_nhit.
IFS=_ read -r -a p <<< "${best_params}"
# Values are passed as printf arguments, never spliced into code, so paths
# containing quotes, backslashes or "$" are written out untouched.
printf 'basta sequence %s %s.out %s \\\n-e %s -i %s -m %s -p %s -n %s\n' \
"${blast}" "${out_prefix}" "${basta_db}" \
"${p[0]}" "${p[1]}" "${p[2]}" "${p[3]}" "${p[4]}" > best.param
}
# Select the best parameter set from results.basta and write best.param.
sort_it
# --- Dead code below: appears to be an earlier pure-bash way of dumping the
# `results` hash table and picking the best row with sort; nothing here runs.
# NOTE(review): the last line maps fields 3 and 4 of the parameter string to
# -n and -p, while the parameter string is built as
# evalue_pid_mhit_phit_nhit (field 3 = p_hit, field 4 = n_hit) - confirm
# before ever re-enabling it.
#for k in "${!results[@]}"
# do
# echo "${k} ${results[$k]}" >> results.basta
# done
#IFS="_"; read -ra best <<< `cat results.basta| sort -rn -k2| head -n 1 | cut -f 1 -d $'\t'`
#IFS=${OLDIFS}
#echo "basta sequence "${blast}" ${out_prefix}.out ${basta_db} \\" > best.param
#echo "-e ${best[0]} -i ${best[1]} -m ${best[2]} -n ${best[3]} -p ${best[4]}" >> best.param