-
Notifications
You must be signed in to change notification settings - Fork 0
/
learnAlphabet.sh
108 lines (84 loc) · 3.38 KB
/
learnAlphabet.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/bin/bash -x
#
# Usage: learnAlphabet.sh K TRIES PDBS_TRAIN PDBS_VAL OUTPUT_DIR
#
#   K           value handed to train_vqvae.py / encode_pdbs.py (e.g. 20)
#   TRIES       number of seeds to try, seeds run from 0 to TRIES-1 (e.g. 100)
#   PDBS_TRAIN  file fed to encode_pdbs.py on stdin for training
#               (e.g. data/pdbs_train.txt)
#   PDBS_VAL    file handed to run-benchmark.sh (e.g. data/pdbs_val.txt)
#   OUTPUT_DIR  directory the alphabet and logs are saved to (e.g. tmp/)
#
# Scratch files are written to ./tmp relative to the current directory.
if [ "$#" -ne 5 ]; then
  # Diagnostics belong on stderr, and a usage error must not look like
  # success to the caller: a bare `exit` would return echo's status (0).
  echo "Illegal number of parameters" >&2
  echo "Usage: $0 K TRIES PDBS_TRAIN PDBS_VAL OUTPUT_DIR" >&2
  exit 1
fi

K=$1           # 20
TRIES=$2       # 100
PDBS_TRAIN=$3  # data/pdbs_train.txt
PDBS_VAL=$4    # data/pdbs_val.txt
OUTPUT_DIR=$5  # save alphabet here # tmp/

mkdir -p tmp
# Quoted so an output directory containing spaces stays a single argument.
mkdir -p -- "$OUTPUT_DIR"
## Fetch PDBs
#if [ ! -d tmp/pdb ]; then
# curl https://wwwuser.gwdg.de/~compbiol/foldseek/scop40pdb.tar.gz | tar -xz -C tmp
#fi
#
## Compile ssw_test
#if [ ! -f tmp/ssw_test ]; then
# git clone --depth 1 https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library tmp/ssw
# (cd tmp/ssw/src && make)
# cp tmp/ssw/src/ssw_test tmp/ssw_test
#fi
#
## Filter alignments for training
#awk 'FNR==NR {pdbs[$1]=1; next}
# ($1 in pdbs) && ($2 in pdbs) {print $1,$2,$10}' \
# $PDBS_TRAIN training/data/tmaln-06.out > tmp/pairfile_train.out
# Benchmark virtual center positions
#for D in {1,1.5,2,2.5,3}; do
# for THETA in {45..360..45}; do
# for TAU in {0..180..45}; do
# echo -n $THETA $TAU $D >> "$OUTPUT_DIR/log.txt"
# # create alphabet and benchmark
# done
# done
#done
# Virtual center parameters; forwarded to encode_pdbs.py (--virt) and to
# run-benchmark.sh.
THETA=270
TAU=0
D=2

## Create training data
#./create_vqvae_training_data.py \
#    tmp/pdb tmp/pairfile_train.out $THETA $TAU $D tmp/vaevq_training_data.npy

# Train and benchmark one model per seed (0 .. TRIES-1). Every iteration
# appends to $OUTPUT_DIR/log.txt: the seed, field 2 of each train_vqvae.py
# output line matching "opt_loss=", then whatever run-benchmark.sh prints.
# NOTE(review): the log is only ever appended to, so lines from earlier runs
# in the same OUTPUT_DIR remain in it -- confirm this is intended.
for ((seed = 0; seed < TRIES; seed++)); do
  printf '%s ' "$seed" >> "$OUTPUT_DIR/log.txt"

  ./train_vqvae.py "$seed" tmp/vaevq_training_data.npy "$OUTPUT_DIR" "$K" \
    | awk '/opt_loss=/{printf "%s ", $2}' >> "$OUTPUT_DIR/log.txt"

  # RUN is not set in this script; presumably an optional launcher prefix
  # taken from the environment. It is left unquoted on purpose so that an
  # empty value disappears and a multi-word value splits into words.
  # shellcheck disable=SC2086
  $RUN \
    ./encode_pdbs.py "$OUTPUT_DIR/encoder.pt" "$OUTPUT_DIR/states.txt" "$K" \
    --pdb_dir tmp/pdb --virt "$THETA" "$TAU" "$D" \
    < "$PDBS_TRAIN" > tmp/seqs.csv

  ./create_submat2.py tmp/pairfile_train.out tmp/seqs.csv --mat tmp/sub_score.mat

  ./run-benchmark.sh "$OUTPUT_DIR/encoder.pt" "$OUTPUT_DIR/states.txt" tmp/sub_score.mat \
    "$PDBS_VAL" data/scop_lookup.tsv "$THETA" "$TAU" "$D" X >> "$OUTPUT_DIR/log.txt"
  # sudo bash run-benchmark.sh tmp/encoder.pt tmp/states.txt tmp/sub_score.mat data/pdbs_val.txt data/scop_lookup.tsv 270 0 2 X
done
# Find best seed
# TMalign.rocx => 0.928162 0.662063 0.275436
#
# pick_best_seed LOGFILE
#   Prints field 1 (the seed) of the LOGFILE line with the highest score,
#   where score = mean of fields 3, 4 and 5, each divided by the matching
#   reference value above. Prints nothing when LOGFILE has no lines.
#   On equal scores the earliest line wins.
#
#   The maximum is taken numerically inside awk. Piping the scores through
#   `sort -rk 2,2` compares them as strings, and awk prints small values in
#   "%.6g" form (e.g. 2.07e-05), which then sorts above 0.98 and makes a
#   collapsed run look like the best one.
pick_best_seed() {
  awk '
    {
      score = ($3/0.928162 + $4/0.662063 + $5/0.275436) / 3
      if (NR == 1 || score > best_score) {
        best_score = score
        best_seed = $1
      }
    }
    END { if (NR > 0) print best_seed }
  ' "$1"
}

SEED=$(pick_best_seed "$OUTPUT_DIR/log.txt")
# Create final alphabet: re-run training with the selected seed.
./train_vqvae.py "$SEED" tmp/vaevq_training_data.npy "$OUTPUT_DIR" "$K"

# Create final submat
cp -- "$PDBS_TRAIN" tmp/pdbs_submat.txt

# Keep only the alignment rows whose first two columns are both listed in
# tmp/pdbs_submat.txt; columns 1, 2 and 10 are written out.
awk 'FNR==NR {pdbs[$1]=1; next}
     ($1 in pdbs) && ($2 in pdbs) {print $1,$2,$10}' \
  tmp/pdbs_submat.txt data/tmaln-06.out > tmp/pairfile_submat.out

# RUN is not set in this script; presumably an optional launcher prefix taken
# from the environment, so it is intentionally left unquoted.
# shellcheck disable=SC2086
$RUN ./encode_pdbs.py "$OUTPUT_DIR/encoder.pt" "$OUTPUT_DIR/states.txt" "$K" \
  --pdb_dir tmp/pdb --virt "$THETA" "$TAU" "$D" \
  < tmp/pdbs_submat.txt > "$OUTPUT_DIR/seqs.csv"

# Build the matrix from the sequences that were just encoded with the final
# model. Reading tmp/seqs.csv here would pick up the file left behind by the
# last seed of the search loop whenever OUTPUT_DIR is not tmp.
./create_submat2.py tmp/pairfile_submat.out "$OUTPUT_DIR/seqs.csv" \
  --mat tmp/sub_score.mat --merge_state X \
  | tee tmp/create_submat.log

# Save field 3 of the "assign_invalid_states_to" log line, without a trailing
# newline.
awk '/^assign_invalid_states_to/{printf "%s", $3}' tmp/create_submat.log > "$OUTPUT_DIR/invalid_state.txt"
# Add X to submat
#
# append_x_state MATRIX K
#   Prints MATRIX with an extra state X: " X" is appended to the first line,
#   " 0" to every other line, and a final row "X 0 ... 0" is added.
#   The X row carries K+1 zeros (one per existing state plus one for X
#   itself), so for K=20 the output is identical to the former hard-coded
#   21-zero row. Assumes MATRIX holds exactly K states.
append_x_state() {
  awk -v k="$2" '
    NR == 1 { printf "%s X\n", $0 }
    NR != 1 { printf "%s 0\n", $0 }
    END {
      printf "X"
      for (i = 0; i <= k; i++) printf " 0"
      printf "\n"
    }
  ' "$1"
}

append_x_state tmp/sub_score.mat "$K" > "$OUTPUT_DIR/sub_score.mat"
# Merge X state in seq. file
#awk 'FNR==NR && /^assign_invalid_states_to/{newxstate=$3;nextfile}
# FNR!=NR {gsub(/X/,newxstate,$2);print $1,$2}' \
# tmp/create_submat.log tmp/seqs.csv > tmp/seqs_no_x.csv