forked from albertwcheng/albert-bioinformatics-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fetchSMARTBatch.sh
executable file
·70 lines (49 loc) · 1.77 KB
/
fetchSMARTBatch.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/bin/bash
#
# fetchSMARTBatch.sh - select the genes that still need a SMART lookup, build
# a FASTA file for them and submit it to SMART_batch.pl in chunks via bsub.
#
# Usage:
#   fetchSMARTBatch.sh geneList aceviewProteinFastaRoot smartResultDatabaseRoot checkExists tmpRoot
#
# Arguments:
#   geneList                 file of gene names, separated by whitespace
#   aceviewProteinFastaRoot  directory containing
#                            allGoodProtein.oneline.addedM.SEQ.geneNameAppended
#   smartResultDatabaseRoot  directory handed to SMART_batch.pl as
#                            --outputDirectory; also searched for existing
#                            <gene>.*_SMART_results.txt files
#   checkExists              1 = skip genes that already have a result file;
#                            any other value submits the gene list as-is
#   tmpRoot                  directory under which the work folder "tmp" is
#                            created
#
# External commands invoked by name: joinu.py, cuta.py, seq2fasta.sh,
# SMART_batch.pl, bsub.

if (( $# < 5 )); then
  echo "$0 geneList aceviewProteinFastaRoot smartResultDatabaseRoot checkExists [ 1 or 0 ] tmpRoot [ . ]"
  # Bare exit on purpose: keeps the historical exit status (that of echo).
  exit
fi

geneList=$1
aceviewProteinFastaRoot=$2
smartResultDatabaseRoot=$3
checkExists=$4
tmpRoot=$5

if [[ ! -r $geneList ]]; then
  echo "cannot read gene list: $geneList" >&2
  exit 1
fi

# Load the gene list. read -a splits on any whitespace, like the unquoted
# command substitution used previously, but does not glob-expand the names.
# read returns non-zero at end of file here; the array is still filled.
genes=()
read -r -d '' -a genes < "$geneList"
ngenesLoaded=${#genes[@]}
echo "$ngenesLoaded number of genes loaded"

tmpfolder=$tmpRoot/tmp
mkdir -p "$tmpfolder" || exit 1
todoList=$tmpfolder/genesToGetThisTime.txt

if [[ $checkExists == 1 ]]; then
  # Save time: do not redo genes that already have a result file.
  # Truncate rather than rm, so the list exists even if every gene is skipped.
  : > "$todoList"
  for gene in "${genes[@]}"; do
    # compgen -G succeeds when the pattern matches one OR MORE files; a bare
    # glob inside [ -e ... ] breaks as soon as two files match.
    if compgen -G "$smartResultDatabaseRoot/${gene}.*_SMART_results.txt" > /dev/null; then
      echo "${gene} existed in database, ignored"
    else
      printf '%s\n' "$gene" >> "$todoList"
    fi
  done
else
  cp "$geneList" "$todoList"
fi

genesToGet=()
read -r -d '' -a genesToGet < "$todoList"
ngenesToGet=${#genesToGet[@]}
echo "$ngenesToGet number of genes to get from SMART"

# Nothing left to fetch: stop before building empty inputs and submitting jobs.
if (( ngenesToGet == 0 )); then
  exit 0
fi

# Join the sequences onto the selected gene names.
seqfile=$aceviewProteinFastaRoot/allGoodProtein.oneline.addedM.SEQ.geneNameAppended
joinu.py -w 2 "$todoList" "$seqfile" > "$tmpfolder/genesToGetThisTime.SEQ.geneNameAppended"

# Keep fields 2 and 3 to get back to SEQ form.
cuta.py -f2,3 "$tmpfolder/genesToGetThisTime.SEQ.geneNameAppended" > "$tmpfolder/genesToGetThisTime.SEQ"

# Convert SEQ to FASTA.
seq2fasta.sh "$tmpfolder/genesToGetThisTime.SEQ" "$tmpfolder/genesToGetThisTime.fasta"

# Split the FASTA into chunks of 1000 lines, one job per chunk.
# Chunks from a previous run are discarded first.
chunkDir=$tmpfolder/chunks
rm -rf -- "$chunkDir"
mkdir "$chunkDir" || exit 1
split -l 1000 "$tmpfolder/genesToGetThisTime.fasta" "$chunkDir/geneToGetThisTimeChunk."

# NOTE(review): jobs are submitted from inside the chunk directory, so a
# relative smartResultDatabaseRoot names a different location here than it did
# in the existence check above - pass an absolute path; confirm with callers.
cd "$chunkDir" || exit 1
for chunkjob in geneToGetThisTimeChunk.*; do
  # An unmatched glob stays literal; do not submit a job for it.
  [[ -e $chunkjob ]] || continue
  bsub SMART_batch.pl --inputFile "${chunkjob}" --outputDirectory "$smartResultDatabaseRoot" --includePfam --includeSignalP --includeRepeats --includeDISEMBL
done