-
Notifications
You must be signed in to change notification settings - Fork 3
/
do_step1.psh
37 lines (27 loc) · 1.11 KB
/
do_step1.psh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/bin/bash
gene=$1
N=`grep '^>' cand_fa/${gene}_A.fa | wc -l` # sequence number
for type in A C G T
do
base="${gene}_${type}"
echo $base
grep 'XM:i:0' ./cand_sam/${base}.sam | cut -f 1-4,10 > ./cand_sam/${base}_Nmm.sam # no mismatch
cat ./cand_sam/${base}_Nmm.sam | cut -f 1,3 |sort -u|cut -f 1|sort -n|uniq -c > ${base}.cand1 # chromsome_number
cat ./cand_sam/${base}_Nmm.sam | sort -k 1n -u | cut -f 1,3 > $base.chr
# for i in `seq $N`
# do
# if [[ `grep -w '^$i' $base.chr | wc -l` -eq 0 ]]
# then echo -e "$i\t*" >> $base.chr
# fi
# done
cat <(cut -f 1 $base.chr) <(seq $N) | sort | uniq -u >> $base.chr
sort -k 1n -o $base.chr $base.chr
awk '{if($2=="") {$2="*"} ; print}' OFS="\t" $base.chr > $base.chrs
mv $base.chrs $base.chr
# warning: only one will be shown if many chromosomes
awk '$1==1 {print $2}' ${base}.cand1 > ${base}.st1 # 1 chr (NOT including *)
#awk '$3=="*" {print $1}' ./cand_sam/${base}.sam > ${base}.st2 # the seq IDs which do not match
#sort ${base}.st1 ${base}.st2 | uniq -u > ${base}.st3 # specific for 1 chr without *
cp ${base}.st1 ${base}.st3
Rscript do_chr.r "$base"
done