-
Notifications
You must be signed in to change notification settings - Fork 2
/
pairwise_similarity_from_fasta.py
39 lines (35 loc) · 1.65 KB
/
pairwise_similarity_from_fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#### Build a similarity matrix from a given fastq file,
#### using pairwise2 module in Biopython, in order to group the sequences
#### into clusters. Each fastq file is only compared with one
#### another at a time. Only suitable for small sequences.
#!/usr/bin/python
import os, sys, argparse, itertools
import numpy as np
from Bio import pairwise2, SeqIO
parser=argparse.ArgumentParser()
parser.add_argument('input_fastq',metavar='input_fastq', help='Input the fastq read file')
args=parser.parse_args()
#### read the fastq file into a python list
def fastq_to_list(file):
fastqfile=open(file,'rU')
return list(SeqIO.parse(fastqfile,'fasta'))
#### perform pair-wise similarity calculation and build the similarity matrix
def seq_handshake(seqio_list):
seq_names=[]
sml_matrix=np.zeros(shape=(len(seqio_list), len(seqio_list)))
for i in range(len(seqio_list)):
seq_names.append(seqio_list[i].id)
for k in range(i+1, len(seqio_list)):
effective_length=min(len(seqio_list[i]),len(seqio_list[k]))
#### 2 points for perfect match, -1 for mismatch, -3 for gap opening, -1 for gap extension.
#### only_one_alignment and score_only spped up the calculation dramatically
#### gaps at the end of both sequences are not penalized to cope with large spam of length
sml_matrix[i,k]=pairwise2.align.globalms(seqio_list[i].seq,seqio_list[k].seq,2,-1,-3,-1,one_alignment_only=1,score_only=1,
penalize_end_gaps=0)/float(effective_length)
return sml_matrix, seq_names
####
similarity_matrix,seqname_list=seq_handshake(fastq_to_list(args.input_fastq))
f=open('matrix.csv','w')
f.write(','.join(seqname_list)+'\n')
np.savetxt(f,similarity_matrix,delimiter=',')
f.close()