-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathsplit_in_pairs.py
94 lines (82 loc) · 2.87 KB
/
split_in_pairs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# Ad hoc script to fix problem of legacy illumina files
# read 1 and read 2 are in the same file,
# read 1 first, but in random order and with orphans
# the script pairs up the reads into two separate outputfiles
# restoring the order
# orphans go into a separate file
# run as python split_in_pairs.py file.fastq
import sys, os
import gzip
from Bio.SeqIO.QualityIO import FastqGeneralIterator
def write_pair(readID, read1, read2, fh_out_R1, fh_out_R2):
"""Write paired reads to two files"""
fh_out_R1.write("@%s\n%s\n+\n%s\n" % (readID + "/1", read1[0], read1[1]))
fh_out_R2.write("@%s\n%s\n+\n%s\n" % (readID + "/2", read2[0], read2[1]))
pass
def write_orph(readID, read, fh_out_orphan):
"""Write orphan read to separate file"""
fh_out_orphan.write("@%s\n%s\n+\n%s\n" % (readID, read[0], read[1]))
pass
fastqIn = sys.argv[1]
# create output filenames
# Input: sometext.fq.gz or sometext.fq
# Out read 1: sometext.R1.fq
# Out read 1: sometext.R2.fq
# Out orphan reads: sometext.orhan.fq
fileName = os.path.basename(fastqIn.rstrip(".gz"))
fileOutname, ext = os.path.splitext(fileName)
fastqOutR1 = "%s.R1%s" %(fileOutname, ext)
fastqOutR2 = "%s.R2%s" %(fileOutname, ext)
fastqOutOrph = "%s.orphan%s" %(fileOutname, ext)
# determine whether gzipped
# open file appropriately
if fastqIn.endswith(".gz"):
fh_in = gzip.open(fastqIn, 'r')
else:
fh_in = open(fastqIn, 'r')
# dictionary to keep encountered reads
reads1 = {}
reads2 = {}
# open output files
fh_out_R1 = open(fastqOutR1, 'wb')
fh_out_R2 = open(fastqOutR2, 'wb')
# parse input fastq file
for title, seq, qual in FastqGeneralIterator(fh_in):
pairID = title[-2:]
readID = title[:-2]
if pairID == "/1":
# read 1
# seen? then write as paired
if readID in reads2:
read1 = [seq, qual]
read2 = reads2[readID]
write_pair(readID, read1, read2, fh_out_R1, fh_out_R2)
# remove from dictionary
del reads2[readID]
else:
# add to dictionary
reads1[readID] = [seq, qual]
elif pairID == "/2":
# read 2
# seen? then write as paired
if readID in reads1:
read1 = reads1[readID]
read2 = [seq, qual]
write_pair(readID, read1, read2, fh_out_R1, fh_out_R2)
# remove from dictionary
del reads1[readID]
else:
# add to dictionary
reads2[readID] = [seq, qual]
else:
raise Exception("Read identifier does not end in /1 or /2: %s" %title)
fh_in.close()
fh_out_R1.close()
fh_out_R2.close()
# print out orphans, i.e. the remaining reads in the dictionaries
fh_out_orphan = open(fastqOutOrph, 'wb')
for readID in reads1:
write_orph(readID + "/1", reads1[readID], fh_out_orphan)
for readID in reads2:
write_orph(readID + "/2", reads2[readID], fh_out_orphan)
fh_out_orphan.close()