-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_sequences.py
30 lines (26 loc) · 1.02 KB
/
get_sequences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from Bio import Entrez, SeqIO
import pandas as pd
Entrez.email = ""
chr_accession = pd.read_csv('chr_accession_hg19.csv')
peaks = pd.read_csv('regions-of-interest.csv')
peaks = pd.merge(peaks, chr_accession, on="chr")
peaks = peaks.sort_values(by=['chr','start'])
peaks["sequence"] = ""
peaks_list = []
for i in range(len(peaks)):
handle = Entrez.efetch(db="nucleotide",
id=peaks.loc[i, "accession"],
rettype="fasta",
strand=1,
seq_start=peaks.loc[i, "start"],
seq_stop=peaks.loc[i, "end"])
record = SeqIO.read(handle, "fasta")
handle.close()
peaks.loc[i, "sequence"] = record.seq
header = "\n>" + str(peaks.loc[i, "chr"]) + ":" + str(peaks.loc[i, "start"]) + "-" + str(peaks.loc[i, "end"])
peaks_list.append(header)
peaks_list.append(record.seq)
#peaks.to_csv('peak_sequences.txt', sep='\t')
with open('sequences.fasta', 'w') as f:
for item in peaks_list:
f.write("%s\n" % item)