-
Notifications
You must be signed in to change notification settings - Fork 1
/
xkcd_downloader.py
64 lines (49 loc) · 2.09 KB
/
xkcd_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""
Downloads the XKCD strips and their hover captions for the comic numbers and ranges you specify.
This code is free to copy, modify, distribute, eat, sleep or swear at. Just credit me somewhere.
Sriram Padmanabhan
screamingwdm2 at gmail dot com
"""
import re,urllib,os,subprocess
# Directory (relative to the current working directory) that receives the
# downloaded images and the caption text files.
DOWNLOAD_DIR = 'xkcd_downloaded'

# HTML entities decoded in hover captions, applied in this order.
# '&amp;' must stay last: decoding it first would turn '&amp;lt;' into
# '&lt;' and then into '<', i.e. unescape the text twice.
# NOTE(review): the entity spellings are reconstructed (the copy of the
# script under review had them already decoded) -- confirm against the
# markup xkcd actually serves.
_CAPTION_ENTITIES = (
    ('&#39;', "'"),
    ('&quot;', '"'),
    ('&lt;', '<'),
    ('&gt;', '>'),
    ('&amp;', '&'),
)


def parse_comic_ranges(spec):
    """Expand a selection such as '55,630,666-999,1024' into comic numbers.

    Every 'A-B' item contributes A..B inclusive and every bare number
    contributes itself; anything else in the string is ignored.  A reversed
    range ('9-3') contributes nothing.  Order and duplicates are preserved.

    Returns a list of ints (empty when no number is present).
    """
    comics = []
    for first, last, single in re.findall(r'(\d+)-(\d+)|(\d+)', spec):
        # The alternation fills either the (first, last) pair or `single`.
        if single:
            comics.append(int(single))
        else:
            comics.extend(range(int(first), int(last) + 1))
    return comics


def unescape_transcript(text):
    """Return `text` with the entities in _CAPTION_ENTITIES decoded once."""
    for entity, char in _CAPTION_ENTITIES:
        text = text.replace(entity, char)
    return text


def extract_comic(page):
    """Scrape one comic page.

    Returns a tuple (image URL, image base name, decoded hover caption), or
    None when any of the three patterns does not match the page.
    """
    url_match = re.search(
        r'Image URL \(for hotlinking/embedding\): (.*png)', page)
    if url_match is None:
        return None
    img_url = url_match.group(1)
    name_match = re.search(r'/(\w*).png', img_url)
    caption_match = re.search(
        r'http://imgs.xkcd.com/comics/.*\.png" title="(.*)" alt', page)
    if name_match is None or caption_match is None:
        return None
    return (img_url,
            name_match.group(1),
            unescape_transcript(caption_match.group(1)))


def main():
    """Interactive session: ask for comic numbers, save captions, fetch images.

    Targets Python 2 (urllib.urlopen, raw_input).  Captions are written to
    DOWNLOAD_DIR/<image name>.txt; the images themselves are fetched by a
    single wget invocation at the end.
    """
    front_page = urllib.urlopen("http://www.xkcd.org").read()
    latest = re.search(
        r'Permanent link to this comic: http://xkcd.com/(.*)/<br.*',
        front_page)
    # The latest number is informational only, so a changed front page must
    # not prevent the download from going ahead.
    if latest is not None and latest.group(1).isdigit():
        print('Latest comic number is %d' % int(latest.group(1)))
    else:
        print('Could not determine the latest comic number')

    rangeinput = raw_input(
        '\nEnter comics to download (eg: "55,630,666-999,1024": ')
    comiclist = parse_comic_ranges(rangeinput)

    if not os.path.isdir(DOWNLOAD_DIR):
        os.makedirs(DOWNLOAD_DIR)

    url_error = []    # comic numbers that could not be fetched or parsed
    image_urls = []   # image URLs handed to wget after the loop
    for comic in comiclist:
        comic_url = "http://xkcd.com/" + str(comic)
        print('Opening  %s ...' % comic_url)
        try:
            page = urllib.urlopen(comic_url).read()
        except IOError:
            # Network failure for this comic: record it and keep going.
            url_error.append(comic)
            continue
        found = extract_comic(page)
        if found is None:
            url_error.append(comic)
            continue
        img_url, img_name, caption = found
        with open(os.path.join(DOWNLOAD_DIR, img_name + '.txt'), 'w') as ftext:
            ftext.write(caption)
        image_urls.append(img_url)

    if image_urls:
        # Argument list + cwd instead of a shell string: URLs scraped from
        # the page are never interpreted by a shell.
        try:
            subprocess.call(['wget', '-nv', '-nc'] + image_urls,
                            cwd=DOWNLOAD_DIR)
        except OSError as err:
            print('Could not run wget: %s' % err)
    if url_error:
        print('Could not fetch comics numbered:  %s' % (url_error,))


if __name__ == '__main__':
    main()