-
Notifications
You must be signed in to change notification settings - Fork 0
/
dailymail.py
77 lines (68 loc) · 2.54 KB
/
dailymail.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import datetime
import json
import time
import urllib
import urllib.request

from bs4 import BeautifulSoup
from pymongo import MongoClient
# Connect to a local MongoDB instance; all scraped articles are indexed in
# the 'articles' collection of the 'news' database.
client = MongoClient('localhost', 27017)
db = client.news
articles = db.articles
def fetch_page(url):
    """Download one Daily Mail page and return its raw HTML.

    Parameters:
        url: site-relative path (e.g. '/news/...'); the host is prepended here.

    Returns:
        The response body as bytes (BeautifulSoup accepts bytes directly).
    """
    full_url = 'http://www.dailymail.co.uk' + url
    print(full_url)  # progress logging
    # Context manager closes the connection even on error (the original
    # py2 urllib.urlopen leaked the response object).
    with urllib.request.urlopen(full_url) as response:
        return response.read()
def fetch_article_list(url, d):
    """Index every relevant article link found on a daily sitemap-archive page.

    Parameters:
        url: site-relative path of the archive page (one page per day).
        d:   datetime of the archive day, stored on each new article record.

    Side effects:
        Inserts one document per new relevant link into the 'articles'
        MongoDB collection; already-seen and off-topic links are skipped.
    """
    html = fetch_page(url)
    soup = BeautifulSoup(html, 'html.parser')
    html_articles = soup.find('ul', class_='archive-articles').find_all('a')
    for html_article in html_articles:
        href = html_article['href']
        # Only news/wires/money sections are of interest.  Check this FIRST:
        # the original queried MongoDB for every link, including the
        # irrelevant ones whose result it never used.
        if not ('/news/' in href or '/wires/' in href or '/money/' in href):
            print('not relevant article', href)
            continue
        if articles.find_one({'url': href}) is None:
            article = {
                'publication': 'daily_mail',
                'url': href,
                'title': html_article.get_text(),
                'date': d,
            }
            print('indexed ', article['url'])
            articles.insert_one(article)
        else:
            print('already there', href)
def fetch_article_detail(url):
    """Fetch one article page and store its body text and HTML on the record.

    Parameters:
        url: site-relative article path; a matching document must already
             exist in the 'articles' collection (created by fetch_article_list).

    Side effects:
        Sets 'fetched' (UTC timestamp), 'text' (plain paragraph text) and
        'html' (raw body markup) on the matching MongoDB document.
    """
    html = fetch_page(url)
    soup = BeautifulSoup(html, 'html.parser')
    article_body = soup.find('div', itemprop='articleBody')
    # Join paragraph texts in one pass instead of quadratic '+=' growth;
    # the final strip() matches the original's text.strip().
    text = '\n'.join(p.get_text() for p in article_body.find_all('p'))
    # Collection.save() was deprecated and then removed from PyMongo;
    # update_one with $set is the supported equivalent and only touches
    # the fields we change instead of rewriting the whole document.
    articles.update_one(
        {'url': url},
        {'$set': {
            'fetched': datetime.datetime.utcnow(),
            'text': text.strip(),
            'html': str(article_body),  # py3: str() replaces py2 unicode()
        }},
    )
def fetch_detail_loop():
    """Download body text for every indexed article that lacks it.

    Pushes the "not yet fetched" filter into the MongoDB query with
    $exists instead of streaming every document to the client and
    checking in Python; sleeps between requests to be polite.
    """
    for article in articles.find({'fetched': {'$exists': False}}):
        fetch_article_detail(article['url'])
        time.sleep(1)
def fetch_list_loop(start=datetime.datetime(2014, 12, 9), stop_year=2017):
    """Walk the daily sitemap-archive pages from `start` until `stop_year`.

    Parameters:
        start:     first day to index (defaults preserve the original
                   hard-coded behavior: 2014-12-09).
        stop_year: crawling stops at the first day whose year reaches
                   this value (exclusive; default 2017).

    Side effects:
        Calls fetch_article_list once per day, sleeping 1s between pages.
    """
    offset = 0
    while True:
        d = start + datetime.timedelta(offset)
        if d.year >= stop_year:
            break
        list_url = '/home/sitemaparchive/day_%s.html' % d.strftime('%Y%m%d')
        fetch_article_list(list_url, d)
        offset += 1
        time.sleep(1)
# Entry point: crawl the daily archive index pages and register articles.
# Run fetch_detail_loop() afterwards to download the article bodies.
if __name__ == '__main__':
    fetch_list_loop()