headlines.py
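# headlines.py: a small Flask app that plays "real headline or fake headline".
# Real headlines are read from headlines.txt; fake ones come from an NLTK
# trigram model trained on the same corpus. /q returns one real and one
# generated headline keyed by UUID, and /guess/<uuid> reports whether the
# chosen UUID belongs to a real headline.
# Note: this is Python 2 / pre-3.0 NLTK code (print statements,
# nltk.model.NgramModel) and needs an old NLTK release to run.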
import json
import uuid
import re
import random

import nltk
from bs4 import BeautifulSoup
from flask import Flask, render_template

app = Flask('cindyheadlines')
# Per-app storage for the loaded real headlines and the n-gram model.
if not hasattr(app, 'extensions'):
    app.extensions = {}
app.extensions['cindy'] = {
    'headlines': None,
    'model': None
}
def read_headline():
    with open('headlines.txt') as f:
        for line in f:
            if not line.strip():
                continue
            # Strip any HTML and add a period; this might help the sentence tokenizer.
            yield BeautifulSoup(line).text + '.'
def get_words():
    print "fetching words"
    words = []
    for post in read_headline():
        for sentence in nltk.sent_tokenize(post):
            for word in nltk.word_tokenize(sentence):
                words.append(word.encode('utf8', 'ignore'))
    print "%i words" % len(words)
    return words
def post_prep(post):
    # Currently a no-op: the cleanup passes below are disabled and the
    # function is never called elsewhere in this file.
    #post = re.sub(r" ([!\.:'\";,?])", r"\1", post).replace('( ', '(').replace(' * ', '*')
    #post = re.sub(r" (n't|'ll)", r'\1', post)
    return post
def headlines():
    return app.extensions['cindy']['headlines']


def model():
    return app.extensions['cindy']['model']
def get_headline():
    for attempts in range(10):
        lines = app.extensions['cindy']['headlines'].values()
        model = app.extensions['cindy']['model']
        # Seed generation with the first word of a random real headline.
        ctx = [random.choice(lines).split()[0]]
        words = ' '.join(w.decode('utf-8', 'ignore') for w in model.generate(20, context=ctx))
        # Keep the longest of the generated "sentences".
        sents = words.split('.')
        sents.sort(key=lambda s: len(s), reverse=True)
        hl = sents[0]
        # Undo the tokenizer's spacing around punctuation and contractions.
        hl = hl.replace(" 's", "'s").replace(' ,', ',').replace(' :', ':')\
               .replace(' ?', '?').replace(" 'm", "'m")\
               .replace(' !', '!').strip()\
               .replace(" n't", "n't")
        tokes = hl.split()
        headline = tokes[0].capitalize() + ' ' + ' '.join(tokes[1:])
        # Retry if we reproduced a real headline verbatim (compare against the
        # stored headline texts, not the UUID keys).
        if headline not in headlines().values():
            break
    return headline
def get_model():
    if not app.extensions['cindy']['model']:
        words = get_words()
        print "Generating model...",
        text = nltk.Text(words)
        # Trigram model with Lidstone smoothing (gamma=0.2).
        estimator = lambda fdist, bins: nltk.LidstoneProbDist(fdist, 0.2)
        model = nltk.model.NgramModel(3, text, estimator=estimator)
        app.extensions['cindy']['model'] = model
    return app.extensions['cindy']['model']
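# Compatibility note: nltk.model.NgramModel exists only in NLTK releases
# before 3.0. On a modern NLTK, the model-building step above would have to be
# rewritten on top of nltk.lm (for example an MLE or Laplace trigram model);
# treat any such swap as an untested substitution, not a drop-in replacement.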
@app.route('/')
def index():
    return render_template('index.html')


@app.route('/favicon.ico')
def bleh():
    return 'blargh!', 404


@app.route('/guess/<uu>')
def guess(uu):
    # A guess is correct when the submitted UUID is one of the real headlines' keys.
    d = {"result": uu in headlines()}
    return json.dumps(d), 200, {'content-type': 'text/json'}
@app.route('/q')
def question():
    # Pair one real headline (under its stored UUID) with one generated
    # headline (under a fresh UUID).
    posts = {}
    uu = random.choice(headlines().keys())
    posts[uu] = headlines()[uu]
    posts[str(uuid.uuid4())] = get_headline()
    return json.dumps(posts), 200, {'content-type': 'text/json'}
# Build the model and load the real headlines once, at import time.
_model = get_model()
print 'done'

print "reading headline data..",
with open('headlines.txt') as fin:
    app.extensions['cindy']['headlines'] = {str(uuid.uuid4()): BeautifulSoup(h.strip()).text
                                            for h in fin if h.strip()}
print "done"
if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0')
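# Example session (an illustration, assuming the file is run directly and
# Flask's default port 5000 is free; adjust host/port as needed):
#
#   $ python headlines.py
#   $ curl http://localhost:5000/q          # two headlines keyed by UUID
#   $ curl http://localhost:5000/guess/<uuid-you-think-is-real>
#
# /guess returns {"result": true} only when the UUID keys a real headline.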