diff --git a/rebound/rebound.py b/rebound/rebound.py
--- a/rebound/rebound.py
+++ b/rebound/rebound.py
@@ -16,6 +16,10 @@ import time
 from urwid.widget import (BOX, FLOW, FIXED)
 import random

+from urllib.request import Request, urlopen
+import urllib.error
+import urllib.parse
+

 SO_URL = "https://stackoverflow.com"

@@ -41,39 +45,7 @@ SCROLLBAR_LEFT = "left"
 SCROLLBAR_RIGHT = "right"


-USER_AGENTS = [
-    "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
-    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
-    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
-    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
-    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
-    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
-    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Firefox/59",
-    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
-    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
-    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
-    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
-    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
-    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
-    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
-    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
-    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
-    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
-    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
-    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
-    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
-    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
-]
+


 ##################
@@ -269,16 +241,27 @@ def souper(url):
-    """Turns a given URL into a BeautifulSoup object."""
+    """Turns a given URL into a BeautifulSoup object.
+
+    Returns None when the final URL, after redirects, is a captcha page.
+    Writes an error message and exits with status 1 if the fetch fails.
+    """

+    req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
     try:
-        html = requests.get(url, headers={"User-Agent": random.choice(USER_AGENTS)})
-    except requests.exceptions.RequestException:
+        # Fetch once: urlopen follows HTTP redirects on its own, so resp.url
+        # is already the final URL of the page that was read.
+        with urlopen(req) as resp:
+            final_url = resp.url
+            html = resp.read()
+    except urllib.error.URLError:  # HTTPError (4xx/5xx) is a subclass
         sys.stdout.write("\n%s%s%s" % (RED, "Rebound was unable to fetch Stack Overflow results. "
                                             "Please check that you are connected to the internet.\n", END))
         sys.exit(1)

-    if re.search("\.com/nocaptcha", html.url): # URL is a captcha page
+    if re.search(r"\.com/nocaptcha", final_url): # URL is a captcha page
+        print('captcha check! open url in browser to whitelist')
+        print('Response URL -> %s ' % final_url)  # the final url
         return None
     else:
-        return BeautifulSoup(html.text, "html.parser")
+        return BeautifulSoup(html, "html.parser")


 ## Main ##