From 42e29f7929a4d0cdfd317420b50a9e7e372f1fc2 Mon Sep 17 00:00:00 2001
From: Benjamin van der Burgh
Date: Mon, 24 Jun 2019 15:56:04 +0200
Subject: [PATCH] Migrate to Python 3

---
 README.md         |  2 +-
 gather_urls.py    |  2 +-
 post_process.py   | 20 ++++++++++----------
 scrape_reviews.py |  6 +++---
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 400a740..77bbaa7 100644
--- a/README.md
+++ b/README.md
@@ -75,7 +75,7 @@ If you're on macOS and you have Homebrew installed, you can install ChromeDriver
 You can download ChromeDriver from the official [download page](http://chromedriver.chromium.org/downloads).
 
 ### Python
-The scripts are written for **Python 2**, but I'm sure they'll work for Python 3 with minor adjustments. To install the Python dependencies, run:
+The scripts are written for **Python 3**. To install the Python dependencies, run:
 
     pip install -r ./requirements.txt
diff --git a/gather_urls.py b/gather_urls.py
index 2c6ac4f..fc7948d 100644
--- a/gather_urls.py
+++ b/gather_urls.py
@@ -27,7 +27,7 @@ def gather(outfile, offset, step):
         soup = BeautifulSoup(data['html'], 'lxml')
         new_urls = [div['data-url'] for div in soup('div', {'class': 'item'})]
 
-        print("Fetched {} urls from {}".format(len(new_urls), target_url))
+        print(f"Fetched {len(new_urls)} urls from {target_url}")
 
         urls.extend(new_urls)
         offset += 1000
diff --git a/post_process.py b/post_process.py
index d92fd2f..8924d25 100644
--- a/post_process.py
+++ b/post_process.py
@@ -17,12 +17,12 @@ def load(infile, keep_incorrect_date=False, unique=True, sort=True):
         reviews = json.load(f)
 
     if not keep_incorrect_date:
-        reviews = filter(lambda x: x is not None and x['published'] >= '2002-09-11T00:00:00+02:00', reviews)
+        reviews = [x for x in reviews if x is not None and x['published'] >= '2002-09-11T00:00:00+02:00']
 
     if unique:
         # Define a unique review as one with a unique review text
         u = {review['text']: review for review in reviews}
-        reviews = u.values()
+        reviews = list(u.values())
 
     if sort:
         reviews = sorted(reviews, key=lambda x: x['published'])
@@ -107,9 +107,9 @@ def process(infile, outdir, encoding, keep_incorrect_date, sort, valid_size_frac
     if shuffle:
         reviews = sklearn.utils.shuffle(reviews)
 
-    pos = filter(lambda x: x['rating'] > 3, reviews)
-    neg = filter(lambda x: x['rating'] < 3, reviews)
-    neut = filter(lambda x: x['rating'] == 3, reviews)  # set aside for model fine-tuning
+    pos = [x for x in reviews if x['rating'] > 3]
+    neg = [x for x in reviews if x['rating'] < 3]
+    neut = [x for x in reviews if x['rating'] == 3]  # set aside for model fine-tuning
 
     # Balance dataset
     train_size = min(len(pos), len(neg))
@@ -124,11 +124,11 @@ def process(infile, outdir, encoding, keep_incorrect_date, sort, valid_size_frac
     test = sup[:end]
     train = sup[end:]
 
-    print("Size all data:\t{}".format(len(reviews)))
-    print("Size supervised:\t{}".format(len(sup)))
-    print("Size unsupervised:\t{}".format(len(unsup)))
-    print("Size training:\t{}".format(len(train)))
-    print("Size testing:\t{}".format(len(test)))
+    print(f"Size all data:\t{len(reviews)}")
+    print(f"Size supervised:\t{len(sup)}")
+    print(f"Size unsupervised:\t{len(unsup)}")
+    print(f"Size training:\t{len(train)}")
+    print(f"Size testing:\t{len(test)}")
 
     os.mkdir(outdir)
diff --git a/scrape_reviews.py b/scrape_reviews.py
index 387b1ce..4dc5f90 100644
--- a/scrape_reviews.py
+++ b/scrape_reviews.py
@@ -72,12 +72,12 @@ def scrape(infile, outfile, encoding, indent):
             })
         except Exception:
             errors.append(url)
-            print("Error {}: {}".format(len(errors), url))
+            print(f"Error {len(errors)}: {url}")
             continue
 
-    print("Finished scraping {} urls with {} errors.".format(len(urls), len(errors)))
+    print(f"Finished scraping {len(urls)} urls with {len(errors)} errors.")
 
-    print("Writing reviews to {}".format(outfile))
+    print(f"Writing reviews to {outfile}")
     with codecs.open(outfile, 'w', encoding=encoding) as f:
         json.dump(reviews, f, ensure_ascii=False, indent=indent)