From 42e29f7929a4d0cdfd317420b50a9e7e372f1fc2 Mon Sep 17 00:00:00 2001
From: Benjamin van der Burgh
Date: Mon, 24 Jun 2019 15:56:04 +0200
Subject: [PATCH] Migrate to Python 3

---
 README.md         |  2 +-
 gather_urls.py    |  2 +-
 post_process.py   | 20 ++++++++++----------
 scrape_reviews.py |  6 +++---
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 400a740..77bbaa7 100644
--- a/README.md
+++ b/README.md
@@ -75,7 +75,7 @@ If you're on macOS and you have Homebrew installed, you can install ChromeDriver
 You can download ChromeDriver from the official [download page](http://chromedriver.chromium.org/downloads).
 
 ### Python
-The scripts are written for **Python 2**, but I'm sure they'll work for Python 3 with minor adjustments. To install the Python dependencies, run:
+The scripts are written for **Python 3**. To install the Python dependencies, run:
 
     pip install -r ./requirements.txt
diff --git a/gather_urls.py b/gather_urls.py
index 2c6ac4f..fc7948d 100644
--- a/gather_urls.py
+++ b/gather_urls.py
@@ -27,7 +27,7 @@ def gather(outfile, offset, step):
         soup = BeautifulSoup(data['html'], 'lxml')
         new_urls = [div['data-url'] for div in soup('div', {'class': 'item'})]
 
-        print("Fetched {} urls from {}".format(len(new_urls), target_url))
+        print(f"Fetched {len(new_urls)} urls from {target_url}")
 
         urls.extend(new_urls)
         offset += 1000
diff --git a/post_process.py b/post_process.py
index d92fd2f..8924d25 100644
--- a/post_process.py
+++ b/post_process.py
@@ -17,12 +17,12 @@ def load(infile, keep_incorrect_date=False, unique=True, sort=True):
         reviews = json.load(f)
 
     if not keep_incorrect_date:
-        reviews = filter(lambda x: x is not None and x['published'] >= '2002-09-11T00:00:00+02:00', reviews)
+        reviews = [x for x in reviews if x is not None and x['published'] >= '2002-09-11T00:00:00+02:00']
 
     if unique:
         # Define a unique review as one with a unique review text
         u = {review['text']: review for review in reviews}
-        reviews = u.values()
+        reviews = list(u.values())
 
     if sort:
         reviews = sorted(reviews, key=lambda x: x['published'])
@@ -107,9 +107,9 @@ def process(infile, outdir, encoding, keep_incorrect_date, sort, valid_size_frac
     if shuffle:
         reviews = sklearn.utils.shuffle(reviews)
 
-    pos = filter(lambda x: x['rating'] > 3, reviews)
-    neg = filter(lambda x: x['rating'] < 3, reviews)
-    neut = filter(lambda x: x['rating'] == 3, reviews)  # set aside for model fine-tuning
+    pos = [x for x in reviews if x['rating'] > 3]
+    neg = [x for x in reviews if x['rating'] < 3]
+    neut = [x for x in reviews if x['rating'] == 3]  # set aside for model fine-tuning
 
     # Balance dataset
     train_size = min(len(pos), len(neg))
@@ -124,11 +124,11 @@ def process(infile, outdir, encoding, keep_incorrect_date, sort, valid_size_frac
     test = sup[:end]
     train = sup[end:]
 
-    print("Size all data:\t{}".format(len(reviews)))
-    print("Size supervised:\t{}".format(len(sup)))
-    print("Size unsupervised:\t{}".format(len(unsup)))
-    print("Size training:\t{}".format(len(train)))
-    print("Size testing:\t{}".format(len(test)))
+    print(f"Size all data:\t{len(reviews)}")
+    print(f"Size supervised:\t{len(sup)}")
+    print(f"Size unsupervised:\t{len(unsup)}")
+    print(f"Size training:\t{len(train)}")
+    print(f"Size testing:\t{len(test)}")
 
     os.mkdir(outdir)
diff --git a/scrape_reviews.py b/scrape_reviews.py
index 387b1ce..4dc5f90 100644
--- a/scrape_reviews.py
+++ b/scrape_reviews.py
@@ -72,12 +72,12 @@ def scrape(infile, outfile, encoding, indent):
             })
         except Exception:
             errors.append(url)
-            print("Error {}: {}".format(len(errors), url))
+            print(f"Error {len(errors)}: {url}")
             continue
 
-    print("Finished scraping {} urls with {} errors.".format(len(urls), len(errors)))
+    print(f"Finished scraping {len(urls)} urls with {len(errors)} errors.")
 
-    print("Writing reviews to {}".format(outfile))
+    print(f"Writing reviews to {outfile}")
     with codecs.open(outfile, 'w', encoding=encoding) as f:
         json.dump(reviews, f, ensure_ascii=False, indent=indent)