Migrate to Python 3

benjaminvdb committed Jun 24, 2019
1 parent c6e071a commit 42e29f7
Showing 4 changed files with 15 additions and 15 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -75,7 +75,7 @@ If you're on macOS and you have Homebrew installed, you can install ChromeDriver
You can download ChromeDriver from the official [download page](http://chromedriver.chromium.org/downloads).

### Python
-The scripts are written for **Python 2**, but I'm sure they'll work for Python 3 with minor adjustments. To install the Python dependencies, run:
+The scripts are written for **Python 3**. To install the Python dependencies, run:

pip install -r ./requirements.txt

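If Python 2 and 3 coexist on the system, invoking pip through the Python 3 interpreter ensures the dependencies land in the right environment (a hedged variant of the command above; the interpreter name can vary per platform):

    python3 -m pip install -r ./requirements.txt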
2 changes: 1 addition & 1 deletion gather_urls.py
@@ -27,7 +27,7 @@ def gather(outfile, offset, step):

soup = BeautifulSoup(data['html'], 'lxml')
new_urls = [div['data-url'] for div in soup('div', {'class': 'item'})]
print("Fetched {} urls from {}".format(len(new_urls), target_url))
print(f"Fetched {len(new_urls)} urls from {len(target_url)}")
urls.extend(new_urls)
offset += 1000

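The rewrite above is the pattern this commit applies throughout: each positional `str.format()` argument moves inline as an expression inside f-string braces (f-strings require Python 3.6+). A minimal sketch of the equivalence, using hypothetical values:

```python
count = 25                                       # hypothetical values
target_url = "https://example.org/?offset=1000"

old = "Fetched {} urls from {}".format(count, target_url)
new = f"Fetched {count} urls from {target_url}"  # braces hold real expressions
assert old == new
```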
20 changes: 10 additions & 10 deletions post_process.py
@@ -17,12 +17,12 @@ def load(infile, keep_incorrect_date=False, unique=True, sort=True):
reviews = json.load(f)

if not keep_incorrect_date:
-reviews = filter(lambda x: x is not None and x['published'] >= '2002-09-11T00:00:00+02:00', reviews)
+reviews = [x for x in reviews if x is not None and x['published'] >= '2002-09-11T00:00:00+02:00']

if unique:
# Define a unique review as one with a unique review text
u = {review['text']: review for review in reviews}
-reviews = u.values()
+reviews = list(u.values())

if sort:
reviews = sorted(reviews, key=lambda x: x['published'])
@@ -107,9 +107,9 @@ def process(infile, outdir, encoding, keep_incorrect_date, sort, valid_size_frac
if shuffle:
reviews = sklearn.utils.shuffle(reviews)

-pos = filter(lambda x: x['rating'] > 3, reviews)
-neg = filter(lambda x: x['rating'] < 3, reviews)
-neut = filter(lambda x: x['rating'] == 3, reviews) # set aside for model fine-tuning
+pos = [x for x in reviews if x['rating'] > 3]
+neg = [x for x in reviews if x['rating'] < 3]
+neut = [x for x in reviews if x['rating'] == 3] # set aside for model fine-tuning

# Balance dataset
train_size = min(len(pos), len(neg))
@@ -124,11 +124,11 @@ def process(infile, outdir, encoding, keep_incorrect_date, sort, valid_size_frac
test = sup[:end]
train = sup[end:]

print("Size all data:\t{}".format(len(reviews)))
print("Size supervised:\t{}".format(len(sup)))
print("Size unsupervised:\t{}".format(len(unsup)))
print("Size training:\t{}".format(len(train)))
print("Size testing:\t{}".format(len(test)))
print(f"Size all data:\t{len(reviews)}")
print(f"Size supervised:\t{len(sup)}")
print(f"Size unsupervised:\t{len(unsup)}")
print(f"Size training:\t{len(train)}")
print(f"Size testing:\t{len(test)}")

os.mkdir(outdir)

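The `filter(...)` → list-comprehension rewrites above are more than style: Python 2's `filter()` returns a list, but Python 3's returns a single-use iterator, so later calls to `len()` or a second traversal would fail. The same applies to `dict.values()`, which became a view object in Python 3, hence the added `list(...)` around `u.values()`. A minimal sketch of the Python 3 behavior, with hypothetical data:

```python
reviews = [{"rating": 4}, {"rating": 2}, {"rating": 3}]  # hypothetical data

pos_iter = filter(lambda x: x["rating"] > 3, reviews)  # Python 3: lazy iterator
# len(pos_iter)  # would raise TypeError: object of type 'filter' has no len()

pos = [x for x in reviews if x["rating"] > 3]  # materialized list
assert len(pos) == 1                           # len() works
assert [x["rating"] for x in pos] == [4]       # and it survives repeated passes
```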
6 changes: 3 additions & 3 deletions scrape_reviews.py
@@ -72,12 +72,12 @@ def scrape(infile, outfile, encoding, indent):
})
except Exception:
errors.append(url)
print("Error {}: {}".format(len(errors), url))
print("Error {len(errors)}: {url}")
continue

print("Finished scraping {} urls with {} errors.".format(len(urls), len(errors)))
print(f"Finished scraping {len(urls)} urls with {len(errors)}")

print("Writing reviews to {}".format(outfile))
print(f"Writing reviews to {outfile}")
with codecs.open(outfile, 'w', encoding=encoding) as f:
json.dump(reviews, f, ensure_ascii=False, indent=indent)

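A related Python 3 simplification left untouched here: the built-in `open()` now takes an `encoding` argument, so `codecs.open()` is redundant (though still functional). A minimal sketch under that assumption, with hypothetical data:

```python
import json

reviews = [{"text": "Eén recensie", "rating": 5}]  # hypothetical data

# Python 3's built-in open() handles the encoding itself; ensure_ascii=False
# writes accented characters as-is instead of escaping them to \uXXXX.
with open("reviews.json", "w", encoding="utf-8") as f:
    json.dump(reviews, f, ensure_ascii=False, indent=2)
```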
