Migrate to Python 3

benjaminvdb committed Jun 24, 2019
1 parent c6e071a commit 42e29f7
Showing 4 changed files with 15 additions and 15 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -75,7 +75,7 @@ If you're on macOS and you have Homebrew installed, you can install ChromeDriver
You can download ChromeDriver from the official [download page](http://chromedriver.chromium.org/downloads).

### Python
-The scripts are written for **Python 2**, but I'm sure they'll work for Python 3 with minor adjustments. To install the Python dependencies, run:
+The scripts are written for **Python 3**. To install the Python dependencies, run:

pip install -r ./requirements.txt

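If Python 2 and 3 coexist on the system, invoking pip through the Python 3 interpreter ensures the dependencies land in the right environment (a hedged variant of the command above; the interpreter name can vary per platform):

    python3 -m pip install -r ./requirements.txt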
2 changes: 1 addition & 1 deletion gather_urls.py
@@ -27,7 +27,7 @@ def gather(outfile, offset, step):

soup = BeautifulSoup(data['html'], 'lxml')
new_urls = [div['data-url'] for div in soup('div', {'class': 'item'})]
print("Fetched {} urls from {}".format(len(new_urls), target_url))
print(f"Fetched {len(new_urls)} urls from {len(target_url)}")
urls.extend(new_urls)
offset += 1000

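The rewrite above is the pattern this commit applies throughout: each positional `str.format()` argument moves inline as an expression inside f-string braces (f-strings require Python 3.6+). A minimal sketch of the equivalence, using hypothetical values:

```python
count = 25                                       # hypothetical values
target_url = "https://example.org/?offset=1000"

old = "Fetched {} urls from {}".format(count, target_url)
new = f"Fetched {count} urls from {target_url}"  # braces hold real expressions
assert old == new
```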
20 changes: 10 additions & 10 deletions post_process.py
@@ -17,12 +17,12 @@ def load(infile, keep_incorrect_date=False, unique=True, sort=True):
reviews = json.load(f)

if not keep_incorrect_date:
-reviews = filter(lambda x: x is not None and x['published'] >= '2002-09-11T00:00:00+02:00', reviews)
+reviews = [x for x in reviews if x is not None and x['published'] >= '2002-09-11T00:00:00+02:00']

if unique:
# Define a unique review as one with a unique review text
u = {review['text']: review for review in reviews}
-reviews = u.values()
+reviews = list(u.values())

if sort:
reviews = sorted(reviews, key=lambda x: x['published'])
@@ -107,9 +107,9 @@ def process(infile, outdir, encoding, keep_incorrect_date, sort, valid_size_frac
if shuffle:
reviews = sklearn.utils.shuffle(reviews)

-pos = filter(lambda x: x['rating'] > 3, reviews)
-neg = filter(lambda x: x['rating'] < 3, reviews)
-neut = filter(lambda x: x['rating'] == 3, reviews) # set aside for model fine-tuning
+pos = [x for x in reviews if x['rating'] > 3]
+neg = [x for x in reviews if x['rating'] < 3]
+neut = [x for x in reviews if x['rating'] == 3] # set aside for model fine-tuning

# Balance dataset
train_size = min(len(pos), len(neg))
@@ -124,11 +124,11 @@ def process(infile, outdir, encoding, keep_incorrect_date, sort, valid_size_frac
test = sup[:end]
train = sup[end:]

print("Size all data:\t{}".format(len(reviews)))
print("Size supervised:\t{}".format(len(sup)))
print("Size unsupervised:\t{}".format(len(unsup)))
print("Size training:\t{}".format(len(train)))
print("Size testing:\t{}".format(len(test)))
print(f"Size all data:\t{len(reviews)}")
print(f"Size supervised:\t{len(sup)}")
print(f"Size unsupervised:\t{len(unsup)}")
print(f"Size training:\t{len(train)}")
print(f"Size testing:\t{len(test)}")

os.mkdir(outdir)

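The `filter(...)` → list-comprehension rewrites above are more than style: Python 2's `filter()` returns a list, but Python 3's returns a single-use iterator, so later calls to `len()` or a second traversal would fail. The same applies to `dict.values()`, which became a view object in Python 3, hence the added `list(...)` around `u.values()`. A minimal sketch of the Python 3 behavior, with hypothetical data:

```python
reviews = [{"rating": 4}, {"rating": 2}, {"rating": 3}]  # hypothetical data

pos_iter = filter(lambda x: x["rating"] > 3, reviews)  # Python 3: lazy iterator
# len(pos_iter)  # would raise TypeError: object of type 'filter' has no len()

pos = [x for x in reviews if x["rating"] > 3]  # materialized list
assert len(pos) == 1                           # len() works
assert [x["rating"] for x in pos] == [4]       # and it survives repeated passes
```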
6 changes: 3 additions & 3 deletions scrape_reviews.py
@@ -72,12 +72,12 @@ def scrape(infile, outfile, encoding, indent):
})
except Exception:
errors.append(url)
print("Error {}: {}".format(len(errors), url))
print("Error {len(errors)}: {url}")
continue

print("Finished scraping {} urls with {} errors.".format(len(urls), len(errors)))
print(f"Finished scraping {len(urls)} urls with {len(errors)}")

print("Writing reviews to {}".format(outfile))
print(f"Writing reviews to {outfile}")
with codecs.open(outfile, 'w', encoding=encoding) as f:
json.dump(reviews, f, ensure_ascii=False, indent=indent)

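A related Python 3 simplification left untouched here: the built-in `open()` now takes an `encoding` argument, so `codecs.open()` is redundant (though still functional). A minimal sketch under that assumption, with hypothetical data:

```python
import json

reviews = [{"text": "Eén recensie", "rating": 5}]  # hypothetical data

# Python 3's built-in open() handles the encoding itself; ensure_ascii=False
# writes accented characters as-is instead of escaping them to \uXXXX.
with open("reviews.json", "w", encoding="utf-8") as f:
    json.dump(reviews, f, ensure_ascii=False, indent=2)
```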
