Skip to content

Commit

Permalink
Remove maximum option and fix problem with shuffling
Browse files (browse the repository at this point in the history)
  • Loading branch information
benjaminvdb committed Jun 24, 2019
1 parent b56d986 commit c6e071a
Showing 1 changed file with 4 additions and 8 deletions.
12 changes: 4 additions & 8 deletions post_process.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -9,7 +9,7 @@
from progressbar import ProgressBar


def load(infile, keep_incorrect_date=False, unique=True, sort=True, maximum=110000):
def load(infile, keep_incorrect_date=False, unique=True, sort=True):
"""
Load reviews from JSON input file.
"""
Expand All @@ -27,9 +27,6 @@ def load(infile, keep_incorrect_date=False, unique=True, sort=True, maximum=1100
if sort:
reviews = sorted(reviews, key=lambda x: x['published'])

if maximum:
reviews = reviews[:maximum]

return reviews


Expand Down Expand Up @@ -102,14 +99,13 @@ def write_urls(reviews, outfile):
@click.option('--encoding', default='utf-8', help='Input file encoding')
@click.option('--keep-incorrect-date', default=False, help='Whether to keep reviews with invalid dates.')
@click.option('--sort', default=True, help='Whether to sort reviews by date.')
@click.option('--maximum', default=110000, help='Maximum number of reviews in output')
@click.option('--valid-size-fraction', default=0.1, help='Fraction of total to set aside as validation.')
@click.option('--shuffle', default=True, help='Shuffle data before saving.')
def process(infile, outdir, encoding, keep_incorrect_date, sort, maximum, valid_size_fraction, shuffle):
reviews = load(infile, keep_incorrect_date, sort, encoding, maximum)
def process(infile, outdir, encoding, keep_incorrect_date, sort, valid_size_fraction, shuffle):
reviews = load(infile, keep_incorrect_date, sort, encoding)

if shuffle:
sklearn.utils.shuffle(reviews)
reviews = sklearn.utils.shuffle(reviews)

pos = filter(lambda x: x['rating'] > 3, reviews)
neg = filter(lambda x: x['rating'] < 3, reviews)
Expand Down

0 comments on commit c6e071a

Please sign in to comment.