#!/usr/bin/env python
# coding: utf-8
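"""Scrape NBA second-round draft picks (2005-2014) from basketball-reference.com
and save the combined, cleaned data to a csv file."""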
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import time
# url template for scraping
base_url = "http://www.basketball-reference.com/draft/NBA_{year}.html"
# collect each year's draft class here; everything is concatenated after the loop
draft_frames = []
# create a list of strings representing second-round picks (31-60)
pick_range = list(range(31,61))
pick_range = [str(x) for x in pick_range]
# scrape each draft class and collect it into the list of DataFrames
for year in range(2005, 2015):
    # insert the year into the url to scrape one draft class at a time
    url = base_url.format(year=year)
    # get the html for the url and create the soup object to parse it
    html = urlopen(url)
    soup = BeautifulSoup(html, 'html5lib')
    # parse the second <tr> tag in the html for the headers to use as DataFrame labels
    column_headers = [th.getText() for th in soup.findAll('tr', limit=2)[1].findAll('th')]
    # Rk does not get caught by player_data, so remove it for consistency
    column_headers.remove('Rk')
    # get the player data from the <td> tags in the html
    data_rows = soup.findAll('tr')[2:]
    player_data = [[td.getText() for td in row.findAll('td')] for row in data_rows]
    # replace empty string values with 0
    for row in player_data:
        row[:] = [0 if x == '' else x for x in row]
    # drop the empty rows picked up from scraping header/separator rows
    player_data = [row for row in player_data if row]
    # only keep second-round picks
    player_data = [x for x in player_data if x[0] in pick_range]
    # insert the data into a pandas DataFrame
    year_df = pd.DataFrame(player_data, columns=column_headers)
    # make draft classes distinct by inserting a Draft Year column
    year_df.insert(0, 'Draft Year', year)
    # collect this year's class for concatenation after the loop
    draft_frames.append(year_df)
    # pause so we do not overload the servers and get blacklisted
    time.sleep(45)
# Combine all draft classes into a single DataFrame
draft_df = pd.concat(draft_frames, ignore_index=True)
# Convert data to proper data types (strings to int/float); columns that
# cannot be parsed as numbers (e.g. Player) are left unchanged
for col in draft_df.columns:
    try:
        draft_df[col] = pd.to_numeric(draft_df[col])
    except (ValueError, TypeError):
        pass
# Get rid of the rows full of null values
draft_df = draft_df[draft_df.Player.notnull()]
# Replace NaNs with 0s
draft_df = draft_df.fillna(0)
# Rename Columns
draft_df.rename(columns={'WS/48': 'WS_per_48'}, inplace=True)
# Change % symbol
draft_df.columns = draft_df.columns.str.replace('%', '_Perc')
# Add _per_G to the per game stat columns
cols = list(draft_df.columns)
cols[15:19] = [col + "_per_G" for col in cols[15:19]]
draft_df.columns = cols
# save the data to a csv file
draft_df.to_csv("second_rounders_2005-14.csv")