-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
213 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
GH_USERNAME=your-gh-username | ||
GH_LOGIN_PASSWORD=your_github_password | ||
CHROME_DRIVER_PATH=~/Downloads/chromedriver-mac-x64/chromedriver | ||
|
||
# optional | ||
# GH_LOGIN_URL=https://github.com/login | ||
# Seconds to wait for the user to finish the login procedure | ||
# GH_LOGIN_WAIT=60 | ||
|
||
# path to the output file. default "output.json" in the current dir | ||
# OUTPUT_FILE=output.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
venv | ||
.venv | ||
.env | ||
.envrc | ||
.direnv | ||
|
||
*.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# GitHub Stars scraper | ||
|
||
Primitive web scraper for your own GitHub Stars, for further analysis or to recall long-forgotten repos. | ||
|
||
This script is extremely plain and primitive. | ||
|
||
Script utilizes [selenium](https://www.selenium.dev/) web automation library. | ||
|
||
Contributions are welcome. | ||
|
||
## Requirements | ||
- Chrome Browser | ||
- chromedriver | ||
- python 3.9+ | ||
|
||
## Installation | ||
Instructions are shown for Unix-based operating systems (specifically macOS). | ||
|
||
Download stable [chromedriver](https://googlechromelabs.github.io/chrome-for-testing/). | ||
Ideally, its version should match the version of your Chrome browser. | ||
|
||
```sh | ||
cd ~/Downloads | ||
curl -OL https://storage.googleapis.com/chrome-for-testing-public/126.0.6478.63/mac-x64/chromedriver-mac-x64.zip | ||
unzip chromedriver-mac-x64.zip | ||
|
||
# on MacOS you might need to run | ||
cd chromedriver-mac-x64 | ||
xattr -d com.apple.quarantine chromedriver | ||
``` | ||
|
||
Scraper setup | ||
```sh | ||
git clone https://github.com/den-is/gh-stars-scraper.git | ||
|
||
cd gh-stars-scraper | ||
|
||
python3 -m venv venv | ||
source venv/bin/activate | ||
|
||
pip install -r requirements.txt | ||
``` | ||
|
||
## Running scraper | ||
- Create `.env` file with correct values from provided `.env.example`. | ||
- Run script `python3 main.py` | ||
- Wait for the script to open a browser window and load the login page | ||
- Provide OTP code (I hope you have protected your github account with 2FA authentication) | ||
- Watch the magic happen | ||
- Do not interact with the web page | ||
|
||
## Known-issues | ||
- For some reason, the script does not fetch the lists that a starred repo belongs to. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
import json | ||
import os | ||
import time | ||
from pathlib import Path | ||
|
||
from dotenv import load_dotenv | ||
from selenium import webdriver | ||
from selenium.common.exceptions import NoSuchElementException | ||
from selenium.webdriver.common.by import By | ||
from selenium.webdriver.support import expected_conditions as ec | ||
from selenium.webdriver.support.ui import WebDriverWait | ||
|
||
load_dotenv() | ||
|
||
# Runtime configuration, read from the process environment after
# load_dotenv() has run. GH_USERNAME, GH_LOGIN_PASSWORD and CHROME_DRIVER_PATH
# are required; GH_LOGIN_URL, GH_LOGIN_WAIT and OUTPUT_FILE have defaults.
try:
    GH_USERNAME = os.environ["GH_USERNAME"]
    GH_LOGIN_PASSWORD = os.environ["GH_LOGIN_PASSWORD"]

    GH_LOGIN_URL = os.getenv("GH_LOGIN_URL", "https://github.com/login")
    GH_STARS_URL = f"https://github.com/{GH_USERNAME}?tab=stars"

    CHROME_DRIVER_PATH_ENV = os.environ["CHROME_DRIVER_PATH"]
    # Expand a leading "~" and normalise to an absolute path.
    CHROME_DRIVER_PATH = Path(CHROME_DRIVER_PATH_ENV).expanduser().resolve()
    OUTPUT_FILE = os.getenv("OUTPUT_FILE", "output.json")
except KeyError as e:
    print(f"Missing environment variable: {e}")
    # SystemExit instead of exit(): the latter is injected by the `site`
    # module and is not guaranteed to exist in every interpreter setup.
    raise SystemExit(1)

# Parsed separately so that a non-numeric value yields a clear message
# instead of an unhandled ValueError traceback.
try:
    GH_LOGIN_WAIT = int(os.getenv("GH_LOGIN_WAIT", "60"))
except ValueError:
    print(
        "Invalid GH_LOGIN_WAIT value "
        f"(expected a whole number): {os.getenv('GH_LOGIN_WAIT')!r}"
    )
    raise SystemExit(1)

# Accumulates scraped repositories: repo name -> {"url": ..., "categories": [...]}.
OUTPUT_DICT = {}
|
||
|
||
def driver_setup():
    """Build the Chrome WebDriver used for scraping.

    The chromedriver binary is taken from the module-level
    ``CHROME_DRIVER_PATH``; Chrome is started with default options and the
    window is resized to 1280x1024.

    Returns:
        The ``webdriver.Chrome`` instance.
    """
    chrome = webdriver.Chrome(
        options=webdriver.ChromeOptions(),
        service=webdriver.ChromeService(executable_path=CHROME_DRIVER_PATH),
    )
    chrome.set_window_size(1280, 1024)
    return chrome
|
||
|
||
def get_stars(container):
    """Record every starred repository found inside *container*.

    For each candidate element the repository link is located, its
    ``<details>`` dropdown is opened, and the text of every list item whose
    ``<input>`` is selected is collected. The result is stored in the
    module-level ``OUTPUT_DICT`` under the repository name (link text with
    spaces removed) as ``{"url": ..., "categories": [...]}``. Repositories
    with no selected item are additionally printed to stdout.

    Args:
        container: Selenium element to search for repository entries.
    """
    candidates = container.find_elements(By.XPATH, ".//div/div/div")

    for candidate in candidates:
        try:
            repo = candidate.find_element(By.XPATH, "./div/h3/a")
            details = candidate.find_element(By.XPATH, ".//details")

            details.find_element(By.XPATH, ".//summary").click()
            # wait for menu to appear and load
            time.sleep(1)

            star_menu = details.find_element(By.XPATH, ".//details-menu")
            star_menu_close = WebDriverWait(details, 5).until(
                ec.presence_of_element_located(
                    (By.XPATH, ".//button[@class='SelectMenu-closeButton']")
                )
            )

            menu_items = star_menu.find_elements(
                By.XPATH, ".//div[@role='listitem']"
            )
            categories = [
                item.text
                for item in menu_items
                if item.find_element(By.TAG_NAME, "input").is_selected()
            ]

            # Read name and URL once and reuse them for both the log line
            # and the output record.
            repo_name = repo.text.replace(" ", "")
            repo_url = repo.get_attribute("href")

            if not categories:
                print(repo_name, repo_url)

            star_menu_close.click()
            OUTPUT_DICT[repo_name] = {
                "url": repo_url,
                "categories": categories,
            }
        except NoSuchElementException:
            # Deliberate best-effort: presumably the broad XPath above also
            # matches divs that are not repository rows, so an element with
            # a missing link/menu is skipped rather than treated as an error.
            continue
|
||
|
||
def main():
    """Log in to GitHub, scrape every page of starred repos, write the result.

    Fills in the login form with ``GH_USERNAME`` / ``GH_LOGIN_PASSWORD``,
    waits up to ``GH_LOGIN_WAIT`` seconds for the login to complete (this is
    where the user enters the 2FA code), then walks the stars pages via the
    "Next" link, calling ``get_stars`` on each page. Finally prints the
    elapsed time and dumps ``OUTPUT_DICT`` as JSON to ``OUTPUT_FILE``.
    """
    driver = driver_setup()

    # The browser must be closed even when a lookup or wait fails midway,
    # otherwise a Chrome/chromedriver process is left running.
    try:
        driver.get(GH_LOGIN_URL)

        wait = WebDriverWait(driver, 10)
        insert_username = wait.until(
            ec.presence_of_element_located((By.NAME, "login"))
        )
        insert_username.send_keys(GH_USERNAME)

        insert_password = driver.find_element(By.NAME, "password")
        insert_password.send_keys(GH_LOGIN_PASSWORD)

        sign_in = driver.find_element(By.NAME, "commit")
        sign_in.click()

        # Input 2FA OTP code.
        # Wait up to GH_LOGIN_WAIT seconds for the user to finish the login
        # procedure (enter username, password and OTP).
        start_time = time.perf_counter()

        WebDriverWait(driver, GH_LOGIN_WAIT).until(
            ec.presence_of_element_located((By.CLASS_NAME, "AppHeader-user"))
        )

        # open stars page
        driver.get(GH_STARS_URL)

        while True:
            stars_frame = driver.find_element(By.ID, "user-starred-repos")
            get_stars(stars_frame)

            # find_elements returns an empty list instead of raising, which
            # avoids using exceptions for ordinary control flow.
            next_disabled = driver.find_elements(
                By.XPATH, '//button[@disabled="disabled" and text()="Next"]'
            )
            next_links = driver.find_elements(By.LINK_TEXT, "Next")

            # Stop on a disabled "Next" button, and also when there is no
            # "Next" link at all (e.g. all stars fit on a single page).
            if next_disabled or not next_links:
                print("Reached the last page")
                break

            next_links[0].click()
            # wait for the next page to load
            time.sleep(5)
    finally:
        driver.quit()

    # divmod keeps the minutes counting past 59 instead of wrapping around
    # the way a "%M:%S" clock format does for runs longer than an hour.
    minutes, seconds = divmod(int(time.perf_counter() - start_time), 60)
    time_elapsed_str = f"{minutes:02d}:{seconds:02d}"
    print(f"Scraped in (mm:ss): {time_elapsed_str}")

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out_f:
        json.dump(OUTPUT_DICT, out_f, indent=2)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
selenium | ||
python-dotenv |