Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
den-is committed Jun 23, 2024
1 parent 6fb9b1a commit 41b7352
Show file tree
Hide file tree
Showing 5 changed files with 213 additions and 0 deletions.
11 changes: 11 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
GH_USERNAME=your-gh-username
GH_LOGIN_PASSWORD=your_github_password
CHROME_DRIVER_PATH=~/Downloads/chromedriver-mac-x64/chromedriver

# optional
# GH_LOGIN_URL=https://github.com/login
# Seconds to wait for the user to finish the login procedure
# GH_LOGIN_WAIT=60

# path to the output file. default "output.json" in the current dir
# OUTPUT_FILE=output.json
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
venv
.venv
.env
.envrc
.direnv

*.json
53 changes: 53 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# GitHub Stars scraper

Primitive web scraper for your own GitHub Stars, for further analysis or to recall long-ago forgotten repos.

This script is extremely plain and primitive.

Script utilizes [selenium](https://www.selenium.dev/) web automation library.

Contributions are welcome.

## Requirements
- Chrome Browser
- chromedriver
- python 3.9+

## Installation
Instructions shown for Unix based Operating Systems (specifically MacOS)

Download stable [chromedriver](https://googlechromelabs.github.io/chrome-for-testing/).
Ideally, the chromedriver version should match the version of your Chrome Browser.

```sh
cd ~/Downloads
curl -OL https://storage.googleapis.com/chrome-for-testing-public/126.0.6478.63/mac-x64/chromedriver-mac-x64.zip
unzip chromedriver-mac-x64.zip

# on MacOS you might need to run
cd chromedriver-mac-x64
xattr -d com.apple.quarantine chromedriver
```

Scraper setup
```sh
git clone https://github.com/den-is/gh-stars-scraper.git

cd gh-stars-scraper

python3 -m venv venv
source venv/bin/activate

pip install -r requirements.txt
```

## Running scraper
- Create `.env` file with correct values from provided `.env.example`.
- Run script `python3 main.py`
- Wait for the script to open a browser window and load the login page
- Provide OTP code (I hope you have protected your github account with 2FA authentication)
- Watch the magic happen
- Do not interact with the web page

## Known-issues
- For some reason, the script does not fetch the lists a starred repo belongs to.
140 changes: 140 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import json
import os
import time
from pathlib import Path

from dotenv import load_dotenv
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

# Load variables from a local ".env" file (if present) into the process environment.
load_dotenv()

# Required settings: abort with a readable message when one of them is missing.
try:
    GH_USERNAME = os.environ["GH_USERNAME"]
    GH_LOGIN_PASSWORD = os.environ["GH_LOGIN_PASSWORD"]
    CHROME_DRIVER_PATH_ENV = os.environ["CHROME_DRIVER_PATH"]
except KeyError as e:
    print(f"Missing environment variable: {e}")
    # SystemExit instead of the site-provided exit() helper, which is not
    # guaranteed to exist (e.g. "python -S" or frozen builds).
    raise SystemExit(1)

# Optional settings with defaults.
GH_LOGIN_URL = os.getenv("GH_LOGIN_URL", "https://github.com/login")
GH_STARS_URL = f"https://github.com/{GH_USERNAME}?tab=stars"

# Expand "~" and make the driver path absolute.
CHROME_DRIVER_PATH = Path(CHROME_DRIVER_PATH_ENV).expanduser().resolve()

# Maximum number of seconds to wait for the login procedure to complete.
_GH_LOGIN_WAIT_RAW = os.getenv("GH_LOGIN_WAIT", "60")
try:
    GH_LOGIN_WAIT = int(_GH_LOGIN_WAIT_RAW)
except ValueError:
    print(f"GH_LOGIN_WAIT must be a whole number of seconds, got: {_GH_LOGIN_WAIT_RAW!r}")
    raise SystemExit(1)

# Path of the JSON file the results are written to.
OUTPUT_FILE = os.getenv("OUTPUT_FILE", "output.json")

# Scraped results, keyed by repository name; filled in by get_stars().
OUTPUT_DICT = {}


def driver_setup():
    """Create and return a Chrome WebDriver sized to 1280x1024.

    The chromedriver binary is taken from the module-level CHROME_DRIVER_PATH.
    """
    chrome_service = webdriver.ChromeService(executable_path=CHROME_DRIVER_PATH)
    chrome_options = webdriver.ChromeOptions()

    browser = webdriver.Chrome(service=chrome_service, options=chrome_options)
    browser.set_window_size(1280, 1024)
    return browser


def get_stars(container):
    """Record every starred repository found inside *container* in OUTPUT_DICT.

    For each candidate element the repository link is located, the entry's
    ``<details>`` menu is opened, and the text of every list item whose
    ``<input>`` is selected is collected as a category. The result is stored
    as ``OUTPUT_DICT[name] = {"url": ..., "categories": [...]}``, where
    ``name`` is the link text with spaces removed.

    Repositories without any selected category are also printed to stdout.

    Args:
        container: Element (anything exposing ``find_elements``) that wraps
            the list of starred repositories.
    """
    candidates = container.find_elements(By.XPATH, ".//div/div/div")

    for candidate in candidates:
        try:
            repo = candidate.find_element(By.XPATH, "./div/h3/a")
            details = candidate.find_element(By.XPATH, ".//details")

            # Open the menu attached to this entry.
            details.find_element(By.XPATH, ".//summary").click()
            # wait for menu to appear and load
            time.sleep(1)

            star_menu = details.find_element(By.XPATH, ".//details-menu")
            star_menu_close = WebDriverWait(details, 5).until(
                ec.presence_of_element_located(
                    (By.XPATH, ".//button[@class='SelectMenu-closeButton']")
                )
            )

            # Keep the text of each menu item whose input is selected.
            categories = [
                item.text
                for item in star_menu.find_elements(By.XPATH, ".//div[@role='listitem']")
                if item.find_element(By.TAG_NAME, "input").is_selected()
            ]

            # Every attribute access is a WebDriver round trip, so read once.
            repo_name = repo.text.replace(" ", "")
            repo_url = repo.get_attribute("href")

            if not categories:
                print(repo_name, repo_url)

            star_menu_close.click()
            OUTPUT_DICT[repo_name] = {
                "url": repo_url,
                "categories": categories,
            }
        except NoSuchElementException:
            # Deliberate best effort: the broad XPath above presumably also
            # matches divs that are not repository entries; skip those.
            continue


def main():
    """Log in to GitHub, walk through all pages of stars and dump them to JSON.

    Steps:
      1. Open GH_LOGIN_URL and submit GH_USERNAME / GH_LOGIN_PASSWORD.
      2. Wait up to GH_LOGIN_WAIT seconds for the "AppHeader-user" element,
         giving the user time to finish the login (e.g. enter an OTP code).
      3. Open GH_STARS_URL and call get_stars() on every page, following the
         "Next" link until it is disabled or absent.
      4. Print the elapsed time and write OUTPUT_DICT to OUTPUT_FILE.

    The browser is always closed, even when scraping fails part-way; in that
    case the exception propagates and no output file is written.
    """
    driver = driver_setup()

    try:
        driver.get(GH_LOGIN_URL)

        wait = WebDriverWait(driver, 10)
        insert_username = wait.until(ec.presence_of_element_located((By.NAME, "login")))
        insert_username.send_keys(GH_USERNAME)

        insert_password = driver.find_element(By.NAME, "password")
        insert_password.send_keys(GH_LOGIN_PASSWORD)

        sign_in = driver.find_element(By.NAME, "commit")
        sign_in.click()

        # Input 2FA OTP code
        # Wait up to GH_LOGIN_WAIT seconds for the user to finish the login
        # procedure (enter username, password and OTP)
        start_time = time.perf_counter()

        WebDriverWait(driver, GH_LOGIN_WAIT).until(
            ec.presence_of_element_located((By.CLASS_NAME, "AppHeader-user"))
        )

        # open stars page
        driver.get(GH_STARS_URL)

        while True:
            stars_frame = driver.find_element(By.ID, "user-starred-repos")
            get_stars(stars_frame)

            # find_elements returns an empty list instead of raising, so it
            # can be used directly as an existence check.
            next_disabled = driver.find_elements(
                By.XPATH, '//button[@disabled="disabled" and text()="Next"]'
            )
            next_links = [] if next_disabled else driver.find_elements(By.LINK_TEXT, "Next")

            # No clickable "Next" link: either it is disabled or there is no
            # pagination at all (a single page of stars).
            if not next_links:
                print("Reached the last page")
                break

            next_links[0].click()
            # wait for the next page to load
            time.sleep(5)
    finally:
        # Always release the browser and the chromedriver process.
        driver.quit()

    # divmod keeps minutes above 59 intact; "%M:%S" would silently drop hours.
    minutes, seconds = divmod(int(time.perf_counter() - start_time), 60)
    print(f"Scraped in (mm:ss): {minutes:02d}:{seconds:02d}")

    with open(OUTPUT_FILE, "w") as _out_f:
        json.dump(OUTPUT_DICT, _out_f, indent=2)


if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
selenium
python-dotenv

0 comments on commit 41b7352

Please sign in to comment.