-
Notifications
You must be signed in to change notification settings - Fork 1.9k
/
email_harvester.py
16 lines (14 loc) · 846 Bytes
/
email_harvester.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import re
from requests_html import HTMLSession
import sys
url = sys.argv[1]
EMAIL_REGEX = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
# initiate an HTTP session
session = HTMLSession()
# get the HTTP Response
r = session.get(url)
# for JAVA-Script driven websites
r.html.render()
with open(sys.argv[2], "a") as f:
for re_match in re.finditer(EMAIL_REGEX, r.html.raw_html.decode()):
print(re_match.group().strip(), file=f)