-
Notifications
You must be signed in to change notification settings - Fork 3
/
full_email_export.py
78 lines (65 loc) · 3.32 KB
/
full_email_export.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from selenium.common.exceptions import WebDriverException
from body_finder import setup_driver
import pandas as pd
import re
import urllib.parse
from body_finder import get_cleaned_html
from lead_extraction import extract_leads_from_html
import json
import time
import os
def get_phone(response_text):
    """Return the first phone-number-like substring of ``response_text``.

    The pattern accepts an optional opening parenthesis, a three-digit group
    whose first digit is 2-9, an optional closing parenthesis, an optional
    separator (hyphen, dot or space), a second three-digit group whose first
    digit is 2-9, another optional separator, and four final digits.

    Returns the matched text, or the placeholder string
    ``'Phone number not found'`` when nothing matches.
    """
    pattern = r'\(?\b[2-9][0-9]{2}\)?[-. ]?[2-9][0-9]{2}[-. ]?[0-9]{4}\b'
    found = re.compile(pattern).search(response_text)
    if found is None:
        return 'Phone number not found'
    return found.group(0)
def get_email(response_text):
    """Return the first e-mail-like substring of ``response_text``.

    A match is a run of letters, digits, dots, underscores or hyphens,
    followed by ``@``, a similar run, a literal dot, and a final run of
    letters, digits, underscores or hyphens.

    Returns the matched text, or the placeholder string
    ``'Email not found'`` when nothing matches.
    """
    pattern = r'([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)'
    found = re.compile(pattern).search(response_text)
    if found is None:
        return 'Email not found'
    return found.group(0)
def extract_and_process_links(driver, base_url):
    """Collect the external web links found on the page at ``base_url``.

    The page is loaded with ``driver`` and every ``<a href>`` element is
    inspected. A link is kept only when all of the following hold:

    * its href is non-empty and parses as an ``http``/``https`` URL with a
      network location (so ``mailto:``, ``tel:``, ``javascript:`` and similar
      links are skipped instead of being mistaken for external sites);
    * its network location differs from that of ``base_url``;
    * the href does not contain any of the excluded host names
      (linkedin.com, twitter.com, pinterest.com, featured.com).

    Args:
        driver: Object providing ``get(url)`` and ``find_elements(by, value)``;
            the returned elements must provide ``get_attribute(name)``.
        base_url: URL of the page to scan.

    Returns:
        A list of href strings in document order; duplicates are preserved.
    """
    # Matched as substrings of the whole href, not only of the host part.
    excluded_fragments = ("linkedin.com", "twitter.com", "pinterest.com", "featured.com")

    driver.get(base_url)
    base_domain = urllib.parse.urlparse(base_url).netloc

    results = []
    for link in driver.find_elements("css selector", "a[href]"):
        href = link.get_attribute('href')
        if not href:
            continue
        try:
            parsed = urllib.parse.urlparse(href)
        except ValueError:
            # A single malformed href must not abort the scan of the page.
            continue
        # Links without a web scheme or host have an empty netloc, which would
        # otherwise compare as "different from the base domain".
        if parsed.scheme not in ("http", "https") or not parsed.netloc:
            continue
        if parsed.netloc == base_domain:
            continue
        if any(fragment in href for fragment in excluded_fragments):
            continue
        results.append(href)
    return results
def process_website(driver, url):
    """Load ``url`` with ``driver`` and scan its page source for contact data.

    The page source is passed to ``get_phone`` and ``get_email``; the outcome
    is printed and returned.

    Args:
        driver: Browser driver providing ``get(url)`` and ``page_source``.
        url: Address of the page to load.

    Returns:
        A dict with the keys ``'Website'``, ``'Phone'`` and ``'Email'``, or
        ``None`` when the driver raises ``WebDriverException`` while loading
        the page or reading its source (the error is printed).
    """
    # Only the driver interactions are expected to raise WebDriverException.
    try:
        driver.get(url)
        page_html = driver.page_source
    except WebDriverException as e:
        print(f"Error accessing {url}: {e}")
        return None

    phone_number = get_phone(page_html)
    email_address = get_email(page_html)

    print(f'Processed {url}:')
    print(f'Phone: {phone_number}, Email: {email_address}\n')

    return {
        'Website': url,
        'Phone': phone_number,
        'Email': email_address,
    }
def generate_leads_from_csv(csv_file_path="mini-articles.csv", output_file_path="output.json", driver_path="/usr/local/bin/geckodriver"):
    """Extract leads for every article URL in a CSV and save them as JSON.

    For each row, the value of the ``Website`` column is passed to
    ``get_cleaned_html``; the result is handed to ``extract_leads_from_html``,
    whose return value is parsed with ``json.loads``. The accumulated
    ``{article_url: leads}`` mapping is rewritten to ``output_file_path`` and
    synced to disk after every article, so results gathered so far survive a
    later failure.

    Args:
        csv_file_path: Path of the input CSV; it must contain a ``Website``
            column.
        output_file_path: Path of the JSON file to write. It is opened in
            ``'w'`` mode, so existing content is discarded; if the CSV has no
            rows nothing is written and the file is left empty.
        driver_path: Driver executable path, passed to ``setup_driver`` and to
            ``get_cleaned_html``.

    Raises:
        Errors from reading the CSV, fetching or extracting a page, or parsing
        the extracted JSON are not caught here. The driver created at the
        start is quit before any such error propagates.
    """
    # NOTE(review): this driver is never used below; get_cleaned_html is given
    # driver_path instead. Confirm whether creating it here is still required.
    driver = setup_driver(driver_path)
    try:
        # Load initial URLs from CSV
        articles_df = pd.read_csv(csv_file_path)

        with open(output_file_path, 'w') as f:
            url_to_leads_mapping = {}
            for _, row in articles_df.iterrows():
                article_url = row['Website']
                result = get_cleaned_html(article_url, driver_path)
                print(f"Processing webpage: {article_url}")
                print(f"Extracted HTML body content length: {len(result)}")
                # Rough heuristic used by the original code: one token per five characters.
                print(f"Estimated token count: {round(len(result)/5)}")

                leads_json_str = extract_leads_from_html(result)
                leads_python_obj = json.loads(leads_json_str)
                print(f"JSON: {leads_python_obj}")
                url_to_leads_mapping[article_url] = leads_python_obj

                # Rewrite the whole file from the start on every iteration so it
                # always holds one complete JSON document.
                f.seek(0)
                json.dump(url_to_leads_mapping, f, indent=4)
                f.truncate()  # Drop leftover bytes from a previous, longer write
                f.flush()
                os.fsync(f.fileno())  # Force the data to disk

                time.sleep(2)  # Wait 2 seconds to prevent rate limiting issues
    finally:
        # Always release the browser session, including when an article fails.
        driver.quit()