-
Notifications
You must be signed in to change notification settings - Fork 0
/
indeed.py
104 lines (91 loc) · 3.93 KB
/
indeed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
import csv
import re
# Configure Chrome: incognito window, GPU acceleration disabled.
options = webdriver.ChromeOptions()
for flag in ("--incognito", '--disable-gpu'):
    options.add_argument(flag)

# Start the browser and open the Indeed landing page (es.indeed.com).
url = "https://es.indeed.com"
driver = webdriver.Chrome(options=options)
driver.get(url)

# Shared explicit wait: each wait.until(...) call gives up after 5 seconds.
wait = WebDriverWait(driver, 5)
# Locate the search box (the input named "q") and empty it.
# find_element(By.NAME, ...) is used because the find_element_by_* helpers
# were removed in Selenium 4.3; this form works on older releases as well.
search_bar = driver.find_element(By.NAME, "q")
search_bar.clear()

# Type your job; the location goes in the same query,
# e.g. "data analyst dublin".
keyword = "data madrid"
print("looking for", keyword)

# Enter the query and press RETURN to submit the search.
search_bar.send_keys(keyword)
search_bar.send_keys(Keys.RETURN)
# Collect the href of every job title, one result page at a time, until no
# further page can be reached.
links = []
while True:
    try:
        # Job-title anchors of the current result page. Waiting inside the
        # try lets a page without any of them (e.g. a search with no
        # results) end the crawl cleanly instead of aborting the script with
        # an uncaught TimeoutException.
        new_links = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".jobtitle.turnstileLink ")))
        # Anchors without an href are skipped: driver.get() cannot open them.
        hrefs = (anchor.get_attribute("href") for anchor in new_links)
        links.extend(href for href in hrefs if href)

        # Last entry of the pagination list (presumably the "next page"
        # link). EC is needed as otherwise the element was not clickable.
        next_page = wait.until(EC.element_to_be_clickable((By.XPATH, "//ul[contains(@class, 'agination')]/li[last()]/a")))
        # ActionChains is used because, per the original author's note,
        # Indeed opens a small window that has to be closed to continue.
        ActionChains(driver).move_to_element(next_page).click().perform()
    except TimeoutException:
        # Nothing (more) to read or no further page: the crawl is finished.
        print("links scraped")
        break
# One list per CSV column. Every list receives exactly one entry per visited
# link (a placeholder when the element is missing), so the lists stay aligned.
offer_links = []
positions = []
companies = []
days = []
conditions = []

# Word that follows the day count in the posting footer; change "días" to
# "days" or to the translation used by your Indeed site.
release_date = "días"
# Compiled once, outside the loop. Raw string so "\d" is a regex escape and
# not an invalid string escape; re.escape keeps the configurable word literal.
release_pattern = re.compile(rf"(\d+).*({re.escape(release_date)})")
# My condition -- I wanted jobs that included python; change or add the
# conditions you'd like. "ython" is matched as a plain substring of the page.
condition = "ython"


def _find_or_none(xpath):
    """Return the first element matching *xpath* on the current page, or None.

    Uses find_element(By.XPATH, ...) because the find_element_by_* helpers
    were removed in Selenium 4.3.
    """
    try:
        return driver.find_element(By.XPATH, xpath)
    except NoSuchElementException:
        return None


try:
    for link in links:
        driver.get(link)

        # Link to the original offer, when the page provides one.
        offer_anchor = _find_or_none("//div[contains(@class, 'icl-u-xs-hide icl-u-lg-block icl-u-lg-textCenter')]/a")
        if offer_anchor is None:
            offer_links.append("no original link offer")
        else:
            offer_links.append(offer_anchor.get_attribute("href"))

        # Job position.
        title = _find_or_none("//h3[contains(@class, 'jobsearch-JobInfoHeader-title')]")
        positions.append("no position description" if title is None else title.text)

        # Company.
        company_box = _find_or_none("//div[contains(@class, 'icl-u-lg-mr--sm icl-u-xs-mr--xs')]")
        companies.append("no company description" if company_box is None else company_box.text)

        # Release day: "<n> <release_date>" when the footer states a day
        # count, otherwise the posting is assumed to be from today/yesterday.
        footer = _find_or_none("//div[contains(@class, 'jobsearch-JobMetadataFooter')]")
        if footer is None:
            days.append("no release description")
        else:
            found = release_pattern.search(footer.text)
            if found:
                days.append(f"{found.group(1)} {found.group(2)}")
            else:
                days.append("today/yesterday")

        # Flag the offers whose page source contains the condition text.
        conditions.append("python" if condition in driver.page_source else "nop")
finally:
    # The browser is not needed after this loop; close it even when a page
    # fails so no Chrome/chromedriver process is left running.
    driver.quit()
# Save the results as CSV. The file is named after the search keyword with
# spaces turned into dashes, e.g. "data madrid" -> "data-madrid.csv".
csv_name = f"{keyword.replace(' ', '-')}.csv"
fieldnames = ["indeed_link", "offer_link", "position", "company", "release day", "contains"]

# encoding="utf-8": the scraped text contains non-ASCII characters (e.g.
# "días"), which the platform default encoding may be unable to write.
# newline='' is what the csv module requires for files it writes to.
with open(csv_name, 'w', newline='', encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    # The column lists are passed in the same order as fieldnames, so each
    # zipped tuple maps directly onto one row.
    writer.writerows(
        dict(zip(fieldnames, row))
        for row in zip(links, offer_links, positions, companies, days, conditions)
    )

print(f"{csv_name} file available")