-
Notifications
You must be signed in to change notification settings - Fork 3
/
body_finder.py
48 lines (45 loc) · 1.87 KB
/
body_finder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
def setup_driver(driver_path):
options = Options()
options.add_argument("--headless")
options.headless = True # Enable headless mode
# Automatically manage firefoxdriver
# Set the path to the GeckoDriver executable
gecko_driver_path = driver_path
driver = webdriver.Firefox(service=Service(executable_path=gecko_driver_path), options=options)
return driver
def get_cleaned_html(url, driver_path):
driver = setup_driver(driver_path)
try:
# Navigate to the URL
driver.get(url)
# Get the page source
page_source = driver.page_source
# Use BeautifulSoup to parse the HTML content
soup = BeautifulSoup(page_source, "html.parser")
# Find the body tag
body = soup.find('body')
# Remove specific tags by decomposing them
for tag in body.find_all(["script", "head", "link"]):
tag.decompose()
# Iterate over all tags to clean up attributes
for tag in body.find_all(True):
attrs_to_keep = {}
# For <a> tags, keep the href attribute
if tag.name == "a" and tag.has_attr("href"):
attrs_to_keep["href"] = tag["href"]
# For <img> tags, keep the alt attribute
if tag.name == "img" and tag.has_attr("alt"):
attrs_to_keep["alt"] = tag["alt"]
# For any tag, if it has a title attribute, keep it
if tag.has_attr("title"):
attrs_to_keep["title"] = tag["title"]
# Update the tag's attributes to only keep the ones we want
tag.attrs = attrs_to_keep
# Return the prettified HTML of the body
return body.prettify()
finally:
driver.quit()