
Commit

update deps and use single browser at the cost of collecting home pages synchronously
intael committed Dec 20, 2023
1 parent 718044c commit 8c6f03a
Showing 6 changed files with 106 additions and 118 deletions.
27 changes: 8 additions & 19 deletions docker-compose.yml
@@ -1,42 +1,31 @@
version: '3.3'
version: '3.8'

services:
core:
build:
dockerfile: docker/Dockerfile
context: .
container_name: REcrawler
container_name: recrawler
environment:
- HOST=172.28.1.2
- HOST=db
- PORT=3306
- MYSQL_DATABASE=recrawler
- MYSQL_USER=root
- MYSQL_ROOT_PASSWORD=passwd
entrypoint: ["tail", "-f", "/dev/null"]
volumes:
- .:/usr/src/app
networks:
testing_net:
ipv4_address: 172.28.1.1
depends_on: ["db"]

db:
container_name: db_scrapper
image: mysql:8.0
image: mysql:8.0.35
platform: linux/amd64
environment:
- MYSQL_DATABASE=recrawler
- MYSQL_ROOT_PASSWORD=passwd
ports:
- 33069:3306
- 3306:3306
restart: always
volumes:
- ./docker/mysql/scripts:/docker-entrypoint-initdb.d:rw
networks:
testing_net:
ipv4_address: 172.28.1.2

networks:
testing_net:
ipam:
driver: default
config:
- subnet: 172.28.0.0/16
- ./docker/mysql/scripts:/docker-entrypoint-initdb.d:rw
7 changes: 4 additions & 3 deletions docker/Dockerfile
@@ -1,10 +1,11 @@
FROM adoptopenjdk/maven-openjdk11:latest
FROM maven:3.9.5-amazoncorretto-17-debian-bookworm

RUN apt-get update && apt-get install mysql-client -y
RUN apt update && apt upgrade -y

# Install project dependencies and keep sources
WORKDIR /usr/src/app

# install maven dependency packages (keep in image)
COPY . .
RUN mvn clean package
RUN mvn clean package && \
mvn exec:java -e -D exec.mainClass=com.microsoft.playwright.CLI -D exec.args="install --with-deps chromium"
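The new build step bakes the browser into the image: after packaging, the Playwright CLI is invoked through the exec-maven-plugin to download Chromium and its system-level dependencies, so nothing has to be fetched when the container starts crawling. Since com.microsoft.playwright.CLI is just a main class, roughly the same install can be triggered from plain Java; a minimal sketch (the InstallBrowsers class name is made up and not part of this commit):

import com.microsoft.playwright.CLI;

// Hypothetical helper: programmatic equivalent of the exec-maven-plugin call above.
public class InstallBrowsers {
  public static void main(String[] args) {
    try {
      // Downloads Chromium and installs its OS dependencies, mirroring
      // "install --with-deps chromium" from the Dockerfile.
      CLI.main(new String[] {"install", "--with-deps", "chromium"});
    } catch (Exception e) {
      throw new RuntimeException("Playwright browser installation failed", e);
    }
  }
}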
16 changes: 8 additions & 8 deletions pom.xml
@@ -16,7 +16,7 @@
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.14.3</version>
<version>1.17.1</version>
</dependency>
<dependency>
<groupId>org.realityforge.org.jetbrains.annotations</groupId>
@@ -27,7 +27,7 @@
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
<version>4.5.14</version>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
@@ -52,9 +52,9 @@
<version>1.7.36</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.22.0</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
@@ -65,7 +65,7 @@
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>8.0.28</version>
<version>8.0.33</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
@@ -85,12 +85,12 @@
<dependency>
<groupId>org.apache.maven</groupId>
<artifactId>maven-archiver</artifactId>
<version>3.5.2</version>
<version>3.6.1</version>
</dependency>
<dependency>
<groupId>com.microsoft.playwright</groupId>
<artifactId>playwright</artifactId>
<version>1.21.0</version>
<version>1.40.0</version>
</dependency>
</dependencies>

5 changes: 4 additions & 1 deletion src/main/java/Main.java
@@ -22,9 +22,9 @@ public class Main {
private static final String GEOAREA3_SHORT_NAME = "g3";
private static final String MIN_BATHROOMS = "minBathrooms";
private static final String MIN_ROOMS = "minRooms";
private static Logger LOGGER = LoggerFactory.getLogger(Main.class);

public static void main(String[] args) throws Exception {
Logger LOGGER = LoggerFactory.getLogger(Main.class);
new CmdLineAppBuilder(args)
.withJarName("REcrawler") // just for a help text (-h option)
.withDescription("My program does ...")
@@ -62,6 +62,7 @@ public static void main(String[] args) throws Exception {
.withEntryPoint(Main::run)
.build()
.run();
System.exit(0);
}

private static void run(CommandLine commandLine) {
@@ -76,7 +77,9 @@ private static void run(CommandLine commandLine) {
break;
}
crawler.crawl();
LOGGER.info("Crawling finalized successfully! Persisting data...");
crawler.getCollectedRealEstates().forEach(realEstateRepository::save);
LOGGER.info("Data persisted successfully!");
}

private static UrlBuilder createSearchUrlBuilder(Site site, CommandLine commandLineApp) {
17 changes: 10 additions & 7 deletions PlaywrightSiteCollector.java
@@ -5,6 +5,8 @@
import com.microsoft.playwright.*;
import java.io.IOException;
import java.util.Optional;

import com.microsoft.playwright.options.LoadState;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
@@ -14,32 +16,33 @@
public class PlaywrightSiteCollector implements SiteCollector {
private static final Logger LOGGER = LoggerFactory.getLogger(PlaywrightSiteCollector.class);
private final int maxRetries;
private final BrowserType.LaunchOptions launchOptions;
private final Browser browser;

@Inject
public PlaywrightSiteCollector(
@LaunchOptionsQualifier BrowserType.LaunchOptions launchOptions,
@Named("maxRetries") int maxRetries) {
this.maxRetries = maxRetries;
this.launchOptions = launchOptions;
this.browser = Playwright.create().chromium().launch(launchOptions);
}

@Override
public Optional<Document> collect(String url) {
int retries = 0;
synchronized public Optional<Document> collect(String url) {
var retries = 0;
while (retries <= maxRetries) {
try (Browser browser = Playwright.create().chromium().launch(launchOptions);
Page page = browser.newPage()) {
try (Page page = browser.newPage()) {
Response response = page.navigate(url);
page.waitForLoadState(LoadState.DOMCONTENTLOADED);
if (response.status() != 200) {
throw new IOException("Response is not 200.");
}
return Optional.of(Jsoup.parse(response.text()));
} catch (IOException ioe) {
} catch (IOException | PlaywrightException ioe) {
LOGGER.info("Failed to collect url: " + url + ioe.getMessage() + " Retrying.");
retries++;
}
}
LOGGER.warn("Giving up on retries for " + url);
return Optional.empty();
}
}
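The collector now launches a single shared Browser in its constructor and serializes access by making collect synchronized, instead of starting a fresh Chromium per request. Nothing shown here ever closes that browser or the underlying Playwright driver, which is presumably why Main now calls System.exit(0). A possible follow-up, sketched below and not part of this commit, keeps the Playwright handle and exposes an explicit close() (the ClosableSiteCollector name and plain constructor are made up; the Guice annotations are omitted):

import com.microsoft.playwright.*;
import com.microsoft.playwright.options.LoadState;
import java.io.IOException;
import java.util.Optional;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

// Hypothetical variant: same single-browser idea, but the Playwright handle is
// kept so the browser and driver process can be released on shutdown.
public class ClosableSiteCollector implements AutoCloseable {
  private final Playwright playwright;
  private final Browser browser;
  private final int maxRetries;

  public ClosableSiteCollector(BrowserType.LaunchOptions launchOptions, int maxRetries) {
    this.playwright = Playwright.create();                      // one driver process
    this.browser = playwright.chromium().launch(launchOptions); // one shared browser
    this.maxRetries = maxRetries;
  }

  // Serialized like the commit's collect(): Playwright objects are not thread-safe,
  // so concurrent callers take turns on the shared browser.
  public synchronized Optional<Document> collect(String url) {
    for (int attempt = 0; attempt <= maxRetries; attempt++) {
      try (Page page = browser.newPage()) { // one tab per request, closed by try-with-resources
        Response response = page.navigate(url);
        page.waitForLoadState(LoadState.DOMCONTENTLOADED);
        if (response.status() != 200) {
          throw new IOException("Response is not 200.");
        }
        return Optional.of(Jsoup.parse(response.text()));
      } catch (IOException | PlaywrightException e) {
        // transient failure: fall through and retry
      }
    }
    return Optional.empty();
  }

  @Override
  public void close() {
    browser.close();    // closes Chromium
    playwright.close(); // shuts down the Playwright driver process
  }
}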
152 changes: 72 additions & 80 deletions src/main/java/webcrawling/spanishestate/SpanishEstateWebCrawler.java
@@ -11,6 +11,7 @@
import java.util.concurrent.Future;
import java.util.function.Consumer;
import java.util.stream.Collectors;

import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
@@ -23,93 +24,84 @@
import webcrawling.utils.FetchDocumentCallable;

public class SpanishEstateWebCrawler implements WebCrawler {
private static final Logger LOGGER = LoggerFactory.getLogger(SpanishEstateWebCrawler.class);
private final Set<URL> searchResultsPages = new HashSet<>();
private final Set<RealEstate> collectedHomes = ConcurrentHashMap.newKeySet();
private final Map<URL, String> listingUrlToSearchUrl = new HashMap<>();
private final HtmlParser<URL> searchResultsPageParser;
private final SiteCollector siteCollector;
private final HtmlParser<RealEstate> listingHtmlParser;
private final UrlBuilder urlBuilder;

public Set<URL> getSearchResultsPages() {
return new HashSet<>(searchResultsPages); // defensive copy
}
private static final Logger LOGGER = LoggerFactory.getLogger(SpanishEstateWebCrawler.class);
private final Set<URL> searchResultsPages = new HashSet<>();
private final Set<RealEstate> collectedHomes = ConcurrentHashMap.newKeySet();
private final HtmlParser<URL> searchResultsPageParser;
private final SiteCollector siteCollector;
private final HtmlParser<RealEstate> listingHtmlParser;
private final UrlBuilder urlBuilder;

public Set<RealEstate> getCollectedRealEstates() {
return new HashSet<>(collectedHomes); // defensive copy
}
public Set<RealEstate> getCollectedRealEstates() {
return new HashSet<>(collectedHomes);
}

public SpanishEstateWebCrawler(
@NotNull SiteCollector siteCollector,
@NotNull HtmlParser<RealEstate> listingHtmlParser,
@NotNull HtmlParser<URL> searchResultsPageHtmlParser,
@NotNull UrlBuilder urlBuilder) {
this.siteCollector = siteCollector;
this.listingHtmlParser = listingHtmlParser;
this.searchResultsPageParser = searchResultsPageHtmlParser;
this.urlBuilder = urlBuilder;
}
public SpanishEstateWebCrawler(
@NotNull SiteCollector siteCollector,
@NotNull HtmlParser<RealEstate> listingHtmlParser,
@NotNull HtmlParser<URL> searchResultsPageHtmlParser,
@NotNull UrlBuilder urlBuilder) {
this.siteCollector = siteCollector;
this.listingHtmlParser = listingHtmlParser;
this.searchResultsPageParser = searchResultsPageHtmlParser;
this.urlBuilder = urlBuilder;
}

@Override
public void crawl() {
Optional<Document> currentSearchResultsPage;
List<URL> searchResultPages = List.of();
int page = 1;
LOGGER.info("Starting to collect search result pages.");
while (page == 1 || searchResultPages.size() < 10) { // !searchResultPages.isEmpty()
String newSearchResultsPageUrl = urlBuilder.buildUrl(page);
LOGGER.info("----------------");
LOGGER.info("Search Results page: " + page);
currentSearchResultsPage = fetchSearchResultsPage(newSearchResultsPageUrl);
searchResultPages =
currentSearchResultsPage.map(searchResultsPageParser::parse).orElseGet(ArrayList::new);
searchResultsPages.addAll(searchResultPages);
searchResultPages.forEach(
(listingUrl) -> listingUrlToSearchUrl.put(listingUrl, newSearchResultsPageUrl));
LOGGER.info("# Home pages collected: " + searchResultsPages.size());
page++;
@Override
public void crawl() {
List<URL> listingUrls;
var page = 1;
LOGGER.info("Starting to collect search result pages.");
do {
String newSearchResultsPageUrl = urlBuilder.buildUrl(page);
LOGGER.info("----------------");
LOGGER.info("Search Results page: " + page);
listingUrls = fetchSearchResultsPage(newSearchResultsPageUrl)
.map(searchResultsPageParser::parse).orElseGet(List::of);
searchResultsPages.addAll(listingUrls);
LOGGER.info("# Home pages collected: " + searchResultsPages.size());
page++;
} while (!listingUrls.isEmpty());
LOGGER.info("Home page URLs collected successfully! Collecting home pages now!");
collectPagesConcurrently(searchResultsPages, this::getHome);
}
LOGGER.info("Home page URLs collected successfully! Collecting home pages now!");
collectPagesConcurrently(searchResultsPages, this::getHome);
}

private Optional<Document> fetchSearchResultsPage(String searchUrl) {
try {
Document currentSearchResultsPage;
LOGGER.info("Attempting to fetch search result page...");
currentSearchResultsPage =
siteCollector
.collect(searchUrl)
.orElseThrow(() -> new IOException("Initial Search Results Page failed"));
return Optional.of(currentSearchResultsPage);
} catch (IOException ex) {
LOGGER.error("Initial search results request failed.");
private Optional<Document> fetchSearchResultsPage(String searchUrl) {
try {
Document currentSearchResultsPage;
LOGGER.info("Attempting to fetch search result page...");
currentSearchResultsPage =
siteCollector
.collect(searchUrl)
.orElseThrow(() -> new IOException("Initial Search Results Page failed"));
return Optional.of(currentSearchResultsPage);
} catch (IOException ex) {
LOGGER.error("Initial search results request failed.");
}
return Optional.empty();
}
return Optional.empty();
}

private void collectPagesConcurrently(Set<URL> pages, Consumer<Document> htmlActionCallback) {
ExecutorService executorService = Executors.newWorkStealingPool(20);
List<Future<Optional<Document>>> futures;
List<Callable<Optional<Document>>> callables =
pages.stream()
.map((url) -> new FetchDocumentCallable(url.toString(), siteCollector))
.collect(Collectors.toList());
try {
futures = executorService.invokeAll(callables);
executorService.shutdown();
for (Future<Optional<Document>> future : futures) {
future.get().ifPresent(htmlActionCallback);
}
} catch (InterruptedException | ExecutionException interruptedException) {
LOGGER.error("Failed to collect home page urls: " + interruptedException.getMessage());
private void collectPagesConcurrently(Set<URL> pages, Consumer<Document> htmlActionCallback) {
ExecutorService executorService = Executors.newWorkStealingPool(20);
List<Future<Optional<Document>>> futures;
List<Callable<Optional<Document>>> callables =
pages.stream()
.map((url) -> new FetchDocumentCallable(url.toString(), siteCollector))
.collect(Collectors.toList());
try {
futures = executorService.invokeAll(callables);
executorService.shutdown();
for (Future<Optional<Document>> future : futures) {
future.get().ifPresent(htmlActionCallback);
}
} catch (InterruptedException | ExecutionException interruptedException) {
LOGGER.error("Failed to collect home page urls: " + interruptedException.getMessage());
}
}
}

private void getHome(Document resultsPage) {
// TODO: always a list of one element... consider adding a parseOne method to HtmlParser
this.collectedHomes.addAll(listingHtmlParser.parse(resultsPage));
LOGGER.info("Saved: " + resultsPage.title());
}
private void getHome(Document resultsPage) {
// TODO: always a list of one element... consider adding a parseOne method to HtmlParser
this.collectedHomes.addAll(listingHtmlParser.parse(resultsPage));
LOGGER.info("Saved: " + resultsPage.title());
}
}
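collectPagesConcurrently still fans the home-page URLs out over a 20-thread work-stealing pool, but because the single-browser collector's collect() is now synchronized, the fetches effectively execute one at a time; that is the trade-off named in the commit message. Under that assumption the executor adds overhead without adding throughput, and a plain loop inside SpanishEstateWebCrawler would do the same work. A minimal sketch (hypothetical helper, not part of this commit):

  // Hypothetical sequential replacement for collectPagesConcurrently: with the
  // synchronized single-browser collector, fetches are serialized anyway.
  private void collectPagesSequentially(Set<URL> pages, Consumer<Document> htmlActionCallback) {
    for (URL url : pages) {
      siteCollector.collect(url.toString()).ifPresent(htmlActionCallback);
    }
  }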
