diff --git a/package-lock.json b/package-lock.json index 91e4261343..936ce8be22 100644 --- a/package-lock.json +++ b/package-lock.json @@ -3303,6 +3303,11 @@ "@types/node": "*" } }, + "node_modules/@types/http-cache-semantics": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/@types/http-cache-semantics/-/http-cache-semantics-4.0.1.tgz", + "integrity": "sha512-SZs7ekbP8CN0txVG2xVRH6EgKmEm31BOxA07vkFaETzZz1xh+cbt8BcI0slpymvwhx5dlFnQG2rTlPVQn+iRPQ==" + }, "node_modules/@types/istanbul-lib-coverage": { "version": "2.0.4", "resolved": "https://registry.npmjs.org/@types/istanbul-lib-coverage/-/istanbul-lib-coverage-2.0.4.tgz", @@ -7777,6 +7782,30 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/got": { + "version": "12.6.0", + "resolved": "https://registry.npmjs.org/got/-/got-12.6.0.tgz", + "integrity": "sha512-WTcaQ963xV97MN3x0/CbAriXFZcXCfgxVp91I+Ze6pawQOa7SgzwSx2zIJJsX+kTajMnVs0xcFD1TxZKFqhdnQ==", + "dependencies": { + "@sindresorhus/is": "^5.2.0", + "@szmarczak/http-timer": "^5.0.1", + "cacheable-lookup": "^7.0.0", + "cacheable-request": "^10.2.8", + "decompress-response": "^6.0.0", + "form-data-encoder": "^2.1.2", + "get-stream": "^6.0.1", + "http2-wrapper": "^2.1.10", + "lowercase-keys": "^3.0.0", + "p-cancelable": "^3.0.0", + "responselike": "^3.0.0" + }, + "engines": { + "node": ">=14.16" + }, + "funding": { + "url": "https://github.com/sindresorhus/got?sponsor=1" + } + }, "node_modules/got-cjs": { "version": "12.5.4", "resolved": "https://registry.npmjs.org/got-cjs/-/got-cjs-12.5.4.tgz", @@ -7819,6 +7848,116 @@ "node": ">=15.10.0" } }, + "node_modules/got/node_modules/@sindresorhus/is": { + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/@sindresorhus/is/-/is-5.3.0.tgz", + "integrity": "sha512-CX6t4SYQ37lzxicAqsBtxA3OseeoVrh9cSJ5PFYam0GksYlupRfy1A+Q4aYD3zvcfECLc0zO2u+ZnR2UYKvCrw==", + "engines": { + "node": ">=14.16" + }, + "funding": { + "url": "https://github.com/sindresorhus/is?sponsor=1" + } + }, + "node_modules/got/node_modules/@szmarczak/http-timer": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/@szmarczak/http-timer/-/http-timer-5.0.1.tgz", + "integrity": "sha512-+PmQX0PiAYPMeVYe237LJAYvOMYW1j2rH5YROyS3b4CTVJum34HfRvKvAzozHAQG0TnHNdUfY9nCeUyRAs//cw==", + "dependencies": { + "defer-to-connect": "^2.0.1" + }, + "engines": { + "node": ">=14.16" + } + }, + "node_modules/got/node_modules/cacheable-lookup": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/cacheable-lookup/-/cacheable-lookup-7.0.0.tgz", + "integrity": "sha512-+qJyx4xiKra8mZrcwhjMRMUhD5NR1R8esPkzIYxX96JiecFoxAXFuz/GpR3+ev4PE1WamHip78wV0vcmPQtp8w==", + "engines": { + "node": ">=14.16" + } + }, + "node_modules/got/node_modules/cacheable-request": { + "version": "10.2.10", + "resolved": "https://registry.npmjs.org/cacheable-request/-/cacheable-request-10.2.10.tgz", + "integrity": "sha512-v6WB+Epm/qO4Hdlio/sfUn69r5Shgh39SsE9DSd4bIezP0mblOlObI+I0kUEM7J0JFc+I7pSeMeYaOYtX1N/VQ==", + "dependencies": { + "@types/http-cache-semantics": "^4.0.1", + "get-stream": "^6.0.1", + "http-cache-semantics": "^4.1.1", + "keyv": "^4.5.2", + "mimic-response": "^4.0.0", + "normalize-url": "^8.0.0", + "responselike": "^3.0.0" + }, + "engines": { + "node": ">=14.16" + } + }, + "node_modules/got/node_modules/form-data-encoder": { + "version": "2.1.4", + "resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-2.1.4.tgz", + "integrity": "sha512-yDYSgNMraqvnxiEXO4hi88+YZxaHC6QKzb5N84iRCTDeRO7ZALpir/lVmf/uXUhnwUr2O4HU8s/n6x+yNjQkHw==", + "engines": { + "node": ">= 14.17" + } + }, + "node_modules/got/node_modules/lowercase-keys": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-3.0.0.tgz", + "integrity": "sha512-ozCC6gdQ+glXOQsveKD0YsDy8DSQFjDTz4zyzEHNV5+JP5D62LmfDZ6o1cycFx9ouG940M5dE8C8CTewdj2YWQ==", + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/got/node_modules/mimic-response": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-4.0.0.tgz", + "integrity": "sha512-e5ISH9xMYU0DzrT+jl8q2ze9D6eWBto+I8CNpe+VI+K2J/F/k3PdkdTdz4wvGVH4NTpo+NRYTVIuMQEMMcsLqg==", + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/got/node_modules/normalize-url": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-8.0.0.tgz", + "integrity": "sha512-uVFpKhj5MheNBJRTiMZ9pE/7hD1QTeEvugSJW/OmLzAp78PB5O6adfMNTvmfKhXBkvCzC+rqifWcVYpGFwTjnw==", + "engines": { + "node": ">=14.16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/got/node_modules/p-cancelable": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-3.0.0.tgz", + "integrity": "sha512-mlVgR3PGuzlo0MmTdk4cXqXWlwQDLnONTAg6sm62XkMJEiRxN3GL3SffkYvqwonbkJBcrI7Uvv5Zh9yjvn2iUw==", + "engines": { + "node": ">=12.20" + } + }, + "node_modules/got/node_modules/responselike": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/responselike/-/responselike-3.0.0.tgz", + "integrity": "sha512-40yHxbNcl2+rzXvZuVkrYohathsSJlMTXKryG5y8uciHv1+xDLHQpgjG64JUO9nrEq2jGLH6IZ8BcZyw3wrweg==", + "dependencies": { + "lowercase-keys": "^3.0.0" + }, + "engines": { + "node": ">=14.16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/graceful-fs": { "version": "4.2.11", "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz", @@ -15760,7 +15899,8 @@ "dependencies": { "@apify/scraper-tools": "^1.1.1", "@crawlee/cheerio": "^3.1.0", - "apify": "^3.1.1" + "apify": "^3.1.1", + "got": "^12.6.0" }, "devDependencies": { "@apify/tsconfig": "^0.1.0", @@ -18383,6 +18523,11 @@ "@types/node": "*" } }, + "@types/http-cache-semantics": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/@types/http-cache-semantics/-/http-cache-semantics-4.0.1.tgz", + "integrity": "sha512-SZs7ekbP8CN0txVG2xVRH6EgKmEm31BOxA07vkFaETzZz1xh+cbt8BcI0slpymvwhx5dlFnQG2rTlPVQn+iRPQ==" + }, "@types/istanbul-lib-coverage": { "version": "2.0.4", "resolved": "https://registry.npmjs.org/@types/istanbul-lib-coverage/-/istanbul-lib-coverage-2.0.4.tgz", @@ -18746,6 +18891,7 @@ "@crawlee/cheerio": "^3.1.0", "@types/node": "^18.7.18", "apify": "^3.1.1", + "got": "^12.6.0", "ts-node": "^10.9.1", "typescript": "~5.0.0" } @@ -21830,6 +21976,91 @@ "get-intrinsic": "^1.1.3" } }, + "got": { + "version": "12.6.0", + "resolved": "https://registry.npmjs.org/got/-/got-12.6.0.tgz", + "integrity": "sha512-WTcaQ963xV97MN3x0/CbAriXFZcXCfgxVp91I+Ze6pawQOa7SgzwSx2zIJJsX+kTajMnVs0xcFD1TxZKFqhdnQ==", + "requires": { + "@sindresorhus/is": "^5.2.0", + "@szmarczak/http-timer": "^5.0.1", + "cacheable-lookup": "^7.0.0", + "cacheable-request": "^10.2.8", + "decompress-response": "^6.0.0", + "form-data-encoder": "^2.1.2", + "get-stream": "^6.0.1", + "http2-wrapper": "^2.1.10", + "lowercase-keys": "^3.0.0", + "p-cancelable": "^3.0.0", + "responselike": "^3.0.0" + }, + "dependencies": { + "@sindresorhus/is": { + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/@sindresorhus/is/-/is-5.3.0.tgz", + "integrity": "sha512-CX6t4SYQ37lzxicAqsBtxA3OseeoVrh9cSJ5PFYam0GksYlupRfy1A+Q4aYD3zvcfECLc0zO2u+ZnR2UYKvCrw==" + }, + "@szmarczak/http-timer": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/@szmarczak/http-timer/-/http-timer-5.0.1.tgz", + "integrity": "sha512-+PmQX0PiAYPMeVYe237LJAYvOMYW1j2rH5YROyS3b4CTVJum34HfRvKvAzozHAQG0TnHNdUfY9nCeUyRAs//cw==", + "requires": { + "defer-to-connect": "^2.0.1" + } + }, + "cacheable-lookup": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/cacheable-lookup/-/cacheable-lookup-7.0.0.tgz", + "integrity": "sha512-+qJyx4xiKra8mZrcwhjMRMUhD5NR1R8esPkzIYxX96JiecFoxAXFuz/GpR3+ev4PE1WamHip78wV0vcmPQtp8w==" + }, + "cacheable-request": { + "version": "10.2.10", + "resolved": "https://registry.npmjs.org/cacheable-request/-/cacheable-request-10.2.10.tgz", + "integrity": "sha512-v6WB+Epm/qO4Hdlio/sfUn69r5Shgh39SsE9DSd4bIezP0mblOlObI+I0kUEM7J0JFc+I7pSeMeYaOYtX1N/VQ==", + "requires": { + "@types/http-cache-semantics": "^4.0.1", + "get-stream": "^6.0.1", + "http-cache-semantics": "^4.1.1", + "keyv": "^4.5.2", + "mimic-response": "^4.0.0", + "normalize-url": "^8.0.0", + "responselike": "^3.0.0" + } + }, + "form-data-encoder": { + "version": "2.1.4", + "resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-2.1.4.tgz", + "integrity": "sha512-yDYSgNMraqvnxiEXO4hi88+YZxaHC6QKzb5N84iRCTDeRO7ZALpir/lVmf/uXUhnwUr2O4HU8s/n6x+yNjQkHw==" + }, + "lowercase-keys": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-3.0.0.tgz", + "integrity": "sha512-ozCC6gdQ+glXOQsveKD0YsDy8DSQFjDTz4zyzEHNV5+JP5D62LmfDZ6o1cycFx9ouG940M5dE8C8CTewdj2YWQ==" + }, + "mimic-response": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-4.0.0.tgz", + "integrity": "sha512-e5ISH9xMYU0DzrT+jl8q2ze9D6eWBto+I8CNpe+VI+K2J/F/k3PdkdTdz4wvGVH4NTpo+NRYTVIuMQEMMcsLqg==" + }, + "normalize-url": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-8.0.0.tgz", + "integrity": "sha512-uVFpKhj5MheNBJRTiMZ9pE/7hD1QTeEvugSJW/OmLzAp78PB5O6adfMNTvmfKhXBkvCzC+rqifWcVYpGFwTjnw==" + }, + "p-cancelable": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-3.0.0.tgz", + "integrity": "sha512-mlVgR3PGuzlo0MmTdk4cXqXWlwQDLnONTAg6sm62XkMJEiRxN3GL3SffkYvqwonbkJBcrI7Uvv5Zh9yjvn2iUw==" + }, + "responselike": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/responselike/-/responselike-3.0.0.tgz", + "integrity": "sha512-40yHxbNcl2+rzXvZuVkrYohathsSJlMTXKryG5y8uciHv1+xDLHQpgjG64JUO9nrEq2jGLH6IZ8BcZyw3wrweg==", + "requires": { + "lowercase-keys": "^3.0.0" + } + } + } + }, "got-cjs": { "version": "12.5.4", "resolved": "https://registry.npmjs.org/got-cjs/-/got-cjs-12.5.4.tgz", diff --git a/packages/actor-scraper/cheerio-scraper/.actor/actor.json b/packages/actor-scraper/cheerio-scraper/.actor/actor.json new file mode 100644 index 0000000000..f4128488f3 --- /dev/null +++ b/packages/actor-scraper/cheerio-scraper/.actor/actor.json @@ -0,0 +1,6 @@ +{ + "actorSpecification": 1, + "name": "cheerio-scraper", + "version": "0.1", + "buildTag": "latest" +} diff --git a/packages/actor-scraper/cheerio-scraper/apify.json b/packages/actor-scraper/cheerio-scraper/apify.json.deprecated similarity index 100% rename from packages/actor-scraper/cheerio-scraper/apify.json rename to packages/actor-scraper/cheerio-scraper/apify.json.deprecated diff --git a/packages/actor-scraper/cheerio-scraper/package.json b/packages/actor-scraper/cheerio-scraper/package.json index 4cfb5dd964..2cbffca2f1 100644 --- a/packages/actor-scraper/cheerio-scraper/package.json +++ b/packages/actor-scraper/cheerio-scraper/package.json @@ -7,7 +7,8 @@ "dependencies": { "@apify/scraper-tools": "^1.1.1", "@crawlee/cheerio": "^3.1.0", - "apify": "^3.1.1" + "apify": "^3.1.1", + "got": "^12.6.0" }, "devDependencies": { "@apify/tsconfig": "^0.1.0", diff --git a/packages/actor-scraper/cheerio-scraper/src/internals/consts.ts b/packages/actor-scraper/cheerio-scraper/src/internals/consts.ts index f324876790..2b7715d919 100644 --- a/packages/actor-scraper/cheerio-scraper/src/internals/consts.ts +++ b/packages/actor-scraper/cheerio-scraper/src/internals/consts.ts @@ -35,6 +35,7 @@ export interface Input { datasetName?: string; keyValueStoreName?: string; requestQueueName?: string; + experimentalQueue?: boolean; } export const enum ProxyRotation { diff --git a/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts b/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts index 8061759e48..8d0e78f0a7 100644 --- a/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts +++ b/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts @@ -27,6 +27,7 @@ import { IncomingMessage } from 'node:http'; import { dirname } from 'node:path'; import { fileURLToPath, URL } from 'node:url'; import { Input, ProxyRotation } from './consts.js'; +import { createRequestQueue } from './request_queue_experimental.js'; const { SESSION_MAX_USAGE_COUNTS, META_KEY } = scraperToolsConstants; const SCHEMA = JSON.parse(await readFile(new URL('../../INPUT_SCHEMA.json', import.meta.url), 'utf8')); @@ -135,6 +136,7 @@ export class CrawlerSetup implements CrawlerSetupOptions { this.requestList = await RequestList.open('CHEERIO_SCRAPER', startUrls); // RequestQueue + this.requestQueue = await RequestQueue.open(this.requestQueueName); // Dataset @@ -204,6 +206,12 @@ export class CrawlerSetup implements CrawlerSetupOptions { this.crawler = new CheerioCrawler(options); + if (this.input.experimentalQueue) { + // We have to override the queue after crawler creation. + // We could possibly extend the original request queue too. + this.crawler.requestQueue = await createRequestQueue() as any as RequestQueue; + } + return this.crawler; } diff --git a/packages/actor-scraper/cheerio-scraper/src/internals/queue.ts b/packages/actor-scraper/cheerio-scraper/src/internals/queue.ts new file mode 100644 index 0000000000..a8b31dc521 --- /dev/null +++ b/packages/actor-scraper/cheerio-scraper/src/internals/queue.ts @@ -0,0 +1,27 @@ +export default class Queue { + private items: T[] = []; + + enqueue(item: T): void { + this.items.push(item); + } + + dequeue(): T | undefined { + return this.items.shift(); + } + + isEmpty(): boolean { + return this.items.length === 0; + } + + size(): number { + return this.items.length; + } + + toArray(): T[] { + return this.items; + } + + fromArray(items: T[]) { + this.items = items; + } +} diff --git a/packages/actor-scraper/cheerio-scraper/src/internals/request_queue_experimental.ts b/packages/actor-scraper/cheerio-scraper/src/internals/request_queue_experimental.ts new file mode 100644 index 0000000000..0053febfc8 --- /dev/null +++ b/packages/actor-scraper/cheerio-scraper/src/internals/request_queue_experimental.ts @@ -0,0 +1,186 @@ +import { Actor, log } from 'apify'; +import { Request, Dictionary, RequestOptions } from '@crawlee/cheerio'; + +import got from 'got'; +import Queue from './queue.js'; + +// In memory request queue with deduplication and persistance +// Limit is around 2.5M requests to be save +// It supports only the critical functions for queue to work +export class ExperimentalRequestQueue { + set = new Set(); + requests = new Queue(); + // Fastest tested settings chunk size + // However it can vary with size of userData + maxBatchRequests = 150_000; + log = log.child({ prefix: 'ExperimentalRequestQueue' }); + // Fastest tested chunk for persisting set + maxBatchSet = 500_000; + setKey = 'INTERNAL-set-state'; + queueKey = 'INTERNAL-queue-state'; + defaultKeyValueStoreId = Actor.getEnv().defaultKeyValueStoreId; + handled = 0; + constructor() { + Actor.on('migrating', async () => { + await this.persistQueueState(); + this.log.info('State persisted!'); + }); + + Actor.on('aborting', async () => { + await this.persistQueueState(); + this.log.info('State persisted!'); + }); + } + + async initialize() { + const firstRequests = await Actor.getValue(`${this.queueKey}-0`); + + if (firstRequests) { + return this.loadState(); + } + } + + addRequest(requestLike: any) { + const uid = requestLike.uniqueKey ? requestLike.uniqueKey : requestLike.url; + // Faster then negative condition + if (this.set.has(uid)) { + return; + } + this.set.add(uid); + this.requests.enqueue(this._stripRequest(requestLike)); + } + + addRequests(requests: []) { + for (const request of requests) { + this.addRequest(request); + } + } + + markRequestHandled() { + this.handled += 1; + } + + handledCount() { + return this.handled; + } + + isEmpty() { + return this.requests.size() === 0; + } + + isFinished() { + return this.isEmpty(); + } + + fetchNextRequest(): any { + const baseRequest = this.requests.dequeue(); + return new Request(baseRequest as any as Request); + } + + reclaimRequest(request: Request) { + this.requests.enqueue(this._stripRequest(request)); + } + + async persistQueueState() { + const promises = [ + this.persistRequests(), + this.persistSet(), + ]; + + await Promise.all(promises); + } + + async persistSet() { + await this._saveInChunks(Array.from(this.set), this.setKey, this.maxBatchSet); + } + + async persistRequests() { + await this._saveInChunks(this.requests.toArray(), this.queueKey, this.maxBatchRequests); + } + + async loadState() { + await this._loadRequests(); + await this._loadSet(); + this.log.info('State Loaded!', { requestCount: this.requests.size() }); + } + + _stripRequest(requestLike: Request | RequestOptions) { + return { + url: requestLike.url, + // @ts-expect-error It is + retryCount: requestLike.retryCount || 0, + uniqueKey: requestLike.uniqueKey, + userData: requestLike.userData, + label: requestLike.label, + }; + } + + async _loadRequests() { + const items = await this._loadItemsFromState(this.queueKey); + + this.requests.fromArray(items); + } + + async _loadSet() { + const setData = await this._loadItemsFromState(this.setKey); + + this.set = new Set(setData); + } + + async _loadItemsFromState(keyRoot: string): Promise { + const storeClient = await Actor.apifyClient.keyValueStore(this.defaultKeyValueStoreId!); + + const allKeys = await storeClient.listKeys(); + const orderedKeys = allKeys.items + .filter(({ key }) => key.includes(keyRoot)) + .sort(({ key: key1 }, { key: key2 }) => this._sortByChunkOrder(key1, key2)); + + let finalItems: any[] = []; + for (const { key } of orderedKeys) { + const items = await Actor.getValue(key); + finalItems = finalItems.concat(items); + } + + await Promise.all(orderedKeys.map(({ key }) => { + return storeClient.deleteRecord(key); + })); + + return finalItems; + } + + async _saveInChunks(arr: T[], key: string, chunkSize: number): Promise { + const promises = []; + + for (let i = 0; i < arr.length; i += chunkSize) { + const chunk = arr.slice(i, i + chunkSize); + // Faster with got + promises.push(got({ + url: `https://api.apify.com/v2/key-value-stores/${this.defaultKeyValueStoreId}/records/${key}-${promises.length}?token=${Actor.getEnv().token}`, + method: 'PUT', + json: chunk, + http2: false, + }).catch((e: any) => this.log.exception(e.message, e.status))); + } + + return Promise.all(promises); + } + + _sortByChunkOrder(a: string, b: string) { + const regEx = /[^0-9]/g; + // Extract the numbers from the string + const aNumber = parseInt(a.replace(regEx, ''), 10); + const bNumber = parseInt(b.replace(regEx, ''), 10); + if (aNumber < bNumber) { + return -1; + } if (aNumber > bNumber) { + return 1; + } + return 0; + } +} + +export async function createRequestQueue() { + const queue = new ExperimentalRequestQueue(); + await queue.initialize(); + return queue; +}