From 121c794f5aaf473116dab29bcd0d961b855fb7f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Fri, 28 Oct 2022 11:51:09 +0200 Subject: [PATCH] feat: forEachParallel, stub tests --- package-lock.json | 85 ++----- packages/apify-extra/src/dataset.ts | 334 +++++----------------------- test/apify/extra-dataset.test.ts | 62 ++++++ 3 files changed, 136 insertions(+), 345 deletions(-) create mode 100644 test/apify/extra-dataset.test.ts diff --git a/package-lock.json b/package-lock.json index cf6471443e..462c1480c2 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1151,7 +1151,6 @@ "version": "3.1.0", "resolved": "https://registry.npmjs.org/@crawlee/cli/-/cli-3.1.0.tgz", "integrity": "sha512-gaZCtquvI6eVkrwTEvbId3SW8BHzzXcjA8HWsurxNnYGr+7TuLMyFJ755+E0OIDdcbQgd8iGDmEi9gONKzLaUg==", - "dev": true, "dependencies": { "@crawlee/templates": "^3.1.0", "ansi-colors": "^4.1.3", @@ -1223,7 +1222,6 @@ "version": "3.1.0", "resolved": "https://registry.npmjs.org/@crawlee/jsdom/-/jsdom-3.1.0.tgz", "integrity": "sha512-+cXeEy/CTL9FEAbXtlm37hB1Z4kx/AABPD7zheMHB2Et6ZndXdSEGgNpwSpIIw6mSixHrM+J9H2Ao075IIoAiQ==", - "dev": true, "dependencies": { "@apify/utilities": "^2.1.4", "@crawlee/http": "^3.1.0", @@ -1314,7 +1312,6 @@ "version": "3.1.0", "resolved": "https://registry.npmjs.org/@crawlee/templates/-/templates-3.1.0.tgz", "integrity": "sha512-gyMidulFF/DAlaS/NuRYckf7noAzOX+J3udDhTM5jwp3gUAUgO5F0y7ix8xG7P0osyPIn6A/GN6sCqYhjhSyhQ==", - "dev": true, "dependencies": { "ansi-colors": "^4.1.3", "inquirer": "^9.0.0", @@ -1329,7 +1326,6 @@ "version": "6.0.0", "resolved": "https://registry.npmjs.org/ansi-escapes/-/ansi-escapes-6.0.0.tgz", "integrity": "sha512-IG23inYII3dWlU2EyiAiGj6Bwal5GzsgPMwjYGvc1HPE2dgbj4ZB5ToWBKSquKw74nB3TIuOwaI6/jSULzfgrw==", - "dev": true, "dependencies": { "type-fest": "^3.0.0" }, @@ -1355,7 +1351,6 @@ "version": "6.2.1", "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", - "dev": true, "engines": { "node": ">=12" }, @@ -1367,7 +1362,6 @@ "version": "5.1.0", "resolved": "https://registry.npmjs.org/bl/-/bl-5.1.0.tgz", "integrity": "sha512-tv1ZJHLfTDnXE6tMHv73YgSJaWR2AFuPwMntBe7XL/GBFHnT0CLnsHMogfk5+GzCDC5ZWarSCYaIGATZt9dNsQ==", - "dev": true, "dependencies": { "buffer": "^6.0.3", "inherits": "^2.0.4", @@ -1401,7 +1395,6 @@ "version": "5.1.2", "resolved": "https://registry.npmjs.org/chalk/-/chalk-5.1.2.tgz", "integrity": "sha512-E5CkT4jWURs1Vy5qGJye+XwCkNj7Od3Af7CP6SujMetSMkLs8Do2RWJK5yx1wamHV/op8Rz+9rltjaTQWDnEFQ==", - "dev": true, "engines": { "node": "^12.17.0 || ^14.13 || >=16.0.0" }, @@ -1461,7 +1454,6 @@ "version": "9.1.4", "resolved": "https://registry.npmjs.org/inquirer/-/inquirer-9.1.4.tgz", "integrity": "sha512-9hiJxE5gkK/cM2d1mTEnuurGTAoHebbkX0BYl3h7iEg7FYfuNIom+nDfBCSWtvSnoSrWCeBxqqBZu26xdlJlXA==", - "dev": true, "dependencies": { "ansi-escapes": "^6.0.0", "chalk": "^5.1.2", @@ -1561,7 +1553,6 @@ "version": "5.1.2", "resolved": "https://registry.npmjs.org/string-width/-/string-width-5.1.2.tgz", "integrity": "sha512-HnLOCR3vjcY8beoNLtcjZ5/nxn2afmME6lhrDrebokqMap+XbeW8n9TXpPDOqdGK5qcI3oT0GKTW6wC7EMiVqA==", - "dev": true, "dependencies": { "eastasianwidth": "^0.2.0", "emoji-regex": "^9.2.2", @@ -4661,7 +4652,6 @@ "version": "7.0.1", "resolved": "https://registry.npmjs.org/acorn-globals/-/acorn-globals-7.0.1.tgz", "integrity": "sha512-umOSDSDrfHbTNPuNpC2NSnnA3LUrqpevPb4T9jRx4MagXNS0rs+gwiTcAvqCRmsD6utzsrzNt+ebm00SNWiC3Q==", - "dev": true, "dependencies": { "acorn": "^8.1.0", "acorn-walk": "^8.0.2" @@ -4680,7 +4670,6 @@ "version": "8.2.0", "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-8.2.0.tgz", "integrity": "sha512-k+iyHEuPgSw6SbuDpGQM+06HQUa04DZ3o+F6CSzXMvvI5KMvnaEqXe+YVe555R9nn6GPt404fos4wcgpw12SDA==", - "dev": true, "engines": { "node": ">=0.4.0" } @@ -4867,6 +4856,10 @@ "follow-redirects": "^1.14.0" } }, + "node_modules/apify-extra": { + "resolved": "packages/apify-extra", + "link": true + }, "node_modules/aproba": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/aproba/-/aproba-2.0.0.tgz", @@ -5782,6 +5775,7 @@ "version": "7.0.4", "resolved": "https://registry.npmjs.org/cliui/-/cliui-7.0.4.tgz", "integrity": "sha512-OcRE68cOsVMXp1Yvonl/fzkQOyjLSu/8bhPDfQt0e0/Eb283TKP20Fs2MqoPsr9SwA595rRCA+QMzYc9nBP+JQ==", + "dev": true, "dependencies": { "string-width": "^4.2.0", "strip-ansi": "^6.0.0", @@ -6234,7 +6228,6 @@ "version": "3.1.0", "resolved": "https://registry.npmjs.org/crawlee/-/crawlee-3.1.0.tgz", "integrity": "sha512-Z1yvIYDj5lQsYA0WSGrbuEQNcK5qI0CI2e1uT6hpJ0KkXsV5eNyZEu6PJ0PHJ4RbvHvRWrBn3+p49cvatWU4/g==", - "dev": true, "dependencies": { "@crawlee/basic": "^3.1.0", "@crawlee/browser": "^3.1.0", @@ -6340,8 +6333,7 @@ "node_modules/cssstyle/node_modules/cssom": { "version": "0.3.8", "resolved": "https://registry.npmjs.org/cssom/-/cssom-0.3.8.tgz", - "integrity": "sha512-b0tGHbfegbhPJpxpiBPU2sCkigAqtM9O121le6bbOlgyV+NyGyCmVfJ6QW9eRjz8CpNfWEOYBIMIGRYkLwsIYg==", - "dev": true + "integrity": "sha512-b0tGHbfegbhPJpxpiBPU2sCkigAqtM9O121le6bbOlgyV+NyGyCmVfJ6QW9eRjz8CpNfWEOYBIMIGRYkLwsIYg==" }, "node_modules/csv-stringify": { "version": "6.2.0", @@ -6444,8 +6436,7 @@ "node_modules/decimal.js": { "version": "10.4.2", "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.4.2.tgz", - "integrity": "sha512-ic1yEvwT6GuvaYwBLLY6/aFFgjZdySKTE8en/fkU3QICTmRtgtSlFn0u0BXN06InZwtfCelR7j8LRiDI/02iGA==", - "dev": true + "integrity": "sha512-ic1yEvwT6GuvaYwBLLY6/aFFgjZdySKTE8en/fkU3QICTmRtgtSlFn0u0BXN06InZwtfCelR7j8LRiDI/02iGA==" }, "node_modules/decompress-response": { "version": "6.0.0", @@ -6485,7 +6476,6 @@ "version": "1.0.4", "resolved": "https://registry.npmjs.org/defaults/-/defaults-1.0.4.tgz", "integrity": "sha512-eFuaLoy/Rxalv2kr+lqMlUnrDWV+3j4pljOIJgLIhI058IQfWJ7vXhyEIHu+HtC738klGALYxOKDO0bQP3tg8A==", - "dev": true, "dependencies": { "clone": "^1.0.2" }, @@ -6746,8 +6736,7 @@ "node_modules/eastasianwidth": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz", - "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==", - "dev": true + "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==" }, "node_modules/ejs": { "version": "3.1.8", @@ -10181,7 +10170,6 @@ "version": "20.0.1", "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-20.0.1.tgz", "integrity": "sha512-pksjj7Rqoa+wdpkKcLzQRHhJCEE42qQhl/xLMUKHgoSejaKOdaXEAnqs6uDNwMl/fciHTzKeR8Wm8cw7N+g98A==", - "dev": true, "dependencies": { "abab": "^2.0.6", "acorn": "^8.8.0", @@ -12541,7 +12529,6 @@ "version": "2.2.0", "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz", "integrity": "sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ==", - "dev": true, "engines": { "node": ">=6" } @@ -12720,7 +12707,6 @@ "version": "4.0.0", "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz", "integrity": "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==", - "dev": true, "engines": { "node": ">=8" } @@ -12824,7 +12810,6 @@ "version": "4.2.0", "resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-4.2.0.tgz", "integrity": "sha512-HRDzbaKjC+AOWVXxAU/x54COGeIv9eb+6CkDSQoNTt4XyWoIJvuPsXizxu/Fr23EiekbtZwmh1IcIG/l/a10GQ==", - "dev": true, "dependencies": { "find-up": "^4.0.0" }, @@ -12836,7 +12821,6 @@ "version": "4.1.0", "resolved": "https://registry.npmjs.org/find-up/-/find-up-4.1.0.tgz", "integrity": "sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw==", - "dev": true, "dependencies": { "locate-path": "^5.0.0", "path-exists": "^4.0.0" @@ -12849,7 +12833,6 @@ "version": "5.0.0", "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-5.0.0.tgz", "integrity": "sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g==", - "dev": true, "dependencies": { "p-locate": "^4.1.0" }, @@ -12861,7 +12844,6 @@ "version": "2.3.0", "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", - "dev": true, "dependencies": { "p-try": "^2.0.0" }, @@ -12876,7 +12858,6 @@ "version": "4.1.0", "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-4.1.0.tgz", "integrity": "sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A==", - "dev": true, "dependencies": { "p-limit": "^2.2.0" }, @@ -13771,7 +13752,6 @@ "version": "7.5.7", "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.5.7.tgz", "integrity": "sha512-z9MzKh/UcOqB3i20H6rtrlaE/CgjLOvheWK/9ILrbhROGTweAi1BaFsTT9FbwZi5Trr1qNRs+MXkhmR06awzQA==", - "dev": true, "dependencies": { "tslib": "^2.1.0" } @@ -15596,7 +15576,6 @@ "version": "17.6.0", "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.6.0.tgz", "integrity": "sha512-8H/wTDqlSwoSnScvV2N/JHfLWOKuh5MVla9hqLjK3nsfyy6Y4kDSYSvkU5YCUEPOSnRXfIyx3Sq+B/IWudTo4g==", - "dev": true, "dependencies": { "cliui": "^8.0.1", "escalade": "^3.1.1", @@ -15623,7 +15602,6 @@ "version": "8.0.1", "resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz", "integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==", - "dev": true, "dependencies": { "string-width": "^4.2.0", "strip-ansi": "^6.0.1", @@ -16678,7 +16656,6 @@ "version": "3.1.0", "resolved": "https://registry.npmjs.org/@crawlee/cli/-/cli-3.1.0.tgz", "integrity": "sha512-gaZCtquvI6eVkrwTEvbId3SW8BHzzXcjA8HWsurxNnYGr+7TuLMyFJ755+E0OIDdcbQgd8iGDmEi9gONKzLaUg==", - "dev": true, "requires": { "@crawlee/templates": "^3.1.0", "ansi-colors": "^4.1.3", @@ -16738,7 +16715,6 @@ "version": "3.1.0", "resolved": "https://registry.npmjs.org/@crawlee/jsdom/-/jsdom-3.1.0.tgz", "integrity": "sha512-+cXeEy/CTL9FEAbXtlm37hB1Z4kx/AABPD7zheMHB2Et6ZndXdSEGgNpwSpIIw6mSixHrM+J9H2Ao075IIoAiQ==", - "dev": true, "requires": { "@apify/utilities": "^2.1.4", "@crawlee/http": "^3.1.0", @@ -16801,7 +16777,6 @@ "version": "3.1.0", "resolved": "https://registry.npmjs.org/@crawlee/templates/-/templates-3.1.0.tgz", "integrity": "sha512-gyMidulFF/DAlaS/NuRYckf7noAzOX+J3udDhTM5jwp3gUAUgO5F0y7ix8xG7P0osyPIn6A/GN6sCqYhjhSyhQ==", - "dev": true, "requires": { "ansi-colors": "^4.1.3", "inquirer": "^9.0.0", @@ -16813,7 +16788,6 @@ "version": "6.0.0", "resolved": "https://registry.npmjs.org/ansi-escapes/-/ansi-escapes-6.0.0.tgz", "integrity": "sha512-IG23inYII3dWlU2EyiAiGj6Bwal5GzsgPMwjYGvc1HPE2dgbj4ZB5ToWBKSquKw74nB3TIuOwaI6/jSULzfgrw==", - "dev": true, "requires": { "type-fest": "^3.0.0" } @@ -16826,14 +16800,12 @@ "ansi-styles": { "version": "6.2.1", "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", - "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", - "dev": true + "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==" }, "bl": { "version": "5.1.0", "resolved": "https://registry.npmjs.org/bl/-/bl-5.1.0.tgz", "integrity": "sha512-tv1ZJHLfTDnXE6tMHv73YgSJaWR2AFuPwMntBe7XL/GBFHnT0CLnsHMogfk5+GzCDC5ZWarSCYaIGATZt9dNsQ==", - "dev": true, "requires": { "buffer": "^6.0.3", "inherits": "^2.0.4", @@ -16852,8 +16824,7 @@ "chalk": { "version": "5.1.2", "resolved": "https://registry.npmjs.org/chalk/-/chalk-5.1.2.tgz", - "integrity": "sha512-E5CkT4jWURs1Vy5qGJye+XwCkNj7Od3Af7CP6SujMetSMkLs8Do2RWJK5yx1wamHV/op8Rz+9rltjaTQWDnEFQ==", - "dev": true + "integrity": "sha512-E5CkT4jWURs1Vy5qGJye+XwCkNj7Od3Af7CP6SujMetSMkLs8Do2RWJK5yx1wamHV/op8Rz+9rltjaTQWDnEFQ==" }, "cli-cursor": { "version": "4.0.0", @@ -16886,7 +16857,6 @@ "version": "9.1.4", "resolved": "https://registry.npmjs.org/inquirer/-/inquirer-9.1.4.tgz", "integrity": "sha512-9hiJxE5gkK/cM2d1mTEnuurGTAoHebbkX0BYl3h7iEg7FYfuNIom+nDfBCSWtvSnoSrWCeBxqqBZu26xdlJlXA==", - "dev": true, "requires": { "ansi-escapes": "^6.0.0", "chalk": "^5.1.2", @@ -19476,7 +19446,6 @@ "version": "7.0.1", "resolved": "https://registry.npmjs.org/acorn-globals/-/acorn-globals-7.0.1.tgz", "integrity": "sha512-umOSDSDrfHbTNPuNpC2NSnnA3LUrqpevPb4T9jRx4MagXNS0rs+gwiTcAvqCRmsD6utzsrzNt+ebm00SNWiC3Q==", - "dev": true, "requires": { "acorn": "^8.1.0", "acorn-walk": "^8.0.2" @@ -19492,8 +19461,7 @@ "acorn-walk": { "version": "8.2.0", "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-8.2.0.tgz", - "integrity": "sha512-k+iyHEuPgSw6SbuDpGQM+06HQUa04DZ3o+F6CSzXMvvI5KMvnaEqXe+YVe555R9nn6GPt404fos4wcgpw12SDA==", - "dev": true + "integrity": "sha512-k+iyHEuPgSw6SbuDpGQM+06HQUa04DZ3o+F6CSzXMvvI5KMvnaEqXe+YVe555R9nn6GPt404fos4wcgpw12SDA==" }, "actor-cheerio-scraper": { "version": "file:packages/actor-scraper/cheerio-scraper", @@ -19691,6 +19659,13 @@ } } }, + "apify-extra": { + "version": "file:packages/apify-extra", + "requires": { + "@types/bluebird": "^3.5.37", + "bluebird": "^3.7.2" + } + }, "aproba": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/aproba/-/aproba-2.0.0.tgz", @@ -20366,6 +20341,7 @@ "version": "7.0.4", "resolved": "https://registry.npmjs.org/cliui/-/cliui-7.0.4.tgz", "integrity": "sha512-OcRE68cOsVMXp1Yvonl/fzkQOyjLSu/8bhPDfQt0e0/Eb283TKP20Fs2MqoPsr9SwA595rRCA+QMzYc9nBP+JQ==", + "dev": true, "requires": { "string-width": "^4.2.0", "strip-ansi": "^6.0.0", @@ -20719,7 +20695,6 @@ "version": "3.1.0", "resolved": "https://registry.npmjs.org/crawlee/-/crawlee-3.1.0.tgz", "integrity": "sha512-Z1yvIYDj5lQsYA0WSGrbuEQNcK5qI0CI2e1uT6hpJ0KkXsV5eNyZEu6PJ0PHJ4RbvHvRWrBn3+p49cvatWU4/g==", - "dev": true, "requires": { "@crawlee/basic": "^3.1.0", "@crawlee/browser": "^3.1.0", @@ -20870,8 +20845,7 @@ "decimal.js": { "version": "10.4.2", "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.4.2.tgz", - "integrity": "sha512-ic1yEvwT6GuvaYwBLLY6/aFFgjZdySKTE8en/fkU3QICTmRtgtSlFn0u0BXN06InZwtfCelR7j8LRiDI/02iGA==", - "dev": true + "integrity": "sha512-ic1yEvwT6GuvaYwBLLY6/aFFgjZdySKTE8en/fkU3QICTmRtgtSlFn0u0BXN06InZwtfCelR7j8LRiDI/02iGA==" }, "decompress-response": { "version": "6.0.0", @@ -20902,7 +20876,6 @@ "version": "1.0.4", "resolved": "https://registry.npmjs.org/defaults/-/defaults-1.0.4.tgz", "integrity": "sha512-eFuaLoy/Rxalv2kr+lqMlUnrDWV+3j4pljOIJgLIhI058IQfWJ7vXhyEIHu+HtC738klGALYxOKDO0bQP3tg8A==", - "dev": true, "requires": { "clone": "^1.0.2" } @@ -21091,8 +21064,7 @@ "eastasianwidth": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz", - "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==", - "dev": true + "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==" }, "ejs": { "version": "3.1.8", @@ -23679,7 +23651,6 @@ "version": "20.0.1", "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-20.0.1.tgz", "integrity": "sha512-pksjj7Rqoa+wdpkKcLzQRHhJCEE42qQhl/xLMUKHgoSejaKOdaXEAnqs6uDNwMl/fciHTzKeR8Wm8cw7N+g98A==", - "dev": true, "requires": { "abab": "^2.0.6", "acorn": "^8.8.0", @@ -25436,8 +25407,7 @@ "p-try": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz", - "integrity": "sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ==", - "dev": true + "integrity": "sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ==" }, "p-waterfall": { "version": "2.1.1", @@ -25575,8 +25545,7 @@ "path-exists": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz", - "integrity": "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==", - "dev": true + "integrity": "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==" }, "path-is-absolute": { "version": "1.0.1", @@ -25647,7 +25616,6 @@ "version": "4.2.0", "resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-4.2.0.tgz", "integrity": "sha512-HRDzbaKjC+AOWVXxAU/x54COGeIv9eb+6CkDSQoNTt4XyWoIJvuPsXizxu/Fr23EiekbtZwmh1IcIG/l/a10GQ==", - "dev": true, "requires": { "find-up": "^4.0.0" }, @@ -25656,7 +25624,6 @@ "version": "4.1.0", "resolved": "https://registry.npmjs.org/find-up/-/find-up-4.1.0.tgz", "integrity": "sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw==", - "dev": true, "requires": { "locate-path": "^5.0.0", "path-exists": "^4.0.0" @@ -25666,7 +25633,6 @@ "version": "5.0.0", "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-5.0.0.tgz", "integrity": "sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g==", - "dev": true, "requires": { "p-locate": "^4.1.0" } @@ -25675,7 +25641,6 @@ "version": "2.3.0", "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", - "dev": true, "requires": { "p-try": "^2.0.0" } @@ -25684,7 +25649,6 @@ "version": "4.1.0", "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-4.1.0.tgz", "integrity": "sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A==", - "dev": true, "requires": { "p-limit": "^2.2.0" } @@ -26348,7 +26312,6 @@ "version": "7.5.7", "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.5.7.tgz", "integrity": "sha512-z9MzKh/UcOqB3i20H6rtrlaE/CgjLOvheWK/9ILrbhROGTweAi1BaFsTT9FbwZi5Trr1qNRs+MXkhmR06awzQA==", - "dev": true, "requires": { "tslib": "^2.1.0" } @@ -27706,7 +27669,6 @@ "version": "17.6.0", "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.6.0.tgz", "integrity": "sha512-8H/wTDqlSwoSnScvV2N/JHfLWOKuh5MVla9hqLjK3nsfyy6Y4kDSYSvkU5YCUEPOSnRXfIyx3Sq+B/IWudTo4g==", - "dev": true, "requires": { "cliui": "^8.0.1", "escalade": "^3.1.1", @@ -27721,7 +27683,6 @@ "version": "8.0.1", "resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz", "integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==", - "dev": true, "requires": { "string-width": "^4.2.0", "strip-ansi": "^6.0.1", diff --git a/packages/apify-extra/src/dataset.ts b/packages/apify-extra/src/dataset.ts index 0e456918c8..1544af0cd5 100644 --- a/packages/apify-extra/src/dataset.ts +++ b/packages/apify-extra/src/dataset.ts @@ -1,6 +1,6 @@ +/* eslint-disable max-classes-per-file */ import { Actor, Dataset as OriginalDataset } from 'apify'; import { log } from 'crawlee'; -import type { DatasetInfo } from '@crawlee/types'; import { APIFY_EXTRA_KV_RECORD_PREFIX, APIFY_EXTRA_LOG_PREFIX } from './const'; @@ -27,7 +27,7 @@ export interface ParallelPersistedPushDataOptions { idempotencyKey?: string; } -async function waitForCompletion(promises: (() => Promise)[], maxConcurrency: number): Promise { +export async function waitForCompletion(promises: (() => Promise)[], maxConcurrency: number): Promise { async function worker() { let job; /* eslint-disable-next-line no-cond-assign */ @@ -37,6 +37,28 @@ async function waitForCompletion(promises: (() => Promise)[], maxConcurren await Promise.all([...new Array(maxConcurrency)].map(() => worker())); } +export class ChunkTracker { + private readonly chunks: Record = {}; + + public add(chunkId: string): void { + this.chunks[chunkId] = true; + } + + public has(chunkId: string): boolean { + return this.chunks[chunkId] === true; + } + + public get(): string[] { + return Object.keys(this.chunks); + } + + constructor(data?: Record | null) { + if (data) { + Object.assign(this.chunks, data); + } + } +} + export class Dataset extends OriginalDataset { /** * Returns batches of items to be pushed to dataset. @@ -104,293 +126,39 @@ export class Dataset extends OriginalDataset { } }; - public async forEachParallel(func: (item: DatasetItem) => void, options: any = {}) { + public async forEachParallel( + func: Parameters[0], + options: Parameters[1] & { parallelLoads?: number; batchSize?: number; persistState: boolean }, + ) { const { parallelLoads = 20, batchSize = 50000 } = options; + const { offset: globalOffset = 0, limit: globalLimit = 0 } = options; - const { itemCount } = await this.getInfo() ?? { itemCount: 0 }; - - return waitForCompletion( - [...new Array(Math.ceil(itemCount / batchSize))] - .map((_, i) => () => this.forEach(func, { limit: batchSize, offset: batchSize * i })), - parallelLoads); - } - - // /** - // * Loads items from one or many datasets in parallel by chunking the items from each dataset into batches, - // * retaining order of both items and datasets. Useful for large loads. - // * By default returns one array of items in order of datasets provided. - // * By changing concatItems or concatDatasets options, you can get array of arrays (of arrays) back - // * Requires bluebird dependency and copy calculateLocalOffsetLimit function!!! - // * - // * @param {string[]} datasetIds IDs or names of datasets you want to load - // * @param {object} options Options with default values. - // * If both concatItems and concatDatasets are false, output of this function is an array of datasets containing arrays - // * of batches containig array of items. - // * concatItems concats all batches of one dataset into one array of items. - // * concatDatasets concat all datasets into one array of batches - // * Using both concatItems and concatDatasets gives you back a sinlge array of all items in order. - // * Both are true by default. - // * Data are not returned by fed to the supplied async function on the fly (reduces memory usage) - // * Will not load batches that were already processed before migration, does nothing if processFn is not used. - // * It does not persist the state inside processFn, that is a responsibillity of the caller (if needed) - // * You must not manipulate input parameters (and underlying datasets) between migrations or this will break - // */ - // public static async getDataParallel( - // datasetIds: string[], - // options: LoadDatasetItemsInParallelOptions = {}, - // ): Promise { - // const { - // processFn, - // parallelLoads = 20, - // batchSize = 50000, - // offset = 0, - // limit = 999999999, - // concatItems = true, - // concatDatasets = true, - // debugLog = false, - // persistLoadingStateForProcesFn = false, - // fields, - // // Figure out better name since this is useful for datasets by name on platform - // loadFromLocalDataset = false, - // } = options; - - // const LOG_PREFIX = `${APIFY_EXTRA_LOG_PREFIX}[loadDatasetItemsInParallel]:`; - - // if (!Actor.isAtHome() && loadFromLocalDataset && fields) { - // log.warning(`${LOG_PREFIX} fields option does not work on local datasets`); - // } - - // const client = Actor.apifyClient; - // const loadStart = Date.now(); - - // // If we use processFnLoadingState, we skip requests that are done - // const createRequestArray = async (processFnLoadingState: any) => { - // // We increment for each dataset so we remember their order - // let datasetIndex = 0; - - // // This array will be used to create promises to run in parallel - // const requestInfoArr = []; - - // for (const datasetId of datasetIds) { - // if (processFnLoadingState && !processFnLoadingState[datasetId]) { - // processFnLoadingState[datasetId] = {}; - // } - // // We get the number of items first and then we precreate request info objects - // let datasetInfo: DatasetInfo | undefined; - // if (loadFromLocalDataset) { - // const dataset = await Actor.openDataset(datasetId); - // datasetInfo = await dataset.getInfo(); - // } else { - // datasetInfo = await client.dataset(datasetId).get(); - // } - // if (!datasetInfo) { - // throw new Error(`${LOG_PREFIX} Dataset ${datasetId} was not found`); - // } - // const { itemCount } = datasetInfo; - // if (debugLog) { - // log.info(`Dataset ${datasetId} has ${itemCount} items`); - // } - // const numberOfBatches = Math.ceil(itemCount / batchSize); - - // for (let i = 0; i < numberOfBatches; i++) { - // const localOffsetLimit = calculateLocalOffsetLimit({ offset, limit, localStart: i * batchSize, batchSize }); - // if (!localOffsetLimit) { - // continue; - // } - - // if (processFnLoadingState) { - // if (!processFnLoadingState[datasetId][localOffsetLimit.offset]) { - // processFnLoadingState[datasetId][localOffsetLimit.offset] = { done: false }; - // } else if (processFnLoadingState[datasetId][localOffsetLimit.offset].done) { - // if (debugLog) { - // log.info(`Batch for dataset ${datasetId}, offset: ${localOffsetLimit.offset} was already processed, skipping...`); - // } - // continue; - // } - // } - - // requestInfoArr.push({ - // index: i, - // offset: localOffsetLimit.offset, - // limit: localOffsetLimit.limit, - // datasetId, - // datasetIndex, - // }); - // } - - // datasetIndex++; - // } - // return requestInfoArr; - // }; - - // // This is array of arrays. Top level array is for each dataset and inside one entry for each batch (in order) - // const loadedBatchedArr: DatasetItem[][][] = []; - - // let totalLoaded = 0; - // const totalLoadedPerDataset: Record = {}; - - // const processFnLoadingState: any = persistLoadingStateForProcesFn - // ? (await Actor.getValue(PROCESS_FN_LOADING_STATE_KV_RECORD_KEY) || {}) - // : null; - - // Actor.on('persistState', async () => { - // await Actor.setValue(PROCESS_FN_LOADING_STATE_KV_RECORD_KEY, processFnLoadingState); - // }); + const chunkTrackerName = `${APIFY_EXTRA_KV_RECORD_PREFIX}CHUNKS${this.name}`; + const chunkTracker = new ChunkTracker(await Actor.getValue>(chunkTrackerName)); - // const requestInfoArr = await createRequestArray(processFnLoadingState); - // if (debugLog) { - // log.info(`Number of requests to do: ${requestInfoArr.length}`); - // } - - // // Now we execute all the requests in parallel (with defined concurrency) - // await concurrentWait(requestInfoArr.map((requestInfoObj) => async () => { - // const { index, datasetId, datasetIndex } = requestInfoObj; - - // const getDataOptions = { - // offset: requestInfoObj.offset, - // limit: requestInfoObj.limit, - // fields, - // }; - // let datasetResult; - // if (loadFromLocalDataset) { - // // This open should be cached - // const dataset = await Actor.openDataset(datasetId); - - // if (!Actor.isAtHome()) { - // delete getDataOptions.fields; - // } - // datasetResult = await dataset.getData(getDataOptions); - // } else { - // datasetResult = await client.dataset(datasetId).listItems(getDataOptions); - // } - - // const { items } = datasetResult; - - // if (!totalLoadedPerDataset[datasetId]) { - // totalLoadedPerDataset[datasetId] = 0; - // } - - // totalLoadedPerDataset[datasetId] += items.length; - // totalLoaded += items.length; - - // if (debugLog) { - // log.info( - // `Items loaded from dataset ${datasetId}: ${items.length}, offset: ${requestInfoObj.offset}, - // total loaded from dataset ${datasetId}: ${totalLoadedPerDataset[datasetId]}, - // total loaded: ${totalLoaded}`, - // ); - // } - // // We either collect the data or we process them on the fly - // if (processFn) { - // await processFn(items, { datasetId, datasetOffset: requestInfoObj.offset }); - // if (processFnLoadingState) { - // processFnLoadingState[datasetId][requestInfoObj.offset].done = true; - // } - // } else { - // if (!loadedBatchedArr[datasetIndex]) { - // loadedBatchedArr[datasetIndex] = []; - // } - // // Now we correctly assign the items into the main array - // loadedBatchedArr[datasetIndex][index] = items; - // } - // }), parallelLoads); - - // if (debugLog) { - // log.info(`Loading took ${Math.round((Date.now() - loadStart) / 1000)} seconds`); - // } - - // if (processFnLoadingState) { - // await Actor.setValue(PROCESS_FN_LOADING_STATE_KV_RECORD_KEY, processFnLoadingState); - // } - - // if (processFn) { - // return undefined; - // } - - // let resultItems: DatasetItem[] | DatasetItem[][] | DatasetItem[][][] = loadedBatchedArr; - // if (concatItems) { - // for (let i = 0; i < loadedBatchedArr.length; i++) { - // resultItems[i] = loadedBatchedArr[i].flatMap((item) => item); - // } - // } - -// if (concatDatasets) { -// resultItems = loadedBatchedArr.flatMap((item) => item); -// } -// return loadedBatchedArr; -// }; -} - -interface CalculateLocalOffsetLimitParams { - offset: number; - limit: number; - localStart: number; - batchSize: number; -} - -type CalculateLocalOffsetLimitResult = - null | - { offset: number; limit: number }; + let isMigrating = false; + const migrationCallback = async () => { + isMigrating = true; + await Actor.setValue(chunkTrackerName, chunkTracker.get()); + }; -// Returns either null if offset/limit does not fit the current chunk -// or { offset, limit } object -const calculateLocalOffsetLimit = ({ offset, limit, localStart, batchSize }: CalculateLocalOffsetLimitParams): CalculateLocalOffsetLimitResult => { - const localEnd = localStart + batchSize; - const inputEnd = offset + limit; + Actor.on('migrating', migrationCallback); + Actor.on('aborting', migrationCallback); - // Offset starts after the current chunk - if (offset >= localEnd) { - return null; - } - // Offset + limit ends before our chunk - if (inputEnd <= localStart) { - return null; - } + Actor.on('persistState', async () => { + await Actor.setValue(chunkTrackerName, chunkTracker.get()); + }); - // Now we know that the some data are in the current batch - const calculateLimit = () => { - // limit overflows current batch - if (inputEnd >= localEnd) { - // Now either the offset is less than local start and we do whole batch - if (offset < localStart) { - return batchSize; - } - // Or it is inside the current batch and we slice it from the start (including whole batch) - return localEnd - offset; - // eslint-disable-next-line no-else-return - } else { // Consider (inputEnd < localEnd) Means limit ends inside current batch - if (offset < localStart) { - return inputEnd - localStart; - } - // This means both offset and limit are inside current batch - return inputEnd - offset; - } - }; + const { itemCount } = await this.getInfo() ?? { itemCount: 0 }; - return { - offset: Math.max(localStart, offset), - limit: calculateLimit(), + return waitForCompletion( + [...new Array(Math.ceil((itemCount < globalLimit ? itemCount : globalLimit) / batchSize))] + .filter((_, i) => !chunkTracker.has(`${globalOffset + i * batchSize}`)) + .map((_, i) => async () => { + if (isMigrating) await new Promise(() => {}); // blocks indefinitely - after a while, stops the entire execution + await this.forEach(func, { ...options, limit: batchSize, offset: globalOffset + batchSize * i }); + chunkTracker.add(`${globalOffset + batchSize * i}`); + }), + parallelLoads); }; -}; - -const PROCESS_FN_LOADING_STATE_KV_RECORD_KEY = `${APIFY_EXTRA_KV_RECORD_PREFIX}PROCESS-FN-LOADING-STATE`; - -export interface LoadDatasetItemsInParallelOptions { - parallelLoads?: number; - batchSize?: number; - offset?: number; - limit?: number; - concatItems?: boolean; - concatDatasets?: boolean; - debugLog?: boolean; - persistLoadingStateForProcesFn?: boolean; - fields?: string[]; - // If run outside of Apify platform, will fetch local datasets instead. On Apify platform, this is ignored. - loadFromLocalDataset?: boolean; - processFn?: (items: DatasetItem[], params: { datasetId: string; datasetOffset: number }) => Promise; } - -export type LoadDatasetItemsInParallelResult = - undefined | - DatasetItem[] | - DatasetItem[][]; diff --git a/test/apify/extra-dataset.test.ts b/test/apify/extra-dataset.test.ts new file mode 100644 index 0000000000..3f76496034 --- /dev/null +++ b/test/apify/extra-dataset.test.ts @@ -0,0 +1,62 @@ +import { ENV_VARS } from '@apify/consts'; +import { Actor, Configuration, PlatformEventManager, log } from 'apify'; +import { Server } from 'ws'; +import { Dataset } from 'apify-extra'; + +describe('apify-extra dataset', () => { + let wss: Server = null; + const config = Configuration.getGlobalConfig(); + const events = new PlatformEventManager(config); + config.useEventManager(events); + + beforeEach(() => { + wss = new Server({ port: 9099 }); + process.env[ENV_VARS.ACTOR_EVENTS_WS_URL] = 'ws://localhost:9099/someRunId'; + process.env[ENV_VARS.TOKEN] = 'dummy'; + }); + afterEach((done) => { + delete process.env[ENV_VARS.ACTOR_EVENTS_WS_URL]; + delete process.env[ENV_VARS.TOKEN]; + wss.close(done); + }); + + test('forEachParallel', async () => { + // let wsClosed = false; + // const isWsConnected = new Promise((resolve) => { + // wss.on('connection', (ws, req) => { + // ws.on('close', () => { + // wsClosed = true; + // }); + // resolve(ws); + // expect(req.url).toBe('/someRunId'); + // const send = (obj: Dictionary) => ws.send(JSON.stringify(obj)); + + // setTimeout(() => send({ name: 'migrating' }), 10); + // }); + // }); + + const forEachSpy = jest.spyOn(Dataset.prototype, 'forEach').mockImplementation(); + jest.spyOn(Dataset.prototype, 'getInfo').mockImplementation(async () => ({ + itemCount: 20, + } as any)); + jest.spyOn(Actor.prototype, 'getValue').mockImplementation(async () => ({})); + + // await isWsConnected; + + const dataset = new Dataset({ + client: Configuration.getStorageClient(), + id: 'dataset-forEachParallel-test', + }); + + await dataset.forEachParallel(() => { log.debug('noop'); }, { persistState: true }); + + // // Cleanup. + // await new Promise((resolve) => { + // wss.close(async () => { + // await sleep(10); // Here must be short sleep to get following line to later tick + // expect(wsClosed).toBe(true); + // resolve(); + // }); + // }); + }, 60e3); +});