From 8f0fbc5b9ddbc31cfc8bc17336589e250856bff6 Mon Sep 17 00:00:00 2001 From: Aditya Karnam Date: Wed, 22 Nov 2023 00:56:44 -0600 Subject: [PATCH 1/8] feat: create crawler api server --- package-lock.json | 690 ++++++++++++++++++++++++++++++++++++++++++++++ package.json | 7 + src/server.ts | 41 +++ 3 files changed, 738 insertions(+) create mode 100644 src/server.ts diff --git a/package-lock.json b/package-lock.json index 406ff6d6..75c4a212 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11,7 +11,10 @@ "license": "ISC", "dependencies": { "commander": "^11.1.0", + "cors": "^2.8.5", "crawlee": "^3.0.0", + "express": "^4.18.2", + "express-fileupload": "^1.4.3", "glob": "^10.3.10", "inquirer": "^9.2.12", "playwright": "*", @@ -22,6 +25,9 @@ }, "devDependencies": { "@apify/tsconfig": "^0.1.0", + "@types/cors": "^2.8.17", + "@types/express": "^4.17.21", + "@types/express-fileupload": "^1.4.4", "@types/inquirer": "^9.0.7", "@types/node": "^20.0.0", "ts-node": "^10.8.0", @@ -829,14 +835,91 @@ "dev": true, "license": "MIT" }, + "node_modules/@types/body-parser": { + "version": "1.19.5", + "resolved": "https://registry.npmjs.org/@types/body-parser/-/body-parser-1.19.5.tgz", + "integrity": "sha512-fB3Zu92ucau0iQ0JMCFQE7b/dv8Ot07NI3KaZIkIUNXq82k4eBAqUaneXfleGY9JWskeS9y+u0nXMyspcuQrCg==", + "dev": true, + "dependencies": { + "@types/connect": "*", + "@types/node": "*" + } + }, + "node_modules/@types/busboy": { + "version": "1.5.3", + "resolved": "https://registry.npmjs.org/@types/busboy/-/busboy-1.5.3.tgz", + "integrity": "sha512-YMBLFN/xBD8bnqywIlGyYqsNFXu6bsiY7h3Ae0kO17qEuTjsqeyYMRPSUDacIKIquws2Y6KjmxAyNx8xB3xQbw==", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, + "node_modules/@types/connect": { + "version": "3.4.38", + "resolved": "https://registry.npmjs.org/@types/connect/-/connect-3.4.38.tgz", + "integrity": "sha512-K6uROf1LD88uDQqJCktA4yzL1YYAK6NgfsI0v/mTgyPKWsX1CnJ0XPSDhViejru1GcRkLWb8RlzFYJRqGUbaug==", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@types/content-type": { "version": "1.1.8", "license": "MIT" }, + "node_modules/@types/cors": { + "version": "2.8.17", + "resolved": "https://registry.npmjs.org/@types/cors/-/cors-2.8.17.tgz", + "integrity": "sha512-8CGDvrBj1zgo2qE+oS3pOCyYNqCPryMWY2bGfwA0dcfopWGgxs+78df0Rs3rc9THP4JkOhLsAa+15VdpAqkcUA==", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, + "node_modules/@types/express": { + "version": "4.17.21", + "resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.21.tgz", + "integrity": "sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==", + "dev": true, + "dependencies": { + "@types/body-parser": "*", + "@types/express-serve-static-core": "^4.17.33", + "@types/qs": "*", + "@types/serve-static": "*" + } + }, + "node_modules/@types/express-fileupload": { + "version": "1.4.4", + "resolved": "https://registry.npmjs.org/@types/express-fileupload/-/express-fileupload-1.4.4.tgz", + "integrity": "sha512-kxCs5oJ40JPhvh3LpxCeGfuSZIl8/6bk85u1YqNcIbfQCmUm3u+Ao1oOiSt/VdbEPs+V3JQg8giqxAyqXlpbWg==", + "dev": true, + "dependencies": { + "@types/busboy": "*", + "@types/express": "*" + } + }, + "node_modules/@types/express-serve-static-core": { + "version": "4.17.41", + "resolved": "https://registry.npmjs.org/@types/express-serve-static-core/-/express-serve-static-core-4.17.41.tgz", + "integrity": "sha512-OaJ7XLaelTgrvlZD8/aa0vvvxZdUmlCn6MtWeB7TkiKW70BQLc9XEPpDLPdbo52ZhXUCrznlWdCHWxJWtdyajA==", + "dev": true, + "dependencies": { + "@types/node": "*", + "@types/qs": "*", + "@types/range-parser": "*", + "@types/send": "*" + } + }, "node_modules/@types/http-cache-semantics": { "version": "4.0.4", "license": "MIT" }, + "node_modules/@types/http-errors": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.4.tgz", + "integrity": "sha512-D0CFMMtydbJAegzOyHjtiKPLlvnm3iTZyZRSZoLq2mRhDdmLfIWOCYPfQJ4cu2erKghU++QvjcUjp/5h7hESpA==", + "dev": true + }, "node_modules/@types/inquirer": { "version": "9.0.7", "dev": true, @@ -855,6 +938,12 @@ "parse5": "^7.0.0" } }, + "node_modules/@types/mime": { + "version": "1.3.5", + "resolved": "https://registry.npmjs.org/@types/mime/-/mime-1.3.5.tgz", + "integrity": "sha512-/pyBZWSLD2n0dcHE3hq8s8ZvcETHtEuF+3E7XVt0Ig2nvsVQXdghHVcEkIWjy9A0wKfTn97a/PSDYohKIlnP/w==", + "dev": true + }, "node_modules/@types/node": { "version": "20.9.2", "license": "MIT", @@ -862,6 +951,39 @@ "undici-types": "~5.26.4" } }, + "node_modules/@types/qs": { + "version": "6.9.10", + "resolved": "https://registry.npmjs.org/@types/qs/-/qs-6.9.10.tgz", + "integrity": "sha512-3Gnx08Ns1sEoCrWssEgTSJs/rsT2vhGP+Ja9cnnk9k4ALxinORlQneLXFeFKOTJMOeZUFD1s7w+w2AphTpvzZw==", + "dev": true + }, + "node_modules/@types/range-parser": { + "version": "1.2.7", + "resolved": "https://registry.npmjs.org/@types/range-parser/-/range-parser-1.2.7.tgz", + "integrity": "sha512-hKormJbkJqzQGhziax5PItDUTMAM9uE2XXQmM37dyd4hVM+5aVl7oVxMVUiVQn2oCQFN/LKCZdvSM0pFRqbSmQ==", + "dev": true + }, + "node_modules/@types/send": { + "version": "0.17.4", + "resolved": "https://registry.npmjs.org/@types/send/-/send-0.17.4.tgz", + "integrity": "sha512-x2EM6TJOybec7c52BX0ZspPodMsQUd5L6PRwOunVyVUhXiBSKf3AezDL8Dgvgt5o0UfKNfuA0eMLr2wLT4AiBA==", + "dev": true, + "dependencies": { + "@types/mime": "^1", + "@types/node": "*" + } + }, + "node_modules/@types/serve-static": { + "version": "1.15.5", + "resolved": "https://registry.npmjs.org/@types/serve-static/-/serve-static-1.15.5.tgz", + "integrity": "sha512-PDRk21MnK70hja/YF8AHfC7yIsiQHn1rcXx7ijCFBX/k+XQJhQT/gw3xekXKJvx+5SXaMMS8oqQy09Mzvz2TuQ==", + "dev": true, + "dependencies": { + "@types/http-errors": "*", + "@types/mime": "*", + "@types/node": "*" + } + }, "node_modules/@types/through": { "version": "0.0.33", "dev": true, @@ -886,6 +1008,18 @@ "version": "2.0.6", "license": "BSD-3-Clause" }, + "node_modules/accepts": { + "version": "1.3.8", + "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.8.tgz", + "integrity": "sha512-PYAthTa2m2VKxuvSD3DPC/Gy+U+sOA1LAuT8mkmRuvw+NACSaeXEQ+NHcVF7rONl6qcaxV3Uuemwawk+7+SJLw==", + "dependencies": { + "mime-types": "~2.1.34", + "negotiator": "0.6.3" + }, + "engines": { + "node": ">= 0.6" + } + }, "node_modules/acorn": { "version": "8.11.2", "dev": true, @@ -977,6 +1111,11 @@ "dev": true, "license": "MIT" }, + "node_modules/array-flatten": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", + "integrity": "sha512-PCVAQswWemu6UdxsDFFX/+gVeYqKAod3D3UVm91jHwynguOwAvYPhx8nNlM++NqRcK6CxxpUafjmhIdKiHibqg==" + }, "node_modules/asynckit": { "version": "0.4.0", "license": "MIT" @@ -1012,6 +1151,53 @@ "readable-stream": "^3.4.0" } }, + "node_modules/body-parser": { + "version": "1.20.1", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.1.tgz", + "integrity": "sha512-jWi7abTbYwajOytWCQc37VulmWiRae5RyTpaCyDcS5/lMdtwSz5lOpDE67srw/HYe35f1z3fDQw+3txg7gNtWw==", + "dependencies": { + "bytes": "3.1.2", + "content-type": "~1.0.4", + "debug": "2.6.9", + "depd": "2.0.0", + "destroy": "1.2.0", + "http-errors": "2.0.0", + "iconv-lite": "0.4.24", + "on-finished": "2.4.1", + "qs": "6.11.0", + "raw-body": "2.5.1", + "type-is": "~1.6.18", + "unpipe": "1.0.0" + }, + "engines": { + "node": ">= 0.8", + "npm": "1.2.8000 || >= 1.4.16" + } + }, + "node_modules/body-parser/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/body-parser/node_modules/iconv-lite": { + "version": "0.4.24", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", + "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/body-parser/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==" + }, "node_modules/boolbase": { "version": "1.0.0", "license": "ISC" @@ -1075,6 +1261,25 @@ "ieee754": "^1.1.13" } }, + "node_modules/busboy": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/busboy/-/busboy-1.6.0.tgz", + "integrity": "sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA==", + "dependencies": { + "streamsearch": "^1.1.0" + }, + "engines": { + "node": ">=10.16.0" + } + }, + "node_modules/bytes": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", + "integrity": "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==", + "engines": { + "node": ">= 0.8" + } + }, "node_modules/cacheable-lookup": { "version": "7.0.0", "license": "MIT", @@ -1292,6 +1497,17 @@ "node": ">=16" } }, + "node_modules/content-disposition": { + "version": "0.5.4", + "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.4.tgz", + "integrity": "sha512-FveZTNuGw04cxlAiWbzi6zTAL/lhehaWbTtgluJh4/E95DqMwTmha3KZN1aAWA8cFIhHzMZUvLevkw5Rqk+tSQ==", + "dependencies": { + "safe-buffer": "5.2.1" + }, + "engines": { + "node": ">= 0.6" + } + }, "node_modules/content-type": { "version": "1.0.5", "license": "MIT", @@ -1299,6 +1515,31 @@ "node": ">= 0.6" } }, + "node_modules/cookie": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.5.0.tgz", + "integrity": "sha512-YZ3GUyn/o8gfKJlnlX7g7xq4gyO6OSuhGPKaaGssGB2qgDUS0gPgtTvoyZLTt9Ab6dC4hfc9dV5arkvc/OCmrw==", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/cookie-signature": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz", + "integrity": "sha512-QADzlaHc8icV8I7vbaJXJwod9HWYp8uCqf1xa4OfNu1T7JVxQIrUgOWtHdNDtPiywmFbiS12VjotIXLrKM3orQ==" + }, + "node_modules/cors": { + "version": "2.8.5", + "resolved": "https://registry.npmjs.org/cors/-/cors-2.8.5.tgz", + "integrity": "sha512-KIHbLJqu73RGr/hnbrO9uBeixNGuvSQjul/jdFvS/KFSIH1hWVd1ng7zOHx+YrEfInLG7q4n6GHQ9cDtxv/P6g==", + "dependencies": { + "object-assign": "^4", + "vary": "^1" + }, + "engines": { + "node": ">= 0.10" + } + }, "node_modules/crawlee": { "version": "3.6.1", "license": "Apache-2.0", @@ -1486,6 +1727,23 @@ "node": ">=0.4.0" } }, + "node_modules/depd": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/depd/-/depd-2.0.0.tgz", + "integrity": "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/destroy": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.2.0.tgz", + "integrity": "sha512-2sJGJTaXIIaR1w4iJSNoN0hnMY7Gpc/n8D4qSCJw8QqFWXf7cuAgnEHxBpweaVcPevC2l3KpjYCx3NypQQgaJg==", + "engines": { + "node": ">= 0.8", + "npm": "1.2.8000 || >= 1.4.16" + } + }, "node_modules/devtools-protocol": { "version": "0.0.1226504", "license": "BSD-3-Clause" @@ -1576,6 +1834,11 @@ "version": "0.2.0", "license": "MIT" }, + "node_modules/ee-first": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz", + "integrity": "sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==" + }, "node_modules/electron-to-chromium": { "version": "1.4.588", "license": "ISC" @@ -1584,6 +1847,14 @@ "version": "8.0.0", "license": "MIT" }, + "node_modules/encodeurl": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz", + "integrity": "sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==", + "engines": { + "node": ">= 0.8" + } + }, "node_modules/entities": { "version": "4.5.0", "license": "BSD-2-Clause", @@ -1601,6 +1872,11 @@ "node": ">=6" } }, + "node_modules/escape-html": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz", + "integrity": "sha512-NiSupZ4OeuGwr68lGIeym/ksIZMJodUGOSCZ/FSnTxcrekbvqrgdUxlJOMpijaKZVjAJrWrGs/6Jy8OMuyj9ow==" + }, "node_modules/escape-string-regexp": { "version": "5.0.0", "license": "MIT", @@ -1611,6 +1887,14 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/etag": { + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz", + "integrity": "sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg==", + "engines": { + "node": ">= 0.6" + } + }, "node_modules/event-stream": { "version": "3.3.4", "license": "MIT", @@ -1624,6 +1908,71 @@ "through": "~2.3.1" } }, + "node_modules/express": { + "version": "4.18.2", + "resolved": "https://registry.npmjs.org/express/-/express-4.18.2.tgz", + "integrity": "sha512-5/PsL6iGPdfQ/lKM1UuielYgv3BUoJfz1aUwU9vHZ+J7gyvwdQXFEBIEIaxeGf0GIcreATNyBExtalisDbuMqQ==", + "dependencies": { + "accepts": "~1.3.8", + "array-flatten": "1.1.1", + "body-parser": "1.20.1", + "content-disposition": "0.5.4", + "content-type": "~1.0.4", + "cookie": "0.5.0", + "cookie-signature": "1.0.6", + "debug": "2.6.9", + "depd": "2.0.0", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "etag": "~1.8.1", + "finalhandler": "1.2.0", + "fresh": "0.5.2", + "http-errors": "2.0.0", + "merge-descriptors": "1.0.1", + "methods": "~1.1.2", + "on-finished": "2.4.1", + "parseurl": "~1.3.3", + "path-to-regexp": "0.1.7", + "proxy-addr": "~2.0.7", + "qs": "6.11.0", + "range-parser": "~1.2.1", + "safe-buffer": "5.2.1", + "send": "0.18.0", + "serve-static": "1.15.0", + "setprototypeof": "1.2.0", + "statuses": "2.0.1", + "type-is": "~1.6.18", + "utils-merge": "1.0.1", + "vary": "~1.1.2" + }, + "engines": { + "node": ">= 0.10.0" + } + }, + "node_modules/express-fileupload": { + "version": "1.4.3", + "resolved": "https://registry.npmjs.org/express-fileupload/-/express-fileupload-1.4.3.tgz", + "integrity": "sha512-vRzZo2YELm68DfR/CX8RMXgeK9BTAANxigrKACPjCXFGEzkCt/QWbqaIXP3W61uaX/hLj0CAo3/EVelpSQXkqA==", + "dependencies": { + "busboy": "^1.6.0" + }, + "engines": { + "node": ">=12.0.0" + } + }, + "node_modules/express/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/express/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==" + }, "node_modules/external-editor": { "version": "3.1.0", "license": "MIT", @@ -1674,6 +2023,36 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/finalhandler": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.2.0.tgz", + "integrity": "sha512-5uXcUVftlQMFnWC9qu/svkWv3GTd2PfUhK/3PLkYNAe7FbqJMt3515HaxE6eRL74GdsriiwujiawdaB1BpEISg==", + "dependencies": { + "debug": "2.6.9", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "on-finished": "2.4.1", + "parseurl": "~1.3.3", + "statuses": "2.0.1", + "unpipe": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/finalhandler/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/finalhandler/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==" + }, "node_modules/find-up": { "version": "4.1.0", "license": "MIT", @@ -1753,6 +2132,22 @@ "node": ">= 14.17" } }, + "node_modules/forwarded": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", + "integrity": "sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/fresh": { + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz", + "integrity": "sha512-zJ2mQYM18rEFOudeV4GShTGIQ7RbzA7ozbU9I/XBpm7kqgMywgmylMwXHxZJmkVoYkna9d2pVXVXPdYTP9ej8Q==", + "engines": { + "node": ">= 0.6" + } + }, "node_modules/from": { "version": "0.1.7", "license": "MIT" @@ -2079,6 +2474,21 @@ "version": "4.1.1", "license": "BSD-2-Clause" }, + "node_modules/http-errors": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz", + "integrity": "sha512-FtwrG/euBzaEjYeRqOgly7G0qviiXoJWnvEH2Z1plBdXgbyjv34pHTSb9zoeHMyDy33+DWy5Wt9Wo+TURtOYSQ==", + "dependencies": { + "depd": "2.0.0", + "inherits": "2.0.4", + "setprototypeof": "1.2.0", + "statuses": "2.0.1", + "toidentifier": "1.0.1" + }, + "engines": { + "node": ">= 0.8" + } + }, "node_modules/http-proxy-agent": { "version": "5.0.0", "license": "MIT", @@ -2190,6 +2600,14 @@ "node": ">=14.18.0" } }, + "node_modules/ipaddr.js": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz", + "integrity": "sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==", + "engines": { + "node": ">= 0.10" + } + }, "node_modules/is-fullwidth-code-point": { "version": "3.0.0", "license": "MIT", @@ -2416,6 +2834,38 @@ "node_modules/map-stream": { "version": "0.1.0" }, + "node_modules/media-typer": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", + "integrity": "sha512-dq+qelQ9akHpcOl/gUVRTxVIOkAJ1wR3QAvb4RsVjS8oVoFjDGTc679wJYmUmknUF5HwMLOgb5O+a3KxfWapPQ==", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/merge-descriptors": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz", + "integrity": "sha512-cCi6g3/Zr1iqQi6ySbseM1Xvooa98N0w31jzUYrXPX2xqObmFGHJ0tQ5u74H3mVh7wLouTseZyYIq39g8cNp1w==" + }, + "node_modules/methods": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz", + "integrity": "sha512-iclAHeNqNm68zFtnZ0e+1L2yUIdvzNoauKU4WBA3VvH/vPFieF7qfRlwUZU+DA9P9bPXIS90ulxoUoCH23sV2w==", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz", + "integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==", + "bin": { + "mime": "cli.js" + }, + "engines": { + "node": ">=4" + } + }, "node_modules/mime-db": { "version": "1.52.0", "license": "MIT", @@ -2497,6 +2947,14 @@ "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" } }, + "node_modules/negotiator": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.3.tgz", + "integrity": "sha512-+EUsqGPLsM+j/zdChZjsnX51g4XrHFOIXwfnCVPGlQk/k5giakcKsuxCObBRu6DSm9opw/O6slWbJdghQM4bBg==", + "engines": { + "node": ">= 0.6" + } + }, "node_modules/node-releases": { "version": "2.0.13", "license": "MIT" @@ -2525,6 +2983,33 @@ "version": "2.2.7", "license": "MIT" }, + "node_modules/object-assign": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", + "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/object-inspect": { + "version": "1.13.1", + "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.1.tgz", + "integrity": "sha512-5qoj1RUiKOMsCCNLV1CBiPYE10sziTsnmNxkAI/rZhiD63CF7IqdFGC/XzjWjpSgLf0LxXX3bDFIh0E18f6UhQ==", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/on-finished": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.4.1.tgz", + "integrity": "sha512-oVlzkg3ENAhCk2zdv7IJwd/QUD4z2RxRwpkcGY8psCVcCYZNq4wYnVWALHM+brtuJjePWiYF/ClmuDr8Ch5+kg==", + "dependencies": { + "ee-first": "1.1.1" + }, + "engines": { + "node": ">= 0.8" + } + }, "node_modules/onetime": { "version": "5.1.2", "license": "MIT", @@ -2684,6 +3169,14 @@ "url": "https://github.com/inikulin/parse5?sponsor=1" } }, + "node_modules/parseurl": { + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", + "integrity": "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==", + "engines": { + "node": ">= 0.8" + } + }, "node_modules/path-exists": { "version": "4.0.0", "license": "MIT", @@ -2712,6 +3205,11 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/path-to-regexp": { + "version": "0.1.7", + "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz", + "integrity": "sha512-5DFkuoqlv1uYQKxy8omFBeJPQcdoE07Kv2sferDCrAq1ohOU+MSDswDIbnx3YAM60qIOnYa53wBhXW0EbMonrQ==" + }, "node_modules/pause-stream": { "version": "0.0.11", "license": [ @@ -2789,6 +3287,18 @@ "version": "3.0.7", "license": "ISC" }, + "node_modules/proxy-addr": { + "version": "2.0.7", + "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz", + "integrity": "sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==", + "dependencies": { + "forwarded": "0.2.0", + "ipaddr.js": "1.9.1" + }, + "engines": { + "node": ">= 0.10" + } + }, "node_modules/proxy-chain": { "version": "2.4.0", "license": "Apache-2.0", @@ -2810,6 +3320,20 @@ "node": ">=6" } }, + "node_modules/qs": { + "version": "6.11.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.11.0.tgz", + "integrity": "sha512-MvjoMCJwEarSbUYk5O+nmoSzSutSsTwF85zcHPQ9OrlFoZOYIjaqBAJIqIXjptyD5vThxGq52Xu/MaJzRkIk4Q==", + "dependencies": { + "side-channel": "^1.0.4" + }, + "engines": { + "node": ">=0.6" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/querystringify": { "version": "2.2.0", "license": "MIT" @@ -2824,6 +3348,39 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/range-parser": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz", + "integrity": "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg==", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/raw-body": { + "version": "2.5.1", + "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.1.tgz", + "integrity": "sha512-qqJBtEyVgS0ZmPGdCFPWJ3FreoqvG4MVQln/kCgF7Olq95IbOp0/BWyMwbdtn4VTvkM8Y7khCQ2Xgk/tcrCXig==", + "dependencies": { + "bytes": "3.1.2", + "http-errors": "2.0.0", + "iconv-lite": "0.4.24", + "unpipe": "1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/raw-body/node_modules/iconv-lite": { + "version": "0.4.24", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", + "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/readable-stream": { "version": "3.6.2", "license": "MIT", @@ -2953,6 +3510,61 @@ "node": ">=v12.22.7" } }, + "node_modules/send": { + "version": "0.18.0", + "resolved": "https://registry.npmjs.org/send/-/send-0.18.0.tgz", + "integrity": "sha512-qqWzuOjSFOuqPjFe4NOsMLafToQQwBSOEpS+FwEt3A2V3vKubTquT3vmLTQpFgMXp8AlFWFuP1qKaJZOtPpVXg==", + "dependencies": { + "debug": "2.6.9", + "depd": "2.0.0", + "destroy": "1.2.0", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "etag": "~1.8.1", + "fresh": "0.5.2", + "http-errors": "2.0.0", + "mime": "1.6.0", + "ms": "2.1.3", + "on-finished": "2.4.1", + "range-parser": "~1.2.1", + "statuses": "2.0.1" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/send/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/send/node_modules/debug/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==" + }, + "node_modules/send/node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==" + }, + "node_modules/serve-static": { + "version": "1.15.0", + "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.15.0.tgz", + "integrity": "sha512-XGuRDNjXUijsUL0vl6nSD7cwURuzEgglbOaFuZM9g3kwDXOWVTck0jLzjPzGD+TazWbboZYu52/9/XPdUgne9g==", + "dependencies": { + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "parseurl": "~1.3.3", + "send": "0.18.0" + }, + "engines": { + "node": ">= 0.8.0" + } + }, "node_modules/set-function-length": { "version": "1.1.1", "license": "MIT", @@ -2966,6 +3578,11 @@ "node": ">= 0.4" } }, + "node_modules/setprototypeof": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.2.0.tgz", + "integrity": "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==" + }, "node_modules/shebang-command": { "version": "2.0.0", "license": "MIT", @@ -2983,6 +3600,19 @@ "node": ">=8" } }, + "node_modules/side-channel": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.4.tgz", + "integrity": "sha512-q5XPytqFEIKHkGdiMIrY10mvLRvnQh42/+GoBlFW3b2LXLE2xxJpZFdm94we0BaoV3RwJyGqg5wS7epxTv0Zvw==", + "dependencies": { + "call-bind": "^1.0.0", + "get-intrinsic": "^1.0.2", + "object-inspect": "^1.9.0" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/signal-exit": { "version": "4.1.0", "license": "ISC", @@ -3003,6 +3633,14 @@ "node": "*" } }, + "node_modules/statuses": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.1.tgz", + "integrity": "sha512-RwNA9Z/7PrK06rYLIzFMlaF+l73iwpzsqRIFgbMLbTcLD6cOao82TaWefPXQvB2fOC4AjuYSEndS7N/mTCbkdQ==", + "engines": { + "node": ">= 0.8" + } + }, "node_modules/stream-chain": { "version": "2.2.5", "license": "BSD-3-Clause" @@ -3021,6 +3659,14 @@ "stream-chain": "^2.2.5" } }, + "node_modules/streamsearch": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-1.1.0.tgz", + "integrity": "sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg==", + "engines": { + "node": ">=10.0.0" + } + }, "node_modules/string_decoder": { "version": "1.3.0", "license": "MIT", @@ -3096,6 +3742,14 @@ "node": ">=0.6.0" } }, + "node_modules/toidentifier": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.1.tgz", + "integrity": "sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA==", + "engines": { + "node": ">=0.6" + } + }, "node_modules/tough-cookie": { "version": "4.1.3", "license": "BSD-3-Clause", @@ -3182,6 +3836,18 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/type-is": { + "version": "1.6.18", + "resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz", + "integrity": "sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==", + "dependencies": { + "media-typer": "0.3.0", + "mime-types": "~2.1.24" + }, + "engines": { + "node": ">= 0.6" + } + }, "node_modules/typescript": { "version": "5.2.2", "dev": true, @@ -3209,6 +3875,14 @@ "node": ">= 10.0.0" } }, + "node_modules/unpipe": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz", + "integrity": "sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==", + "engines": { + "node": ">= 0.8" + } + }, "node_modules/update-browserslist-db": { "version": "1.0.13", "funding": [ @@ -3249,6 +3923,14 @@ "version": "1.0.2", "license": "MIT" }, + "node_modules/utils-merge": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz", + "integrity": "sha512-pMZTvIkT1d+TFGvDOqodOclx0QWkkgi6Tdoa8gC8ffGAAqz9pzPTZWAybbsHHoED/ztMtkv/VoYTYyShUn81hA==", + "engines": { + "node": ">= 0.4.0" + } + }, "node_modules/v8-compile-cache-lib": { "version": "3.0.1", "dev": true, @@ -3261,6 +3943,14 @@ "node": ">=0.10.0" } }, + "node_modules/vary": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz", + "integrity": "sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg==", + "engines": { + "node": ">= 0.8" + } + }, "node_modules/w3c-xmlserializer": { "version": "4.0.0", "license": "MIT", diff --git a/package.json b/package.json index 93dde756..6a1325b3 100644 --- a/package.json +++ b/package.json @@ -8,7 +8,10 @@ "description": "Crawl a site to generate knowledge files to create your own custom GPT", "dependencies": { "commander": "^11.1.0", + "cors": "^2.8.5", "crawlee": "^3.0.0", + "express": "^4.18.2", + "express-fileupload": "^1.4.3", "glob": "^10.3.10", "inquirer": "^9.2.12", "playwright": "*", @@ -16,6 +19,9 @@ }, "devDependencies": { "@apify/tsconfig": "^0.1.0", + "@types/cors": "^2.8.17", + "@types/express": "^4.17.21", + "@types/express-fileupload": "^1.4.4", "@types/inquirer": "^9.0.7", "@types/node": "^20.0.0", "ts-node": "^10.8.0", @@ -26,6 +32,7 @@ "start": "npm run start:dev", "start:cli": "NODE_ENV=development npm run build && node dist/src/cli.js", "start:dev": "NODE_ENV=development npm run build && node dist/src/main.js", + "start:server": "NODE_ENV=development npm run build && node dist/src/server.js", "start:prod": "node dist/main.js", "build": "tsc", "fmt": "prettier --write ." diff --git a/src/server.ts b/src/server.ts new file mode 100644 index 00000000..49a06323 --- /dev/null +++ b/src/server.ts @@ -0,0 +1,41 @@ +// file: app/src/api.ts + +import express from 'express'; +import cors from 'cors'; +import { readFile, writeFile } from 'fs/promises'; +import { crawl, write } from "./core.js"; +import { Config } from './config.js'; + +// Create a new express application instance +const app = express(); +const port = 3000; // You may want to make the port configurable + +// Enable JSON and file upload functionality +app.use(cors()); +app.use(express.json()); + +// Define a POST route to accept config and run the crawler +app.post('/crawl', async (req, res) => { + // Read the configuration file sent as form-data + const config: Config = req.body; + + // Placeholder for handling crawler events and operations + try { + await crawl(config); + await write(config); + + // Read the output file after crawling and send it in the response + const outputFileContent = await readFile(config.outputFileName, 'utf-8'); + res.contentType('application/json'); + return res.send(outputFileContent); + } catch (error) { + return res.status(500).json({ message: 'Error occurred during crawling', error }); + } +}); + +// Start the Express server +app.listen(port, () => { + console.log(`API server listening at http://localhost:${port}`); +}); + +export default app; From 39eded8279897cc371789b39d9c319e755ff63a3 Mon Sep 17 00:00:00 2001 From: Aditya Karnam Date: Mon, 27 Nov 2023 23:52:52 -0600 Subject: [PATCH 2/8] fix: config port and host --- .env.example | 3 +++ .gitignore | 3 ++- package-lock.json | 12 ++++++++++++ package.json | 2 ++ src/server.ts | 22 ++++++++-------------- 5 files changed, 27 insertions(+), 15 deletions(-) create mode 100644 .env.example diff --git a/.env.example b/.env.example new file mode 100644 index 00000000..4ad153b8 --- /dev/null +++ b/.env.example @@ -0,0 +1,3 @@ +API_PORT=5000 +API_HOST=localhost +NODE_ENV=development diff --git a/.gitignore b/.gitignore index 00e8a0c0..14142ee7 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ crawlee_storage storage # any output from the crawler -*.json \ No newline at end of file +*.json +.env \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index 75c4a212..c2924912 100644 --- a/package-lock.json +++ b/package-lock.json @@ -13,6 +13,7 @@ "commander": "^11.1.0", "cors": "^2.8.5", "crawlee": "^3.0.0", + "dotenv": "^16.3.1", "express": "^4.18.2", "express-fileupload": "^1.4.3", "glob": "^10.3.10", @@ -1826,6 +1827,17 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/dotenv": { + "version": "16.3.1", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.3.1.tgz", + "integrity": "sha512-IPzF4w4/Rd94bA9imS68tZBaYyBWSCE47V1RGuMrB94iyTOIEwRmVL2x/4An+6mETpLrKJ5hQkB8W4kFAadeIQ==", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/motdotla/dotenv?sponsor=1" + } + }, "node_modules/duplexer": { "version": "0.1.2", "license": "MIT" diff --git a/package.json b/package.json index 6a1325b3..cab14272 100644 --- a/package.json +++ b/package.json @@ -10,6 +10,7 @@ "commander": "^11.1.0", "cors": "^2.8.5", "crawlee": "^3.0.0", + "dotenv": "^16.3.1", "express": "^4.18.2", "express-fileupload": "^1.4.3", "glob": "^10.3.10", @@ -34,6 +35,7 @@ "start:dev": "NODE_ENV=development npm run build && node dist/src/main.js", "start:server": "NODE_ENV=development npm run build && node dist/src/server.js", "start:prod": "node dist/main.js", + "start:server:prod": "npm run build && node dist/src/server.js", "build": "tsc", "fmt": "prettier --write ." }, diff --git a/src/server.ts b/src/server.ts index 49a06323..a1d840bf 100644 --- a/src/server.ts +++ b/src/server.ts @@ -1,30 +1,25 @@ -// file: app/src/api.ts - import express from 'express'; import cors from 'cors'; -import { readFile, writeFile } from 'fs/promises'; +import { readFile } from 'fs/promises'; import { crawl, write } from "./core.js"; import { Config } from './config.js'; +import { configDotenv } from 'dotenv'; + +configDotenv(); -// Create a new express application instance const app = express(); -const port = 3000; // You may want to make the port configurable +const port = Number(process.env.API_PORT) || 3000; +const hostname = process.env.API_HOST || 'localhost'; -// Enable JSON and file upload functionality app.use(cors()); app.use(express.json()); // Define a POST route to accept config and run the crawler app.post('/crawl', async (req, res) => { - // Read the configuration file sent as form-data const config: Config = req.body; - - // Placeholder for handling crawler events and operations try { await crawl(config); await write(config); - - // Read the output file after crawling and send it in the response const outputFileContent = await readFile(config.outputFileName, 'utf-8'); res.contentType('application/json'); return res.send(outputFileContent); @@ -33,9 +28,8 @@ app.post('/crawl', async (req, res) => { } }); -// Start the Express server -app.listen(port, () => { - console.log(`API server listening at http://localhost:${port}`); +app.listen(port, hostname, () => { + console.log(`API server listening at http://${hostname}:${port}`); }); export default app; From 5cdafaaeac6bd6463df7559708a440285357c2ea Mon Sep 17 00:00:00 2001 From: Aditya Karnam Date: Tue, 28 Nov 2023 00:38:56 -0600 Subject: [PATCH 3/8] feat: config schema validation --- .env.example | 1 + package-lock.json | 44 ++++++++++++++++++++++++++++++++++++++++++++ package.json | 1 + src/config.ts | 18 ++++++++++++++++++ src/server.ts | 9 +++++---- 5 files changed, 69 insertions(+), 4 deletions(-) diff --git a/.env.example b/.env.example index 4ad153b8..31e212b6 100644 --- a/.env.example +++ b/.env.example @@ -1,3 +1,4 @@ API_PORT=5000 API_HOST=localhost +MAX_PAGES_TO_CRAWL=45 NODE_ENV=development diff --git a/package-lock.json b/package-lock.json index c2924912..0a874005 100644 --- a/package-lock.json +++ b/package-lock.json @@ -18,6 +18,7 @@ "express-fileupload": "^1.4.3", "glob": "^10.3.10", "inquirer": "^9.2.12", + "joi": "^17.11.0", "playwright": "*", "prettier": "^3.1.0" }, @@ -575,6 +576,19 @@ "node": ">=12" } }, + "node_modules/@hapi/hoek": { + "version": "9.3.0", + "resolved": "https://registry.npmjs.org/@hapi/hoek/-/hoek-9.3.0.tgz", + "integrity": "sha512-/c6rf4UJlmHlC9b5BaNvzAcFv7HZ2QHaV0D4/HNlBdvFnvQq8RI4kYdhyPCl7Xj+oWvTWQ8ujhqS53LIgAe6KQ==" + }, + "node_modules/@hapi/topo": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/@hapi/topo/-/topo-5.1.0.tgz", + "integrity": "sha512-foQZKJig7Ob0BMAYBfcJk8d77QtOe7Wo4ox7ff1lQYoNNAb6jwcY1ncdoy2e9wQZzvNy7ODZCYJkK8kzmcAnAg==", + "dependencies": { + "@hapi/hoek": "^9.0.0" + } + }, "node_modules/@isaacs/cliui": { "version": "8.0.2", "license": "ISC", @@ -789,6 +803,24 @@ "npm": ">=7.0.0" } }, + "node_modules/@sideway/address": { + "version": "4.1.4", + "resolved": "https://registry.npmjs.org/@sideway/address/-/address-4.1.4.tgz", + "integrity": "sha512-7vwq+rOHVWjyXxVlR76Agnvhy8I9rpzjosTESvmhNeXOXdZZB15Fl+TI9x1SiHZH5Jv2wTGduSxFDIaq0m3DUw==", + "dependencies": { + "@hapi/hoek": "^9.0.0" + } + }, + "node_modules/@sideway/formula": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/@sideway/formula/-/formula-3.0.1.tgz", + "integrity": "sha512-/poHZJJVjx3L+zVD6g9KgHfYnb443oi7wLu/XKojDviHy6HOEOA6z1Trk5aR1dGcmPenJEgb2sK2I80LeS3MIg==" + }, + "node_modules/@sideway/pinpoint": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/@sideway/pinpoint/-/pinpoint-2.0.0.tgz", + "integrity": "sha512-RNiOoTPkptFtSVzQevY/yWtZwf/RxyVnPy/OcA9HBM3MlGDnBEYL5B41H0MTn0Uec8Hi+2qUtTfG2WWZBmMejQ==" + }, "node_modules/@sindresorhus/is": { "version": "4.6.0", "license": "MIT", @@ -2675,6 +2707,18 @@ "@pkgjs/parseargs": "^0.11.0" } }, + "node_modules/joi": { + "version": "17.11.0", + "resolved": "https://registry.npmjs.org/joi/-/joi-17.11.0.tgz", + "integrity": "sha512-NgB+lZLNoqISVy1rZocE9PZI36bL/77ie924Ri43yEvi9GUUMPeyVIr8KdFTMUlby1p0PBYMk9spIxEUQYqrJQ==", + "dependencies": { + "@hapi/hoek": "^9.0.0", + "@hapi/topo": "^5.0.0", + "@sideway/address": "^4.1.3", + "@sideway/formula": "^3.0.1", + "@sideway/pinpoint": "^2.0.0" + } + }, "node_modules/jquery": { "version": "3.7.1", "license": "MIT" diff --git a/package.json b/package.json index cab14272..4afd87b0 100644 --- a/package.json +++ b/package.json @@ -15,6 +15,7 @@ "express-fileupload": "^1.4.3", "glob": "^10.3.10", "inquirer": "^9.2.12", + "joi": "^17.11.0", "playwright": "*", "prettier": "^3.1.0" }, diff --git a/src/config.ts b/src/config.ts index d42203d2..9873b04a 100644 --- a/src/config.ts +++ b/src/config.ts @@ -1,4 +1,8 @@ import type { Page } from "playwright"; +import Joi from "joi"; +import { configDotenv } from "dotenv"; + +configDotenv(); export type Config = { /** @@ -39,3 +43,17 @@ export type Config = { /** Optional timeout for waiting for a selector to appear */ waitForSelectorTimeout?: number; }; + +export const ConfigSchema = Joi.object({ + url: Joi.string().uri().required(), + match: Joi.alternatives(Joi.string().required(), Joi.array().items(Joi.string().required())).required(), + selector: Joi.string().default(''), + maxPagesToCrawl: Joi.number().integer().default(process.env.MAX_PAGES_TO_CRAWL || 50), + outputFileName: Joi.string().default('output.json'), + cookie: Joi.object({ + name: Joi.string().required(), + value: Joi.string().required(), + }).optional(), + onVisitPage: Joi.function().optional(), + waitForSelectorTimeout: Joi.number().optional(), +}); diff --git a/src/server.ts b/src/server.ts index a1d840bf..5882f2d9 100644 --- a/src/server.ts +++ b/src/server.ts @@ -2,7 +2,7 @@ import express from 'express'; import cors from 'cors'; import { readFile } from 'fs/promises'; import { crawl, write } from "./core.js"; -import { Config } from './config.js'; +import { Config, ConfigSchema } from './config.js'; import { configDotenv } from 'dotenv'; configDotenv(); @@ -18,9 +18,10 @@ app.use(express.json()); app.post('/crawl', async (req, res) => { const config: Config = req.body; try { - await crawl(config); - await write(config); - const outputFileContent = await readFile(config.outputFileName, 'utf-8'); + const validatedConfig = ConfigSchema.validate(config).value; + await crawl(validatedConfig); + await write(validatedConfig); + const outputFileContent = await readFile(validatedConfig.outputFileName, 'utf-8'); res.contentType('application/json'); return res.send(outputFileContent); } catch (error) { From 4508c38983cf92d800b1ec48a4c7adedea92a15d Mon Sep 17 00:00:00 2001 From: Aditya Karnam Date: Sun, 24 Dec 2023 18:01:37 -0600 Subject: [PATCH 4/8] feat: add swagger doc --- package-lock.json | 155 +++++++++++++++++++++++++++++++++++++++++++++- package.json | 5 +- src/server.ts | 4 ++ swagger.js | 14 +++++ 4 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 swagger.js diff --git a/package-lock.json b/package-lock.json index 0a874005..0dad474a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -20,7 +20,8 @@ "inquirer": "^9.2.12", "joi": "^17.11.0", "playwright": "*", - "prettier": "^3.1.0" + "prettier": "^3.1.0", + "swagger-ui-express": "^5.0.0" }, "bin": { "gpt-crawler": "dist/src/cli.js" @@ -32,6 +33,8 @@ "@types/express-fileupload": "^1.4.4", "@types/inquirer": "^9.0.7", "@types/node": "^20.0.0", + "@types/swagger-ui-express": "^4.1.6", + "swagger-autogen": "^2.23.7", "ts-node": "^10.8.0", "typescript": "^5.0.0" } @@ -1017,6 +1020,16 @@ "@types/node": "*" } }, + "node_modules/@types/swagger-ui-express": { + "version": "4.1.6", + "resolved": "https://registry.npmjs.org/@types/swagger-ui-express/-/swagger-ui-express-4.1.6.tgz", + "integrity": "sha512-UVSiGYXa5IzdJJG3hrc86e8KdZWLYxyEsVoUI4iPXc7CO4VZ3AfNP8d/8+hrDRIqz+HAaSMtZSqAsF3Nq2X/Dg==", + "dev": true, + "dependencies": { + "@types/express": "*", + "@types/serve-static": "*" + } + }, "node_modules/@types/through": { "version": "0.0.33", "dev": true, @@ -1530,6 +1543,12 @@ "node": ">=16" } }, + "node_modules/concat-map": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", + "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", + "dev": true + }, "node_modules/content-disposition": { "version": "0.5.4", "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.4.tgz", @@ -1724,6 +1743,15 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/deepmerge": { + "version": "4.3.1", + "resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz", + "integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/defaults": { "version": "1.0.4", "license": "MIT", @@ -2208,6 +2236,12 @@ "node": ">=14.14" } }, + "node_modules/fs.realpath": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", + "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==", + "dev": true + }, "node_modules/function-bind": { "version": "1.1.2", "license": "MIT", @@ -2616,6 +2650,16 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/inflight": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", + "integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==", + "dev": true, + "dependencies": { + "once": "^1.3.0", + "wrappy": "1" + } + }, "node_modules/inherits": { "version": "2.0.4", "license": "ISC" @@ -3066,6 +3110,15 @@ "node": ">= 0.8" } }, + "node_modules/once": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", + "dev": true, + "dependencies": { + "wrappy": "1" + } + }, "node_modules/onetime": { "version": "5.1.2", "license": "MIT", @@ -3240,6 +3293,15 @@ "node": ">=8" } }, + "node_modules/path-is-absolute": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", + "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/path-key": { "version": "3.1.1", "license": "MIT", @@ -3762,6 +3824,91 @@ "node": ">=8" } }, + "node_modules/swagger-autogen": { + "version": "2.23.7", + "resolved": "https://registry.npmjs.org/swagger-autogen/-/swagger-autogen-2.23.7.tgz", + "integrity": "sha512-vr7uRmuV0DCxWc0wokLJAwX3GwQFJ0jwN+AWk0hKxre2EZwusnkGSGdVFd82u7fQLgwSTnbWkxUL7HXuz5LTZQ==", + "dev": true, + "dependencies": { + "acorn": "^7.4.1", + "deepmerge": "^4.2.2", + "glob": "^7.1.7", + "json5": "^2.2.3" + } + }, + "node_modules/swagger-autogen/node_modules/acorn": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-7.4.1.tgz", + "integrity": "sha512-nQyp0o1/mNdbTO1PO6kHkwSrmgZ0MT/jCCpNiwbUjGoRN4dlBhqJtoQuCnEOKzgTVwg0ZWiCoQy6SxMebQVh8A==", + "dev": true, + "bin": { + "acorn": "bin/acorn" + }, + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/swagger-autogen/node_modules/brace-expansion": { + "version": "1.1.11", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", + "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", + "dev": true, + "dependencies": { + "balanced-match": "^1.0.0", + "concat-map": "0.0.1" + } + }, + "node_modules/swagger-autogen/node_modules/glob": { + "version": "7.2.3", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz", + "integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==", + "dev": true, + "dependencies": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.1.1", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + }, + "engines": { + "node": "*" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/swagger-autogen/node_modules/minimatch": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", + "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", + "dev": true, + "dependencies": { + "brace-expansion": "^1.1.7" + }, + "engines": { + "node": "*" + } + }, + "node_modules/swagger-ui-dist": { + "version": "5.10.5", + "resolved": "https://registry.npmjs.org/swagger-ui-dist/-/swagger-ui-dist-5.10.5.tgz", + "integrity": "sha512-Uv8E7hV/nXALQKgW86X1i58gl1O6DFg+Uq54sDwhYqucBBxj/47dLNw872TNILNlOTuPA6dRvUMGQdmlpaX8qQ==" + }, + "node_modules/swagger-ui-express": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/swagger-ui-express/-/swagger-ui-express-5.0.0.tgz", + "integrity": "sha512-tsU9tODVvhyfkNSvf03E6FAk+z+5cU3lXAzMy6Pv4av2Gt2xA0++fogwC4qo19XuFf6hdxevPuVCSKFuMHJhFA==", + "dependencies": { + "swagger-ui-dist": ">=5.0.0" + }, + "engines": { + "node": ">= v0.10.32" + }, + "peerDependencies": { + "express": ">=4.0.0 || >=5.0.0-beta" + } + }, "node_modules/symbol-tree": { "version": "3.2.4", "license": "MIT" @@ -4084,6 +4231,12 @@ "node": ">=8" } }, + "node_modules/wrappy": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==", + "dev": true + }, "node_modules/ws": { "version": "8.14.2", "license": "MIT", diff --git a/package.json b/package.json index 4afd87b0..2fabf7d3 100644 --- a/package.json +++ b/package.json @@ -17,7 +17,8 @@ "inquirer": "^9.2.12", "joi": "^17.11.0", "playwright": "*", - "prettier": "^3.1.0" + "prettier": "^3.1.0", + "swagger-ui-express": "^5.0.0" }, "devDependencies": { "@apify/tsconfig": "^0.1.0", @@ -26,6 +27,8 @@ "@types/express-fileupload": "^1.4.4", "@types/inquirer": "^9.0.7", "@types/node": "^20.0.0", + "@types/swagger-ui-express": "^4.1.6", + "swagger-autogen": "^2.23.7", "ts-node": "^10.8.0", "typescript": "^5.0.0" }, diff --git a/src/server.ts b/src/server.ts index 5882f2d9..0d31146c 100644 --- a/src/server.ts +++ b/src/server.ts @@ -4,6 +4,9 @@ import { readFile } from 'fs/promises'; import { crawl, write } from "./core.js"; import { Config, ConfigSchema } from './config.js'; import { configDotenv } from 'dotenv'; +import swaggerUi from 'swagger-ui-express'; +// @ts-ignore +import swaggerDocument from '../swagger-output.json' assert { type: 'json' }; configDotenv(); @@ -13,6 +16,7 @@ const hostname = process.env.API_HOST || 'localhost'; app.use(cors()); app.use(express.json()); +app.use('/api-docs', swaggerUi.serve, swaggerUi.setup(swaggerDocument)); // Define a POST route to accept config and run the crawler app.post('/crawl', async (req, res) => { diff --git a/swagger.js b/swagger.js new file mode 100644 index 00000000..67a411c5 --- /dev/null +++ b/swagger.js @@ -0,0 +1,14 @@ +import swaggerAutogen from 'swagger-autogen'; + +const doc = { + info: { + title: 'GPT Crawler API', + description: 'GPT Crawler' + }, + host: 'localhost:5000' +}; + +const outputFile = 'swagger-output.json'; +const routes = ['./src/server.ts']; + +swaggerAutogen()(outputFile, routes, doc); From af2a44e7fd24b0cc4320b4dacd12ed5eefe9af09 Mon Sep 17 00:00:00 2001 From: Aditya Karnam Date: Sun, 24 Dec 2023 18:23:36 -0600 Subject: [PATCH 5/8] fix: use z instead of joi --- package-lock.json | 87 ----------------------------------------------- package.json | 1 - src/server.ts | 4 +-- 3 files changed, 2 insertions(+), 90 deletions(-) diff --git a/package-lock.json b/package-lock.json index 4690c38b..3cb78c8a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -20,7 +20,6 @@ "glob": "^10.3.10", "gpt-tokenizer": "^2.1.2", "inquirer": "^9.2.12", - "joi": "^17.11.0", "playwright": "*", "prettier": "^3.1.0", "swagger-ui-express": "^5.0.0" @@ -718,19 +717,6 @@ "node": ">=12" } }, - "node_modules/@hapi/hoek": { - "version": "9.3.0", - "resolved": "https://registry.npmjs.org/@hapi/hoek/-/hoek-9.3.0.tgz", - "integrity": "sha512-/c6rf4UJlmHlC9b5BaNvzAcFv7HZ2QHaV0D4/HNlBdvFnvQq8RI4kYdhyPCl7Xj+oWvTWQ8ujhqS53LIgAe6KQ==" - }, - "node_modules/@hapi/topo": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/@hapi/topo/-/topo-5.1.0.tgz", - "integrity": "sha512-foQZKJig7Ob0BMAYBfcJk8d77QtOe7Wo4ox7ff1lQYoNNAb6jwcY1ncdoy2e9wQZzvNy7ODZCYJkK8kzmcAnAg==", - "dependencies": { - "@hapi/hoek": "^9.0.0" - } - }, "node_modules/@isaacs/cliui": { "version": "8.0.2", "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz", @@ -1509,24 +1495,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/@sideway/address": { - "version": "4.1.4", - "resolved": "https://registry.npmjs.org/@sideway/address/-/address-4.1.4.tgz", - "integrity": "sha512-7vwq+rOHVWjyXxVlR76Agnvhy8I9rpzjosTESvmhNeXOXdZZB15Fl+TI9x1SiHZH5Jv2wTGduSxFDIaq0m3DUw==", - "dependencies": { - "@hapi/hoek": "^9.0.0" - } - }, - "node_modules/@sideway/formula": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/@sideway/formula/-/formula-3.0.1.tgz", - "integrity": "sha512-/poHZJJVjx3L+zVD6g9KgHfYnb443oi7wLu/XKojDviHy6HOEOA6z1Trk5aR1dGcmPenJEgb2sK2I80LeS3MIg==" - }, - "node_modules/@sideway/pinpoint": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/@sideway/pinpoint/-/pinpoint-2.0.0.tgz", - "integrity": "sha512-RNiOoTPkptFtSVzQevY/yWtZwf/RxyVnPy/OcA9HBM3MlGDnBEYL5B41H0MTn0Uec8Hi+2qUtTfG2WWZBmMejQ==" - }, "node_modules/@sindresorhus/is": { "version": "5.6.0", "resolved": "https://registry.npmjs.org/@sindresorhus/is/-/is-5.6.0.tgz", @@ -4568,18 +4536,6 @@ "node": ">= 0.6.0" } }, - "node_modules/joi": { - "version": "17.11.0", - "resolved": "https://registry.npmjs.org/joi/-/joi-17.11.0.tgz", - "integrity": "sha512-NgB+lZLNoqISVy1rZocE9PZI36bL/77ie924Ri43yEvi9GUUMPeyVIr8KdFTMUlby1p0PBYMk9spIxEUQYqrJQ==", - "dependencies": { - "@hapi/hoek": "^9.0.0", - "@hapi/topo": "^5.0.0", - "@sideway/address": "^4.1.3", - "@sideway/formula": "^3.0.1", - "@sideway/pinpoint": "^2.0.0" - } - }, "node_modules/jquery": { "version": "3.7.1", "resolved": "https://registry.npmjs.org/jquery/-/jquery-3.7.1.tgz", @@ -11357,19 +11313,6 @@ "@jridgewell/trace-mapping": "0.3.9" } }, - "@hapi/hoek": { - "version": "9.3.0", - "resolved": "https://registry.npmjs.org/@hapi/hoek/-/hoek-9.3.0.tgz", - "integrity": "sha512-/c6rf4UJlmHlC9b5BaNvzAcFv7HZ2QHaV0D4/HNlBdvFnvQq8RI4kYdhyPCl7Xj+oWvTWQ8ujhqS53LIgAe6KQ==" - }, - "@hapi/topo": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/@hapi/topo/-/topo-5.1.0.tgz", - "integrity": "sha512-foQZKJig7Ob0BMAYBfcJk8d77QtOe7Wo4ox7ff1lQYoNNAb6jwcY1ncdoy2e9wQZzvNy7ODZCYJkK8kzmcAnAg==", - "requires": { - "@hapi/hoek": "^9.0.0" - } - }, "@isaacs/cliui": { "version": "8.0.2", "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz", @@ -11911,24 +11854,6 @@ } } }, - "@sideway/address": { - "version": "4.1.4", - "resolved": "https://registry.npmjs.org/@sideway/address/-/address-4.1.4.tgz", - "integrity": "sha512-7vwq+rOHVWjyXxVlR76Agnvhy8I9rpzjosTESvmhNeXOXdZZB15Fl+TI9x1SiHZH5Jv2wTGduSxFDIaq0m3DUw==", - "requires": { - "@hapi/hoek": "^9.0.0" - } - }, - "@sideway/formula": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/@sideway/formula/-/formula-3.0.1.tgz", - "integrity": "sha512-/poHZJJVjx3L+zVD6g9KgHfYnb443oi7wLu/XKojDviHy6HOEOA6z1Trk5aR1dGcmPenJEgb2sK2I80LeS3MIg==" - }, - "@sideway/pinpoint": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/@sideway/pinpoint/-/pinpoint-2.0.0.tgz", - "integrity": "sha512-RNiOoTPkptFtSVzQevY/yWtZwf/RxyVnPy/OcA9HBM3MlGDnBEYL5B41H0MTn0Uec8Hi+2qUtTfG2WWZBmMejQ==" - }, "@sindresorhus/is": { "version": "5.6.0", "resolved": "https://registry.npmjs.org/@sindresorhus/is/-/is-5.6.0.tgz", @@ -14137,18 +14062,6 @@ "integrity": "sha512-qjdpeo2yKlYTH7nFdK0vbZWuTCesk4o63v5iVOlhMQPfuIZQfW/HI35SjfhA+4qpg36rnFSvUK5b1m+ckIblQQ==", "dev": true }, - "joi": { - "version": "17.11.0", - "resolved": "https://registry.npmjs.org/joi/-/joi-17.11.0.tgz", - "integrity": "sha512-NgB+lZLNoqISVy1rZocE9PZI36bL/77ie924Ri43yEvi9GUUMPeyVIr8KdFTMUlby1p0PBYMk9spIxEUQYqrJQ==", - "requires": { - "@hapi/hoek": "^9.0.0", - "@hapi/topo": "^5.0.0", - "@sideway/address": "^4.1.3", - "@sideway/formula": "^3.0.1", - "@sideway/pinpoint": "^2.0.0" - } - }, "jquery": { "version": "3.7.1", "resolved": "https://registry.npmjs.org/jquery/-/jquery-3.7.1.tgz", diff --git a/package.json b/package.json index 84c6c747..4e5883f2 100644 --- a/package.json +++ b/package.json @@ -17,7 +17,6 @@ "glob": "^10.3.10", "gpt-tokenizer": "^2.1.2", "inquirer": "^9.2.12", - "joi": "^17.11.0", "playwright": "*", "prettier": "^3.1.0", "swagger-ui-express": "^5.0.0" diff --git a/src/server.ts b/src/server.ts index 0d31146c..ccf02a63 100644 --- a/src/server.ts +++ b/src/server.ts @@ -2,7 +2,7 @@ import express from 'express'; import cors from 'cors'; import { readFile } from 'fs/promises'; import { crawl, write } from "./core.js"; -import { Config, ConfigSchema } from './config.js'; +import { Config, configSchema } from './config.js'; import { configDotenv } from 'dotenv'; import swaggerUi from 'swagger-ui-express'; // @ts-ignore @@ -22,7 +22,7 @@ app.use('/api-docs', swaggerUi.serve, swaggerUi.setup(swaggerDocument)); app.post('/crawl', async (req, res) => { const config: Config = req.body; try { - const validatedConfig = ConfigSchema.validate(config).value; + const validatedConfig = configSchema.parse(config); await crawl(validatedConfig); await write(validatedConfig); const outputFileContent = await readFile(validatedConfig.outputFileName, 'utf-8'); From 9fc1402d46c161a0ee2c6e5528d7aa425ac89c90 Mon Sep 17 00:00:00 2001 From: Aditya Karnam Date: Sun, 24 Dec 2023 18:30:06 -0600 Subject: [PATCH 6/8] chore: remove joi import --- src/config.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/config.ts b/src/config.ts index 34351f60..d7ed7d15 100644 --- a/src/config.ts +++ b/src/config.ts @@ -1,5 +1,4 @@ import { z } from "zod"; -import Joi from "joi"; import type { Page } from "playwright"; import { configDotenv } from "dotenv"; From 8e15bb3b426101cc9832ee82f63bc9a19630e026 Mon Sep 17 00:00:00 2001 From: Aditya Karnam Date: Sun, 24 Dec 2023 18:34:02 -0600 Subject: [PATCH 7/8] fix: run prettier --- src/server.ts | 49 +++++++++++++++++++++++++++---------------------- swagger.js | 16 ++++++++-------- 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/src/server.ts b/src/server.ts index ccf02a63..bf3ff510 100644 --- a/src/server.ts +++ b/src/server.ts @@ -1,40 +1,45 @@ -import express from 'express'; -import cors from 'cors'; -import { readFile } from 'fs/promises'; +import express from "express"; +import cors from "cors"; +import { readFile } from "fs/promises"; import { crawl, write } from "./core.js"; -import { Config, configSchema } from './config.js'; -import { configDotenv } from 'dotenv'; -import swaggerUi from 'swagger-ui-express'; +import { Config, configSchema } from "./config.js"; +import { configDotenv } from "dotenv"; +import swaggerUi from "swagger-ui-express"; // @ts-ignore -import swaggerDocument from '../swagger-output.json' assert { type: 'json' }; +import swaggerDocument from "../swagger-output.json" assert { type: "json" }; configDotenv(); const app = express(); const port = Number(process.env.API_PORT) || 3000; -const hostname = process.env.API_HOST || 'localhost'; +const hostname = process.env.API_HOST || "localhost"; app.use(cors()); app.use(express.json()); -app.use('/api-docs', swaggerUi.serve, swaggerUi.setup(swaggerDocument)); +app.use("/api-docs", swaggerUi.serve, swaggerUi.setup(swaggerDocument)); // Define a POST route to accept config and run the crawler -app.post('/crawl', async (req, res) => { - const config: Config = req.body; - try { - const validatedConfig = configSchema.parse(config); - await crawl(validatedConfig); - await write(validatedConfig); - const outputFileContent = await readFile(validatedConfig.outputFileName, 'utf-8'); - res.contentType('application/json'); - return res.send(outputFileContent); - } catch (error) { - return res.status(500).json({ message: 'Error occurred during crawling', error }); - } +app.post("/crawl", async (req, res) => { + const config: Config = req.body; + try { + const validatedConfig = configSchema.parse(config); + await crawl(validatedConfig); + await write(validatedConfig); + const outputFileContent = await readFile( + validatedConfig.outputFileName, + "utf-8", + ); + res.contentType("application/json"); + return res.send(outputFileContent); + } catch (error) { + return res + .status(500) + .json({ message: "Error occurred during crawling", error }); + } }); app.listen(port, hostname, () => { - console.log(`API server listening at http://${hostname}:${port}`); + console.log(`API server listening at http://${hostname}:${port}`); }); export default app; diff --git a/swagger.js b/swagger.js index 67a411c5..3955887f 100644 --- a/swagger.js +++ b/swagger.js @@ -1,14 +1,14 @@ -import swaggerAutogen from 'swagger-autogen'; +import swaggerAutogen from "swagger-autogen"; const doc = { - info: { - title: 'GPT Crawler API', - description: 'GPT Crawler' - }, - host: 'localhost:5000' + info: { + title: "GPT Crawler API", + description: "GPT Crawler", + }, + host: "localhost:5000", }; -const outputFile = 'swagger-output.json'; -const routes = ['./src/server.ts']; +const outputFile = "swagger-output.json"; +const routes = ["./src/server.ts"]; swaggerAutogen()(outputFile, routes, doc); From 77071463d314e340395a53969729f08a3a07d1d3 Mon Sep 17 00:00:00 2001 From: Aditya Karnam Date: Sun, 24 Dec 2023 20:46:43 -0600 Subject: [PATCH 8/8] fix: refactor --- src/core.ts | 42 +++++++++++++++++++++++++++++++++++++++--- src/server.ts | 13 ++++++------- tsconfig.json | 1 + 3 files changed, 46 insertions(+), 10 deletions(-) diff --git a/src/core.ts b/src/core.ts index 8e03bbe5..aaa43efc 100644 --- a/src/core.ts +++ b/src/core.ts @@ -5,8 +5,10 @@ import { glob } from "glob"; import { Config, configSchema } from "./config.js"; import { Page } from "playwright"; import { isWithinTokenLimit } from "gpt-tokenizer"; +import { PathLike } from "fs"; let pageCounter = 0; +let crawler: PlaywrightCrawler; export function getPageHtml(page: Page, selector = "body") { return page.evaluate((selector) => { @@ -52,7 +54,7 @@ export async function crawl(config: Config) { if (process.env.NO_CRAWL !== "true") { // PlaywrightCrawler crawls the web using a headless // browser controlled by the Playwright library. - const crawler = new PlaywrightCrawler({ + crawler = new PlaywrightCrawler({ // Use the requestHandler to process each of the crawled pages. async requestHandler({ request, page, enqueueLinks, log, pushData }) { if (config.cookie) { @@ -143,6 +145,7 @@ export async function crawl(config: Config) { } export async function write(config: Config) { + let nextFileNameString: PathLike = ""; const jsonFiles = await glob("storage/datasets/default/*.json", { absolute: true, }); @@ -163,8 +166,14 @@ export async function write(config: Config) { `${config.outputFileName.replace(/\.json$/, "")}-${fileCounter}.json`; const writeBatchToFile = async (): Promise => { - await writeFile(nextFileName(), JSON.stringify(currentResults, null, 2)); - console.log(`Wrote ${currentResults.length} items to ${nextFileName()}`); + nextFileNameString = nextFileName(); + await writeFile( + nextFileNameString, + JSON.stringify(currentResults, null, 2), + ); + console.log( + `Wrote ${currentResults.length} items to ${nextFileNameString}`, + ); currentResults = []; currentSize = 0; fileCounter++; @@ -213,4 +222,31 @@ export async function write(config: Config) { if (currentResults.length > 0) { await writeBatchToFile(); } + + return nextFileNameString; } + +class GPTCrawlerCore { + config: Config; + + constructor(config: Config) { + this.config = config; + } + + async crawl() { + await crawl(this.config); + } + + async write(): Promise { + // we need to wait for the file path as the path can change + return new Promise((resolve, reject) => { + write(this.config) + .then((outputFilePath) => { + resolve(outputFilePath); + }) + .catch(reject); + }); + } +} + +export default GPTCrawlerCore; diff --git a/src/server.ts b/src/server.ts index bf3ff510..50497a99 100644 --- a/src/server.ts +++ b/src/server.ts @@ -1,12 +1,13 @@ import express from "express"; import cors from "cors"; import { readFile } from "fs/promises"; -import { crawl, write } from "./core.js"; import { Config, configSchema } from "./config.js"; import { configDotenv } from "dotenv"; import swaggerUi from "swagger-ui-express"; // @ts-ignore import swaggerDocument from "../swagger-output.json" assert { type: "json" }; +import GPTCrawlerCore from "./core.js"; +import { PathLike } from "fs"; configDotenv(); @@ -23,12 +24,10 @@ app.post("/crawl", async (req, res) => { const config: Config = req.body; try { const validatedConfig = configSchema.parse(config); - await crawl(validatedConfig); - await write(validatedConfig); - const outputFileContent = await readFile( - validatedConfig.outputFileName, - "utf-8", - ); + const crawler = new GPTCrawlerCore(validatedConfig); + await crawler.crawl(); + const outputFileName: PathLike = await crawler.write(); + const outputFileContent = await readFile(outputFileName, "utf-8"); res.contentType("application/json"); return res.send(outputFileContent); } catch (error) { diff --git a/tsconfig.json b/tsconfig.json index ddf6ba3b..a4193efa 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -4,6 +4,7 @@ "module": "ES2022", "target": "ES2022", "outDir": "dist", + "moduleResolution": "node", "resolveJsonModule": true, "noUnusedLocals": false, "skipLibCheck": true,