From a20a003c740307945ba752d31c3912ba485963a5 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Tue, 17 Dec 2024 12:12:22 -0500 Subject: [PATCH] revert to pdf parse --- apps/api/package.json | 2 +- apps/api/pnpm-lock.yaml | 309 +----------------- .../scraper/scrapeURL/engines/pdf/index.ts | 17 +- apps/api/tsconfig.json | 3 +- 4 files changed, 22 insertions(+), 309 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index 1ccfee5e2..c4e70901d 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -58,7 +58,6 @@ "@devil7softwares/pos": "^1.0.2", "@dqbd/tiktoken": "^1.0.17", "@nangohq/node": "^0.40.8", - "@opendocsg/pdf2md": "^0.2.1", "@sentry/cli": "^2.33.1", "@sentry/node": "^8.26.0", "@sentry/profiling-node": "^8.26.0", @@ -101,6 +100,7 @@ "mongoose": "^8.4.4", "natural": "^7.0.7", "openai": "^4.57.0", + "pdf-parse": "^1.1.1", "pos": "^0.4.2", "posthog-node": "^4.0.1", "promptable": "^0.0.10", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 9014c42e7..569eafd9c 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -29,9 +29,6 @@ importers: '@nangohq/node': specifier: ^0.40.8 version: 0.40.8 - '@opendocsg/pdf2md': - specifier: ^0.2.1 - version: 0.2.1 '@sentry/cli': specifier: ^2.33.1 version: 2.33.1 @@ -158,6 +155,9 @@ importers: openai: specifier: ^4.57.0 version: 4.57.0(zod@3.23.8) + pdf-parse: + specifier: ^1.1.1 + version: 1.1.1 pos: specifier: ^0.4.2 version: 0.4.2 @@ -794,10 +794,6 @@ packages: resolution: {integrity: sha512-cXWgKE3sdWLSqAa8ykbCcUsUF1Kyr5J3HOWYGuobhPEycXW4WI++d5DhzdpL238mzoEXTi90VqfSCra37l5YqA==} engines: {node: '>=18'} - '@mapbox/node-pre-gyp@1.0.11': - resolution: {integrity: sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==} - hasBin: true - '@mixmark-io/domino@2.2.0': resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==} @@ -841,10 +837,6 @@ packages: '@one-ini/wasm@0.1.1': resolution: {integrity: sha512-XuySG1E38YScSJoMlqovLru4KTUNSjgVTIjyh7qMX6aNN5HY5Ct5LhRJdxO79JtTzKfzV/bnWpz+zquYrISsvw==} - '@opendocsg/pdf2md@0.2.1': - resolution: {integrity: sha512-k/yvfrTb+GPTIIm/bMm5IsenTqAFl+IqvkBgFwFlmflS5TT7FOfyRLp8MypVWLAG4G9AnT7AZFbdQYgN/CR5BA==} - hasBin: true - '@opentelemetry/api-logs@0.52.1': resolution: {integrity: sha512-qnSqB2DQ9TPP96dl8cDubDvrUyWc0/sK81xHTK8eSUspzDM3bsewX903qclQFvVhgStjRWdC5bLb3kQqMkfV5A==} engines: {node: '>=14'} @@ -1623,9 +1615,6 @@ packages: resolution: {integrity: sha512-2WALfTl4xo2SkGCYRt6rDTFfk9R1czmBvUQy12gK2KuRKIpWEhcbbzy8EZXtz/jkRqHX8bFEc6FC1HjX4TUWYw==} engines: {node: '>=10.0.0'} - abbrev@1.1.1: - resolution: {integrity: sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==} - abbrev@2.0.0: resolution: {integrity: sha512-6/mh1E2u2YgEsCHdY0Yx5oW+61gZU+1vXaoiHHrpKeuRNNgFvS+/jrwHiQhB5apAf5oB7UB7E19ol2R2LKH8hQ==} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} @@ -1719,14 +1708,6 @@ packages: resolution: {integrity: sha512-KLy/ugo33KZA7nugtQ7O0E1c8kQ52N3IvD/XgIh4w/Nr28ypfkwDfA67F1ev4N1m5D+BOk1+b2dEJDfpj/VvZg==} engines: {node: '>=0.2.6'} - aproba@2.0.0: - resolution: {integrity: sha512-lYe4Gx7QT+MKGbDsA+Z+he/Wtef0BiwDOlK/XkBrdfsh9J/jPPXbX0tE9x9cl27Tmu5gg3QUbUrQYa/y+KOHPQ==} - - are-we-there-yet@2.0.0: - resolution: {integrity: sha512-Ci/qENmwHnsYo9xKIcUJN5LeDKdJ6R1Z1j9V/J5wyq8nh/mYPEpIKJbBZXtZjG04HiK7zV/p6Vs9952MrMeUIw==} - engines: {node: '>=10'} - deprecated: This package is no longer supported. - arg@4.1.3: resolution: {integrity: sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==} @@ -1927,10 +1908,6 @@ packages: caniuse-lite@1.0.30001627: resolution: {integrity: sha512-4zgNiB8nTyV/tHhwZrFs88ryjls/lHiqFhrxCW4qSTeuRByBVnPYpDInchOIySWknznucaf31Z4KYqjfbrecVw==} - canvas@2.11.2: - resolution: {integrity: sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw==} - engines: {node: '>=6'} - chalk@2.4.2: resolution: {integrity: sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==} engines: {node: '>=4'} @@ -1961,10 +1938,6 @@ packages: resolution: {integrity: sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==} engines: {node: '>= 8.10.0'} - chownr@2.0.0: - resolution: {integrity: sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ==} - engines: {node: '>=10'} - chownr@3.0.0: resolution: {integrity: sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==} engines: {node: '>=18'} @@ -2022,10 +1995,6 @@ packages: color-string@1.9.1: resolution: {integrity: sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==} - color-support@1.1.3: - resolution: {integrity: sha512-qiBjkpbMLO/HL68y+lh4q0/O1MZFj2RX6X/KmMa3+gJD3z+WwI1ZzDHysvqHGS3mP6mznPckpXmw1nI9cJjyRg==} - hasBin: true - color@3.2.1: resolution: {integrity: sha512-aBl7dZI9ENN6fUGC7mWpMTPNHmWUSNan9tuWN6ahh5ZLNk9baLJOnSMlrQkHcrfFgz2/RigjUVAjdx36VcemKA==} @@ -2052,9 +2021,6 @@ packages: config-chain@1.1.13: resolution: {integrity: sha512-qj+f8APARXHrM0hraqXYb2/bOVSV4PvJQlNZ/DVj0QrmNM2q2euizkeuVckQ57J+W0mRH6Hvi+k50M4Jul2VRQ==} - console-control-strings@1.1.0: - resolution: {integrity: sha512-ty/fTekppD2fIwRvnZAVdeOiGd1c7YXEixbgJTNzqcxJWKQnjJ/V1bNEEE6hygpM3WjwHFUVK6HTjWSzV4a8sQ==} - content-disposition@0.5.4: resolution: {integrity: sha512-FveZTNuGw04cxlAiWbzi6zTAL/lhehaWbTtgluJh4/E95DqMwTmha3KZN1aAWA8cFIhHzMZUvLevkw5Rqk+tSQ==} engines: {node: '>= 0.6'} @@ -2181,10 +2147,6 @@ packages: resolution: {integrity: sha512-9iE1PgSik9HeIIw2JO94IidnE3eBoQrFJ3w7sFuzSX4DpmZ3v5sZpUiV5Swcf6mQEF+Y0ru8Neo+p+nyh2J+hQ==} engines: {node: '>=10'} - decompress-response@4.2.1: - resolution: {integrity: sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw==} - engines: {node: '>=8'} - dedent@1.5.3: resolution: {integrity: sha512-NHQtfOOW68WD8lgypbLA5oT+Bt0xXJhiYvoR6SmmNXZfpzOGXwdKWmcwG8N7PwVVWV3eF/68nmD9BaJSsTBhyQ==} peerDependencies: @@ -2209,9 +2171,6 @@ packages: resolution: {integrity: sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==} engines: {node: '>=0.4.0'} - delegates@1.0.0: - resolution: {integrity: sha512-bd2L678uiWATM6m5Z1VzNCErI3jiGzt6HGY8OVICs40JQq/HALfbyNJmp0UDakEY4pMMaN0Ly5om/B1VI/+xfQ==} - denque@2.1.0: resolution: {integrity: sha512-HVQE3AAb/pxF8fQAoiqpvg9i3evqug3hoiwakOyZAwJm+6vZehbkYXZ0l4JxS+I3QxM97v5aaRNhj8v5oBhekw==} engines: {node: '>=0.10'} @@ -2324,9 +2283,6 @@ packages: resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==} engines: {node: '>=0.12'} - enumify@1.0.4: - resolution: {integrity: sha512-5mwWXaVzJaqyUdOW/PDH5QySRgmQ8VvujmxmvXoXj9w0n+6omhVuyD56eI37FMqy/LxueJzsQ4DrHVQzuT/TXg==} - env-paths@2.2.1: resolution: {integrity: sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==} engines: {node: '>=6'} @@ -2528,10 +2484,6 @@ packages: resolution: {integrity: sha512-PmDi3uwK5nFuXh7XDTlVnS17xJS7vW36is2+w3xcv8SVxiB4NyATf4ctkVY5bkSjX0Y4nbvZCq1/EjtEyr9ktw==} engines: {node: '>=14.14'} - fs-minipass@2.1.0: - resolution: {integrity: sha512-V/JgOLFCS+R6Vcq0slCuaeWEdNC3ouDlJMNIsacH2VtALiu9mV4LPrHc5cDl8k5aw6J8jwgWWpiTo5RYhmIzvg==} - engines: {node: '>= 8'} - fs.realpath@1.0.0: resolution: {integrity: sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==} @@ -2543,11 +2495,6 @@ packages: function-bind@1.1.2: resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==} - gauge@3.0.2: - resolution: {integrity: sha512-+5J6MS/5XksCuXq++uFRsnUd7Ovu1XenbeuIuNRJxYWjgQbPuFhT14lAvsWfqfAmnwluf1OwMjz39HjfLPci0Q==} - engines: {node: '>=10'} - deprecated: This package is no longer supported. - generic-pool@3.9.0: resolution: {integrity: sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==} engines: {node: '>= 4'} @@ -2631,9 +2578,6 @@ packages: resolution: {integrity: sha512-l3LCuF6MgDNwTDKkdYGEihYjt5pRPbEg46rtlmnSPlUbgmB8LOIrKJbYYFBSbnPaJexMKtiPO8hmeRjRz2Td+A==} engines: {node: '>= 0.4'} - has-unicode@2.0.1: - resolution: {integrity: sha512-8Rf9Y83NBReMnx0gFzA8JImQACstCYWUplepDa9xprwwtmgEZUF0h/i5xSA625zB/I37EtrswSST6OXxwaaIJQ==} - hasown@2.0.2: resolution: {integrity: sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==} engines: {node: '>= 0.4'} @@ -3312,10 +3256,6 @@ packages: resolution: {integrity: sha512-zobTr7akeGHnv7eBOXcRgMeCP6+uyYsczwmeRCauvpvaAltgNyTbLH/+VaEAPUeWBT+1GuNmz4wC/6jtQzbbVA==} engines: {node: '>=12'} - make-dir@3.1.0: - resolution: {integrity: sha512-g3FeP20LNwhALb/6Cz6Dd4F2ngze0jz7tbzrD2wAV+o9FeNHe4rL+yK2md0J/fiSf1sa1ADhXqi5+oVwOM/eGw==} - engines: {node: '>=8'} - make-dir@4.0.0: resolution: {integrity: sha512-hXdUTZYIVOt1Ex//jAQi+wTZZpUpwBj/0QsOzqegb3rGMMeJiSEu5xLHnYfBrRV4RH2+OCSOO95Is/7x1WJ4bw==} engines: {node: '>=10'} @@ -3386,10 +3326,6 @@ packages: resolution: {integrity: sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==} engines: {node: '>=6'} - mimic-response@2.1.0: - resolution: {integrity: sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA==} - engines: {node: '>=8'} - minimatch@3.1.2: resolution: {integrity: sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==} @@ -3408,22 +3344,10 @@ packages: minimist@1.2.8: resolution: {integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==} - minipass@3.3.6: - resolution: {integrity: sha512-DxiNidxSEK+tHG6zOIklvNOwm3hvCrbUrdtzY74U6HKTJxvIDfOUL5W5P2Ghd3DTkhhKPYGqeNUIh5qcM4YBfw==} - engines: {node: '>=8'} - - minipass@5.0.0: - resolution: {integrity: sha512-3FnjYuehv9k6ovOEbyOswadCDPX1piCfhV8ncmYtHOjuPwylVWsghTLo7rabjC3Rx5xD4HDx8Wm1xnMF7S5qFQ==} - engines: {node: '>=8'} - minipass@7.1.2: resolution: {integrity: sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==} engines: {node: '>=16 || 14 >=14.17'} - minizlib@2.1.2: - resolution: {integrity: sha512-bAxsR8BVfj60DWXHE3u30oHzfl4G7khkSuPW+qvpd7jFRHm7dLxOjUk1EHACJ/hxLY8phGJ0YhYHZo7jil7Qdg==} - engines: {node: '>= 8'} - minizlib@3.0.1: resolution: {integrity: sha512-umcy022ILvb5/3Djuu8LWeqUa8D68JaBzlttKeMWen48SjabqS3iY5w/vzeMzMUNhLDifyhbOwKDSznB1vvrwg==} engines: {node: '>= 18'} @@ -3435,11 +3359,6 @@ packages: resolution: {integrity: sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==} hasBin: true - mkdirp@1.0.4: - resolution: {integrity: sha512-vVqVZQyf3WLx2Shd0qJ9xuvqgAyKPLAiqITEtqW0oIUjzo3PePDd6fW9iFz30ef7Ysp/oiWqbhszeGWW2T6Gzw==} - engines: {node: '>=10'} - hasBin: true - mkdirp@3.0.1: resolution: {integrity: sha512-+NsyUUAZDmo6YVHzL/stxSu3t9YS1iljliy3BSDrXJ/dkn1KYdmtZODGGjLcc9XLgVVpH4KshHB8XmZgMhaBXg==} engines: {node: '>=10'} @@ -3528,9 +3447,6 @@ packages: resolution: {integrity: sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==} hasBin: true - nan@2.22.0: - resolution: {integrity: sha512-nbajikzWTMwsW+eSsNm3QwlOs7het9gGJU5dDZzRTQGk03vyBOauxgI4VakDzE0PtsGTmXPsXTbbjVhRwR5mpw==} - natural-compare@1.4.0: resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==} @@ -3591,11 +3507,6 @@ packages: engines: {node: '>=8.10.0'} hasBin: true - nopt@5.0.0: - resolution: {integrity: sha512-Tbj67rffqceeLpcRXrT7vKAN8CwfPeIBgM7E6iBkmKLV7bEMwpGgYLGv0jACUsECaa/vuxP0IjEont6umdMgtQ==} - engines: {node: '>=6'} - hasBin: true - nopt@7.2.1: resolution: {integrity: sha512-taM24ViiimT/XntxbPyJQzCG+p4EKOpgD3mxFwW38mGjVUrfERQOeY4EDHjdnptttfHuHQXFx+lTP08Q+mLa/w==} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} @@ -3613,10 +3524,6 @@ packages: resolution: {integrity: sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==} engines: {node: '>=8'} - npmlog@5.0.1: - resolution: {integrity: sha512-AqZtDUWOMKs1G/8lwylVjrdYgqA4d9nu8hc+0gzRxlDb1I10+FHBGMXs6aiQHFdCUUlqH99MUMuLfzWDNDtfxw==} - deprecated: This package is no longer supported. - nth-check@2.1.1: resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==} @@ -4042,11 +3949,6 @@ packages: resolution: {integrity: sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==} engines: {node: '>= 4'} - rimraf@3.0.2: - resolution: {integrity: sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==} - deprecated: Rimraf versions prior to v4 are no longer supported - hasBin: true - rimraf@5.0.7: resolution: {integrity: sha512-nV6YcJo5wbLW77m+8KjH8aB/7/rxQy9SZ0HY5shnwULfS+9nmTtVXAJET5NdZmCzA4fPI/Hm1wo/Po/4mopOdg==} engines: {node: '>=14.18'} @@ -4117,9 +4019,6 @@ packages: resolution: {integrity: sha512-XGuRDNjXUijsUL0vl6nSD7cwURuzEgglbOaFuZM9g3kwDXOWVTck0jLzjPzGD+TazWbboZYu52/9/XPdUgne9g==} engines: {node: '>= 0.8.0'} - set-blocking@2.0.0: - resolution: {integrity: sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==} - set-function-length@1.2.2: resolution: {integrity: sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==} engines: {node: '>= 0.4'} @@ -4158,12 +4057,6 @@ packages: resolution: {integrity: sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==} engines: {node: '>=14'} - simple-concat@1.0.1: - resolution: {integrity: sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==} - - simple-get@3.1.1: - resolution: {integrity: sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA==} - simple-swizzle@0.2.2: resolution: {integrity: sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==} @@ -4322,10 +4215,6 @@ packages: tar-stream@3.1.7: resolution: {integrity: sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==} - tar@6.2.1: - resolution: {integrity: sha512-DZ4yORTwrbTj/7MZYq2w+/ZFdI6OZ/f9SFHR+71gIVUZhOQPHzVCLpvRnPgyaMpfWxxk/4ONva3GQSyNIKRv6A==} - engines: {node: '>=10'} - tar@7.2.0: resolution: {integrity: sha512-hctwP0Nb4AB60bj8WQgRYaMOuJYRAPMGiQUAotms5igN8ppfQM+IvjQ5HcKu1MaZh2Wy2KWVTe563Yj8dfc14w==} engines: {node: '>=18'} @@ -4480,9 +4369,6 @@ packages: resolution: {integrity: sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==} engines: {node: '>= 10.0.0'} - unpdf@0.12.1: - resolution: {integrity: sha512-ktP8+TTLDBrlu/j8rQVNbHoMMpFXzkVAkb1rt/JdshFC3jOHdZjuGCNl/voPL0kraUrUOH7ZC88kVxMvlvDBzA==} - unpipe@1.0.0: resolution: {integrity: sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==} engines: {node: '>= 0.8'} @@ -4570,9 +4456,6 @@ packages: engines: {node: '>= 8'} hasBin: true - wide-align@1.1.5: - resolution: {integrity: sha512-eDMORYaPNZ4sQIuuYPDHdQvf4gyCF9rEEV/yPxGfwPkRodwEgiMUUXTx/dex+Me0wxx53S+NgUHaP7y3MGlDmg==} - winston-transport@4.8.0: resolution: {integrity: sha512-qxSTKswC6llEMZKgCQdaWgDuMJQnhuvF5f2Nk3SNXc4byfQ+voo2mX1Px9dkNOuR8p0KAjfPG29PuYUSIb+vSA==} engines: {node: '>= 12.0.0'} @@ -5724,22 +5607,6 @@ snapshots: - langchain - openai - '@mapbox/node-pre-gyp@1.0.11': - dependencies: - detect-libc: 2.0.3 - https-proxy-agent: 5.0.1 - make-dir: 3.1.0 - node-fetch: 2.7.0 - nopt: 5.0.0 - npmlog: 5.0.1 - rimraf: 3.0.2 - semver: 7.6.2 - tar: 6.2.1 - transitivePeerDependencies: - - encoding - - supports-color - optional: true - '@mixmark-io/domino@2.2.0': {} '@mongodb-js/saslprep@1.1.7': @@ -5772,15 +5639,6 @@ snapshots: '@one-ini/wasm@0.1.1': {} - '@opendocsg/pdf2md@0.2.1': - dependencies: - enumify: 1.0.4 - minimist: 1.2.8 - unpdf: 0.12.1 - transitivePeerDependencies: - - encoding - - supports-color - '@opentelemetry/api-logs@0.52.1': dependencies: '@opentelemetry/api': 1.9.0 @@ -6815,9 +6673,6 @@ snapshots: '@xmldom/xmldom@0.8.10': {} - abbrev@1.1.1: - optional: true - abbrev@2.0.0: {} abort-controller@3.0.0: @@ -6900,15 +6755,6 @@ snapshots: dependencies: sylvester: 0.0.12 - aproba@2.0.0: - optional: true - - are-we-there-yet@2.0.0: - dependencies: - delegates: 1.0.0 - readable-stream: 3.6.2 - optional: true - arg@4.1.3: {} argparse@1.0.10: @@ -7162,16 +7008,6 @@ snapshots: caniuse-lite@1.0.30001627: {} - canvas@2.11.2: - dependencies: - '@mapbox/node-pre-gyp': 1.0.11 - nan: 2.22.0 - simple-get: 3.1.1 - transitivePeerDependencies: - - encoding - - supports-color - optional: true - chalk@2.4.2: dependencies: ansi-styles: 3.2.1 @@ -7220,9 +7056,6 @@ snapshots: optionalDependencies: fsevents: 2.3.3 - chownr@2.0.0: - optional: true - chownr@3.0.0: {} chromium-bidi@0.5.24(devtools-protocol@0.0.1299070): @@ -7288,9 +7121,6 @@ snapshots: color-name: 1.1.4 simple-swizzle: 0.2.2 - color-support@1.1.3: - optional: true - color@3.2.1: dependencies: color-convert: 1.9.3 @@ -7318,9 +7148,6 @@ snapshots: ini: 1.3.8 proto-list: 1.2.4 - console-control-strings@1.1.0: - optional: true - content-disposition@0.5.4: dependencies: safe-buffer: 5.2.1 @@ -7428,11 +7255,6 @@ snapshots: decamelize@4.0.0: {} - decompress-response@4.2.1: - dependencies: - mimic-response: 2.1.0 - optional: true - dedent@1.5.3: {} deepmerge@4.3.1: {} @@ -7451,9 +7273,6 @@ snapshots: delayed-stream@1.0.0: {} - delegates@1.0.0: - optional: true - denque@2.1.0: {} depd@2.0.0: {} @@ -7545,8 +7364,6 @@ snapshots: entities@4.5.0: {} - enumify@1.0.4: {} - env-paths@2.2.1: {} error-ex@1.3.2: @@ -7661,7 +7478,7 @@ snapshots: extract-zip@2.0.1: dependencies: - debug: 4.3.4 + debug: 4.3.5 get-stream: 5.2.0 yauzl: 2.10.0 optionalDependencies: @@ -7772,11 +7589,6 @@ snapshots: jsonfile: 6.1.0 universalify: 2.0.1 - fs-minipass@2.1.0: - dependencies: - minipass: 3.3.6 - optional: true - fs.realpath@1.0.0: {} fsevents@2.3.3: @@ -7784,19 +7596,6 @@ snapshots: function-bind@1.1.2: {} - gauge@3.0.2: - dependencies: - aproba: 2.0.0 - color-support: 1.1.3 - console-control-strings: 1.1.0 - has-unicode: 2.0.1 - object-assign: 4.1.1 - signal-exit: 3.0.7 - string-width: 4.2.3 - strip-ansi: 6.0.1 - wide-align: 1.1.5 - optional: true - generic-pool@3.9.0: {} gensync@1.0.0-beta.2: {} @@ -7823,7 +7622,7 @@ snapshots: dependencies: basic-ftp: 5.0.5 data-uri-to-buffer: 6.0.2 - debug: 4.3.4 + debug: 4.3.5 fs-extra: 11.2.0 transitivePeerDependencies: - supports-color @@ -7884,9 +7683,6 @@ snapshots: has-symbols@1.0.3: {} - has-unicode@2.0.1: - optional: true - hasown@2.0.2: dependencies: function-bind: 1.1.2 @@ -7927,7 +7723,7 @@ snapshots: http-proxy-agent@7.0.2: dependencies: agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 transitivePeerDependencies: - supports-color @@ -7975,7 +7771,7 @@ snapshots: https-proxy-agent@7.0.5: dependencies: agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 transitivePeerDependencies: - supports-color @@ -8667,11 +8463,6 @@ snapshots: luxon@3.4.4: {} - make-dir@3.1.0: - dependencies: - semver: 6.3.1 - optional: true - make-dir@4.0.0: dependencies: semver: 7.6.2 @@ -8732,9 +8523,6 @@ snapshots: mimic-fn@2.1.0: {} - mimic-response@2.1.0: - optional: true - minimatch@3.1.2: dependencies: brace-expansion: 1.1.11 @@ -8753,22 +8541,8 @@ snapshots: minimist@1.2.8: {} - minipass@3.3.6: - dependencies: - yallist: 4.0.0 - optional: true - - minipass@5.0.0: - optional: true - minipass@7.1.2: {} - minizlib@2.1.2: - dependencies: - minipass: 3.3.6 - yallist: 4.0.0 - optional: true - minizlib@3.0.1: dependencies: minipass: 7.1.2 @@ -8780,9 +8554,6 @@ snapshots: dependencies: minimist: 1.2.8 - mkdirp@1.0.4: - optional: true - mkdirp@3.0.1: {} ml-array-mean@1.1.6: @@ -8875,9 +8646,6 @@ snapshots: mustache@4.2.0: {} - nan@2.22.0: - optional: true - natural-compare@1.4.0: {} natural@7.0.7(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3): @@ -8924,8 +8692,7 @@ snapshots: node-domexception@1.0.0: {} - node-ensure@0.0.0: - optional: true + node-ensure@0.0.0: {} node-fetch@2.7.0: dependencies: @@ -8959,11 +8726,6 @@ snapshots: touch: 3.1.1 undefsafe: 2.0.5 - nopt@5.0.0: - dependencies: - abbrev: 1.1.1 - optional: true - nopt@7.2.1: dependencies: abbrev: 2.0.0 @@ -8976,14 +8738,6 @@ snapshots: dependencies: path-key: 3.1.1 - npmlog@5.0.1: - dependencies: - are-we-there-yet: 2.0.0 - console-control-strings: 1.1.0 - gauge: 3.0.2 - set-blocking: 2.0.0 - optional: true - nth-check@2.1.1: dependencies: boolbase: 1.0.0 @@ -9082,7 +8836,7 @@ snapshots: dependencies: '@tootallnate/quickjs-emscripten': 0.23.0 agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 get-uri: 6.0.3 http-proxy-agent: 7.0.2 https-proxy-agent: 7.0.5 @@ -9155,7 +8909,6 @@ snapshots: node-ensure: 0.0.0 transitivePeerDependencies: - supports-color - optional: true peberminta@0.9.0: {} @@ -9278,7 +9031,7 @@ snapshots: proxy-agent@6.4.0: dependencies: agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 http-proxy-agent: 7.0.2 https-proxy-agent: 7.0.5 lru-cache: 7.18.3 @@ -9457,11 +9210,6 @@ snapshots: retry@0.13.1: {} - rimraf@3.0.2: - dependencies: - glob: 7.2.3 - optional: true - rimraf@5.0.7: dependencies: glob: 10.4.2 @@ -9537,9 +9285,6 @@ snapshots: transitivePeerDependencies: - supports-color - set-blocking@2.0.0: - optional: true - set-function-length@1.2.2: dependencies: define-data-property: 1.1.4 @@ -9576,16 +9321,6 @@ snapshots: signal-exit@4.1.0: {} - simple-concat@1.0.1: - optional: true - - simple-get@3.1.1: - dependencies: - decompress-response: 4.2.1 - once: 1.4.0 - simple-concat: 1.0.1 - optional: true - simple-swizzle@0.2.2: dependencies: is-arrayish: 0.3.2 @@ -9603,7 +9338,7 @@ snapshots: socks-proxy-agent@8.0.4: dependencies: agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 socks: 2.8.3 transitivePeerDependencies: - supports-color @@ -9759,16 +9494,6 @@ snapshots: fast-fifo: 1.3.2 streamx: 2.18.0 - tar@6.2.1: - dependencies: - chownr: 2.0.0 - fs-minipass: 2.1.0 - minipass: 5.0.0 - minizlib: 2.1.2 - mkdirp: 1.0.4 - yallist: 4.0.0 - optional: true - tar@7.2.0: dependencies: '@isaacs/fs-minipass': 4.0.1 @@ -9901,13 +9626,6 @@ snapshots: universalify@2.0.1: {} - unpdf@0.12.1: - optionalDependencies: - canvas: 2.11.2 - transitivePeerDependencies: - - encoding - - supports-color - unpipe@1.0.0: {} unstructured-client@0.11.3(zod@3.23.8): @@ -9980,11 +9698,6 @@ snapshots: dependencies: isexe: 2.0.0 - wide-align@1.1.5: - dependencies: - string-width: 4.2.3 - optional: true - winston-transport@4.8.0: dependencies: logform: 2.6.1 diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index c3baa6944..0983e4b16 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -6,7 +6,7 @@ import { robustFetch } from "../../lib/fetch"; import { z } from "zod"; import * as Sentry from "@sentry/node"; import escapeHtml from "escape-html"; -import pdf2md from "@opendocsg/pdf2md"; +import PdfParse from "pdf-parse"; import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile"; import { RemoveFeatureError } from "../../error"; @@ -113,11 +113,10 @@ async function scrapePDFWithParsePDF( meta: Meta, tempFilePath: string, ): Promise { - meta.logger.debug("Processing PDF document with pdf2md", { tempFilePath }); + meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath }); - const pdfBuffer = await fs.readFile(tempFilePath); - const markdown = await pdf2md(pdfBuffer); - const escaped = escapeHtml(markdown); + const result = await PdfParse(await fs.readFile(tempFilePath)); + const escaped = escapeHtml(result.text); return { markdown: escaped, @@ -142,7 +141,7 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom let result: PDFProcessorResult | null = null; - // First, try parsing with pdf2md + // First, try parsing with PdfParse result = await scrapePDFWithParsePDF( { ...meta, @@ -170,14 +169,14 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom result = llamaResult; // Use LlamaParse result if successful } catch (error) { if (error instanceof Error && error.message === "LlamaParse timed out") { - meta.logger.warn("LlamaParse timed out -- using pdf2md result", { + meta.logger.warn("LlamaParse timed out -- using parse-pdf result", { error, }); } else if (error instanceof RemoveFeatureError) { throw error; } else { meta.logger.warn( - "LlamaParse failed to parse PDF -- using pdf2md result", + "LlamaParse failed to parse PDF -- using parse-pdf result", { error }, ); Sentry.captureException(error); @@ -194,4 +193,4 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom html: result.html, markdown: result.markdown, }; -} +} \ No newline at end of file diff --git a/apps/api/tsconfig.json b/apps/api/tsconfig.json index 29093be6f..ab2a9546c 100644 --- a/apps/api/tsconfig.json +++ b/apps/api/tsconfig.json @@ -3,6 +3,7 @@ "rootDir": "./src", "lib": ["ES2022", "DOM"], + // or higher "target": "ES2022", @@ -18,7 +19,7 @@ "*": ["node_modules/*", "src/types/*"], }, - "inlineSources": true + "inlineSources": true, }, "include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"] }