Skip to content

Commit

Permalink
Add headers for accessing PDF file URLs (langchain-ai#10370)
Browse files Browse the repository at this point in the history
- Description: Add an optional `headers` parameter to the PDF loaders; the headers are sent with the GET request that downloads a PDF from a web URL
  - Tag maintainer: @hwchase17 

✅ make format, make lint, make test

---------

Co-authored-by: Bagatur <[email protected]>
Co-authored-by: Eugene Yurtsev <[email protected]>
Co-authored-by: Bagatur <[email protected]>
  • Loading branch information
4 people authored Sep 13, 2023
1 parent a345105 commit 2dc3c64
Showing 1 changed file with 40 additions and 28 deletions.
68 changes: 40 additions & 28 deletions libs/langchain/langchain/document_loaders/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from abc import ABC
from io import StringIO
from pathlib import Path
from typing import Any, Iterator, List, Mapping, Optional, Sequence, Union
from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Union
from urllib.parse import urlparse

import requests
Expand Down Expand Up @@ -62,14 +62,20 @@ def _get_elements(self) -> List:
class BasePDFLoader(BaseLoader, ABC):
"""Base Loader class for `PDF` files.
Defaults to check for local file, but if the file is a web path, it will download it
to a temporary file, use it, then clean up the temporary file after completion
If the file is a web path, it will download it to a temporary file, use it, then
clean up the temporary file after completion.
"""

def __init__(self, file_path: str):
"""Initialize with a file path."""
def __init__(self, file_path: str, *, headers: Optional[Dict] = None):
"""Initialize with a file path.
Args:
file_path: Either a local, S3 or web path to a PDF file.
headers: Headers to use for GET request to download a file from a web path.
"""
self.file_path = file_path
self.web_path = None
self.headers = headers
if "~" in self.file_path:
self.file_path = os.path.expanduser(self.file_path)

Expand All @@ -78,18 +84,15 @@ def __init__(self, file_path: str):
self.temp_dir = tempfile.TemporaryDirectory()
_, suffix = os.path.splitext(self.file_path)
temp_pdf = os.path.join(self.temp_dir.name, f"tmp{suffix}")
if self._is_s3_url(self.file_path):
self.web_path = self.file_path
else:
r = requests.get(self.file_path)

self.web_path = self.file_path
if not self._is_s3_url(self.file_path):
r = requests.get(self.file_path, headers=self.headers)
if r.status_code != 200:
raise ValueError(
"Check the url of your file; returned status code %s"
% r.status_code
)

self.web_path = self.file_path
with open(temp_pdf, mode="wb") as f:
f.write(r.content)
self.file_path = str(temp_pdf)
Expand Down Expand Up @@ -138,7 +141,10 @@ class PyPDFLoader(BasePDFLoader):
"""

def __init__(
self, file_path: str, password: Optional[Union[str, bytes]] = None
self,
file_path: str,
password: Optional[Union[str, bytes]] = None,
headers: Optional[Dict] = None,
) -> None:
"""Initialize with a file path."""
try:
Expand All @@ -148,7 +154,7 @@ def __init__(
"pypdf package not found, please install it with " "`pip install pypdf`"
)
self.parser = PyPDFParser(password=password)
super().__init__(file_path)
super().__init__(file_path, headers=headers)

def load(self) -> List[Document]:
"""Load given path as pages."""
Expand All @@ -165,9 +171,9 @@ def lazy_load(
class PyPDFium2Loader(BasePDFLoader):
"""Load `PDF` using `pypdfium2` and chunks at character level."""

def __init__(self, file_path: str):
def __init__(self, file_path: str, *, headers: Optional[Dict] = None):
"""Initialize with a file path."""
super().__init__(file_path)
super().__init__(file_path, headers=headers)
self.parser = PyPDFium2Parser()

def load(self) -> List[Document]:
Expand Down Expand Up @@ -230,7 +236,7 @@ def load(self) -> List[Document]:
class PDFMinerLoader(BasePDFLoader):
"""Load `PDF` files using `PDFMiner`."""

def __init__(self, file_path: str) -> None:
def __init__(self, file_path: str, *, headers: Optional[Dict] = None) -> None:
"""Initialize with file path."""
try:
from pdfminer.high_level import extract_text # noqa:F401
Expand All @@ -240,7 +246,7 @@ def __init__(self, file_path: str) -> None:
"`pip install pdfminer.six`"
)

super().__init__(file_path)
super().__init__(file_path, headers=headers)
self.parser = PDFMinerParser()

def load(self) -> List[Document]:
Expand All @@ -258,7 +264,7 @@ def lazy_load(
class PDFMinerPDFasHTMLLoader(BasePDFLoader):
"""Load `PDF` files as HTML content using `PDFMiner`."""

def __init__(self, file_path: str):
def __init__(self, file_path: str, *, headers: Optional[Dict] = None):
"""Initialize with a file path."""
try:
from pdfminer.high_level import extract_text_to_fp # noqa:F401
Expand All @@ -268,7 +274,7 @@ def __init__(self, file_path: str):
"`pip install pdfminer.six`"
)

super().__init__(file_path)
super().__init__(file_path, headers=headers)

def load(self) -> List[Document]:
"""Load file."""
Expand All @@ -292,7 +298,7 @@ def load(self) -> List[Document]:
class PyMuPDFLoader(BasePDFLoader):
"""Load `PDF` files using `PyMuPDF`."""

def __init__(self, file_path: str) -> None:
def __init__(self, file_path: str, *, headers: Optional[Dict] = None) -> None:
"""Initialize with a file path."""
try:
import fitz # noqa:F401
Expand All @@ -302,7 +308,7 @@ def __init__(self, file_path: str) -> None:
"`pip install pymupdf`"
)

super().__init__(file_path)
super().__init__(file_path, headers=headers)

def load(self, **kwargs: Optional[Any]) -> List[Document]:
"""Load file."""
Expand Down Expand Up @@ -335,19 +341,19 @@ def __init__(
should_clean_pdf: a flag to clean the PDF file. Default is False.
**kwargs: additional keyword arguments.
"""
super().__init__(file_path)
self.mathpix_api_key = get_from_dict_or_env(
kwargs, "mathpix_api_key", "MATHPIX_API_KEY"
)
self.mathpix_api_id = get_from_dict_or_env(
kwargs, "mathpix_api_id", "MATHPIX_API_ID"
)
super().__init__(file_path, **kwargs)
self.processed_file_format = processed_file_format
self.max_wait_time_seconds = max_wait_time_seconds
self.should_clean_pdf = should_clean_pdf

@property
def headers(self) -> dict:
def _mathpix_headers(self) -> Dict[str, str]:
return {"app_id": self.mathpix_api_id, "app_key": self.mathpix_api_key}

@property
Expand All @@ -363,7 +369,7 @@ def send_pdf(self) -> str:
with open(self.file_path, "rb") as f:
files = {"file": f}
response = requests.post(
self.url, headers=self.headers, files=files, data=self.data
self.url, headers=self._mathpix_headers, files=files, data=self.data
)
response_data = response.json()
if "pdf_id" in response_data:
Expand Down Expand Up @@ -441,6 +447,7 @@ def __init__(
file_path: str,
text_kwargs: Optional[Mapping[str, Any]] = None,
dedupe: bool = False,
headers: Optional[Dict] = None,
) -> None:
"""Initialize with a file path."""
try:
Expand All @@ -451,7 +458,7 @@ def __init__(
"`pip install pdfplumber`"
)

super().__init__(file_path)
super().__init__(file_path, headers=headers)
self.text_kwargs = text_kwargs or {}
self.dedupe = dedupe

Expand Down Expand Up @@ -493,6 +500,7 @@ def __init__(
credentials_profile_name: Optional[str] = None,
region_name: Optional[str] = None,
endpoint_url: Optional[str] = None,
headers: Optional[Dict] = None,
) -> None:
"""Initialize the loader.
Expand All @@ -507,7 +515,7 @@ def __init__(
endpoint_url: endpoint url for the textract service (Optional)
"""
super().__init__(file_path)
super().__init__(file_path, headers=headers)

try:
import textractcaller as tc # noqa: F401
Expand Down Expand Up @@ -608,7 +616,11 @@ class DocumentIntelligenceLoader(BasePDFLoader):
"""Loads a PDF with Azure Document Intelligence"""

def __init__(
self, file_path: str, client: Any, model: str = "prebuilt-document"
self,
file_path: str,
client: Any,
model: str = "prebuilt-document",
headers: Optional[Dict] = None,
) -> None:
"""
Initialize the object for file processing with Azure Document Intelligence
Expand Down Expand Up @@ -638,7 +650,7 @@ def __init__(
"""

self.parser = DocumentIntelligenceParser(client=client, model=model)
super().__init__(file_path)
super().__init__(file_path, headers=headers)

def load(self) -> List[Document]:
"""Load given path as pages."""
Expand Down

0 comments on commit 2dc3c64

Please sign in to comment.