CodSpeedHQ · coco-speed · Jul 28, 2024 · Jul 31, 2024 · Aug 2, 2024 · Aug 2, 2024
diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml
@@ -57,12 +57,12 @@ jobs:
  runs-on: ubuntu-20.04
  strategy:
  matrix:
- python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
+ python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13-dev"]
  use-crypto-lib: ["cryptography"]
  include:
- - python-version: "3.7"
+ - python-version: "3.8"
  use-crypto-lib: "pycryptodome"
- - python-version: "3.7"
+ - python-version: "3.8"
  use-crypto-lib: "none"
  steps:
  - name: Update APT packages
@@ -83,14 +83,14 @@ jobs:
  key: cache-downloaded-files
  - name: Setup Python
  uses: actions/setup-python@v5
- if: matrix.python-version == '3.7' || matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
+ if: matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
  with:
  python-version: ${{ matrix.python-version }}
  cache: 'pip'
  cache-dependency-path: '**/requirements/ci.txt'
  - name: Setup Python (3.11+)
  uses: actions/setup-python@v5
- if: matrix.python-version == '3.11' || matrix.python-version == '3.12'
+ if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev'
  with:
  python-version: ${{ matrix.python-version }}
  allow-prereleases: true
@@ -102,11 +102,11 @@ jobs:
  - name: Install requirements (Python 3)
  run: |
  pip install -r requirements/ci.txt
- if: matrix.python-version == '3.7' || matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
+ if: matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
  - name: Install requirements (Python 3.11+)
  run: |
  pip install -r requirements/ci-3.11.txt
- if: matrix.python-version == '3.11' || matrix.python-version == '3.12'
+ if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev'
  - name: Remove pycryptodome and cryptography
  run: |
  pip uninstall pycryptodome cryptography -y

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
@@ -12,6 +12,9 @@ on:
 permissions:
  contents: write
 
+env:
+ HEAD_COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
+
 jobs:
  build_and_publish:
  name: Publish a new version
@@ -24,15 +27,15 @@ jobs:
  - name: Extract version from commit message
  id: extract_version
  run: |
- VERSION=$(echo "${{ github.event.head_commit.message }}" | grep -oP '(?<=REL: )\d+\.\d+\.\d+')
+ VERSION=$(echo "$HEAD_COMMIT_MESSAGE" | grep -oP '(?<=REL: )\d+\.\d+\.\d+')
  echo "version=$VERSION" >> $GITHUB_OUTPUT
 
  - name: Extract tag message from commit message
  id: extract_message
  run: |
  VERSION="${{ steps.extract_version.outputs.version }}"
  delimiter="$(openssl rand -hex 8)"
- MESSAGE=$(echo "${{ github.event.head_commit.message }}" | sed "0,/REL: $VERSION/s///" )
+ MESSAGE=$(echo "$HEAD_COMMIT_MESSAGE" | sed "0,/REL: $VERSION/s///" )
  echo "message<<${delimiter}" >> $GITHUB_OUTPUT
  echo "$MESSAGE" >> $GITHUB_OUTPUT
  echo "${delimiter}" >> $GITHUB_OUTPUT

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
@@ -19,6 +19,7 @@ history and [GitHub's 'Contributors' feature](https://github.com/py-pdf/pypdf/gr
 * [ediamondscience](https://github.com/ediamondscience)
 * [Ermeson, Felipe](https://github.com/FelipeErmeson)
 * [Freitag, François](https://github.com/francoisfreitag)
+* [Gagnon, William G.](https://github.com/williamgagnon)
 * [Górny, Michał](https://github.com/mgorny)
 * [Grillo, Miguel](https://github.com/Ineffable22)
 * [Gutteridge, David H.](https://github.com/dhgutteridge)

diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -28,7 +28,6 @@
 # POSSIBILITY OF SUCH DAMAGE.
 
 import math
-import sys
 from decimal import Decimal
 from pathlib import Path
 from typing import (
@@ -38,6 +37,7 @@
  Iterable,
  Iterator,
  List,
+ Literal,
  Optional,
  Sequence,
  Set,
@@ -85,12 +85,6 @@
  StreamObject,
 )
 
-if sys.version_info >= (3, 8):
- from typing import Literal
-else:
- from typing_extensions import Literal
-
-
 MERGE_CROP_BOX = "cropbox" # pypdf<=3.4.0 used 'trimbox'
 
 

diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py
@@ -2,13 +2,7 @@
 
 from abc import abstractmethod
 from pathlib import Path
-from typing import IO, Any, Dict, List, Optional, Tuple, Union
-
-try:
- # Python 3.8+: https://peps.python.org/pep-0586
- from typing import Protocol
-except ImportError:
- from typing_extensions import Protocol # type: ignore[assignment]
+from typing import IO, Any, Dict, List, Optional, Protocol, Tuple, Union
 
 from ._utils import StrByteType, StreamType
 

diff --git a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
@@ -1,22 +1,16 @@
 """Extract PDF text preserving the layout of the source PDF"""
 
-import sys
 from itertools import groupby
 from math import ceil
 from pathlib import Path
-from typing import Any, Dict, Iterator, List, Optional, Tuple
+from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, TypedDict
 
 from ..._utils import logger_warning
 from .. import LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
 from ._font import Font
 from ._text_state_manager import TextStateManager
 from ._text_state_params import TextStateParams
 
-if sys.version_info >= (3, 8):
- from typing import Literal, TypedDict
-else:
- from typing_extensions import Literal, TypedDict
-
 
 class BTGroup(TypedDict):
  """

diff --git a/pypdf/_text_extraction/_layout_mode/_font.py b/pypdf/_text_extraction/_layout_mode/_font.py
@@ -1,8 +1,9 @@
 """Font constants and classes for "layout" mode text operations"""
 
 from dataclasses import dataclass, field
-from typing import Any, Dict, Sequence, Union
+from typing import Any, Dict, Sequence, Union, cast
 
+from ...errors import ParseError
 from ...generic import IndirectObject
 from ._font_widths import STANDARD_WIDTHS
 
@@ -58,6 +59,7 @@ def __post_init__(self) -> None:
  skip_count = 0
  _w = d_font.get("/W", [])
  for idx, w_entry in enumerate(_w):
+ w_entry = w_entry.get_object()
  if skip_count:
  skip_count -= 1
  continue
@@ -66,32 +68,38 @@ def __post_init__(self) -> None:
  # warning and or use reader's "strict" to force an ex???
  continue
  # check for format (1): `int [int int int int ...]`
- if isinstance(_w[idx + 1], Sequence):
- start_idx, width_list = _w[idx : idx + 2]
+ w_next_entry = _w[idx + 1].get_object()
+ if isinstance(w_next_entry, Sequence):
+ start_idx, width_list = w_entry, w_next_entry
  self.width_map.update(
  {
  ord_map[_cidx]: _width
  for _cidx, _width in zip(
- range(start_idx, start_idx + len(width_list), 1),
+ range(cast(int, start_idx), cast(int, start_idx) + len(width_list), 1),
  width_list,
  )
  if _cidx in ord_map
  }
  )
  skip_count = 1
  # check for format (2): `int int int`
- if not isinstance(_w[idx + 1], Sequence) and not isinstance(
- _w[idx + 2], Sequence
- ):
- start_idx, stop_idx, const_width = _w[idx : idx + 3]
+ elif isinstance(w_next_entry, (int, float)) and isinstance(_w[idx + 2].get_object(), (int, float)):
+ start_idx, stop_idx, const_width = w_entry, w_next_entry, _w[idx + 2].get_object()
  self.width_map.update(
  {
  ord_map[_cidx]: const_width
- for _cidx in range(start_idx, stop_idx + 1, 1)
+ for _cidx in range(cast(int, start_idx), cast(int, stop_idx + 1), 1)
  if _cidx in ord_map
  }
  )
  skip_count = 2
+ else:
+ # Note: running out of width entries (the array ends while more elements are still expected)
+ # is not handled explicitly here; the lookup then raises an IndexError, which is sufficient.
+ raise ParseError(
+ f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}"
+ ) # pragma: no cover
+
  if not self.width_map and "/BaseFont" in self.font_dictionary:
  for key in STANDARD_WIDTHS:
  if self.font_dictionary["/BaseFont"].startswith(f"/{key}"):

diff --git a/pypdf/_utils.py b/pypdf/_utils.py
@@ -347,14 +347,11 @@ def b_(s: Union[str, bytes]) -> bytes:
  return bc[s]
  try:
  r = s.encode("latin-1")
- if len(s) < 2:
- bc[s] = r
- return r
- except Exception:
+ except UnicodeEncodeError:
  r = s.encode("utf-8")
-  if len(s) < 2:
-  bc[s] = r
-  return r
+ if len(s) < 2:
+ bc[s] = r
+ return r
 
 
 def str_(b: Any) -> str:
@@ -390,20 +387,6 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]:
 WHITESPACES_AS_REGEXP = b"[" + WHITESPACES_AS_BYTES + b"]"
 
 
-def paeth_predictor(left: int, up: int, up_left: int) -> int:
- p = left + up - up_left
- dist_left = abs(p - left)
- dist_up = abs(p - up)
- dist_up_left = abs(p - up_left)
-
- if dist_left <= dist_up and dist_left <= dist_up_left:
- return left
- elif dist_up <= dist_up_left:
- return up
- else:
- return up_left
-
-
 def deprecate(msg: str, stacklevel: int = 3) -> None:
  warnings.warn(msg, DeprecationWarning, stacklevel=stacklevel)
 

diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py
@@ -2,11 +2,11 @@
 
 import sys
 from io import BytesIO
-from typing import Any, List, Tuple, Union, cast
+from typing import Any, List, Literal, Tuple, Union, cast
 
 from ._utils import check_if_whitespace_only, logger_warning
 from .constants import ColorSpaces
-from .errors import PdfReadError
+from .errors import EmptyImageDataError, PdfReadError
 from .generic import (
  ArrayObject,
  DecodedStreamObject,
@@ -15,13 +15,6 @@
  NullObject,
 )
 
-if sys.version_info[:2] >= (3, 8):
- from typing import Literal
-else:
- # PEP 586 introduced typing.Literal with Python 3.8
- # For older Python versions, the backport typing_extensions is necessary:
- from typing_extensions import Literal
-
 if sys.version_info[:2] >= (3, 10):
  from typing import TypeAlias
 else:
@@ -148,9 +141,14 @@ def _extended_image_frombytes(
  img = Image.frombytes(mode, size, data)
  except ValueError as exc:
  nb_pix = size[0] * size[1]
- if len(data) % nb_pix != 0:
+ data_length = len(data)
+ if data_length == 0:
+ raise EmptyImageDataError(
+ "Data is 0 bytes, cannot process an image from empty data."
+ ) from exc
+ if data_length % nb_pix != 0:
  raise exc
- k = nb_pix * len(mode) / len(data)
+ k = nb_pix * len(mode) / data_length
  data = b"".join([bytes((x,) * int(k)) for x in data])
  img = Image.frombytes(mode, size, data)
  return img

diff --git a/pypdf/annotations/_markup_annotations.py b/pypdf/annotations/_markup_annotations.py
@@ -104,9 +104,9 @@ def __init__(
  self[NameObject("/Rect")] = RectangleObject(rect)
 
  font_str = "font: "
- if bold is True:
+ if bold:
  font_str = f"{font_str}bold "
- if italic is True:
+ if italic:
  font_str = f"{font_str}italic "
  font_str = f"{font_str}{font} {font_size}"
  font_str = f"{font_str};text-align:left;color:#{font_color}"

diff --git a/pypdf/annotations/_non_markup_annotations.py b/pypdf/annotations/_non_markup_annotations.py
@@ -1,6 +1,5 @@
 from typing import TYPE_CHECKING, Any, Optional, Tuple, Union
 
-from ..constants import AnnotationFlag
 from ..generic._base import (
  BooleanObject,
  NameObject,
@@ -12,8 +11,6 @@
 from ..generic._rectangle import RectangleObject
 from ._base import AnnotationDictionary
 
-DEFAULT_ANNOTATION_FLAG = AnnotationFlag(0)
-
 
 class Link(AnnotationDictionary):
  def __init__(

diff --git a/pypdf/errors.py b/pypdf/errors.py
@@ -59,4 +59,8 @@ class EmptyFileError(PdfReadError):
  """Raised when a PDF file is empty or has no content."""
 
 
+class EmptyImageDataError(PyPdfError):
+ """Raised when trying to process an image that has no data."""
+
+
 STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly"
diff --git a/pypdf/types.py b/pypdf/types.py
@@ -1,13 +1,7 @@
 """Helpers for working with PDF types."""
 
 import sys
-from typing import List, Union
-
-if sys.version_info[:2] >= (3, 8):
- # Python 3.8+: https://peps.python.org/pep-0586
- from typing import Literal
-else:
- from typing_extensions import Literal
+from typing import List, Literal, Union
 
 if sys.version_info[:2] >= (3, 10):
  # Python 3.10+: https://www.python.org/dev/peps/pep-0484

diff --git a/requirements/ci-3.11.txt b/requirements/ci-3.11.txt
@@ -6,7 +6,7 @@
 #
 attrs==23.1.0
  # via flake8-bugbear
-coverage[toml]==7.3.0
+coverage[toml]==7.6.0
  # via
  # -r requirements/ci.in
  # pytest-cov
@@ -35,7 +35,7 @@ mypy-extensions==1.0.0
  # via mypy
 packaging==23.1
  # via pytest
-pillow==10.0.1
+pillow==10.4.0
  # via
  # -r requirements/ci.in
  # fpdf2

diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
@@ -10,6 +10,7 @@
 
 from pypdf import PdfReader, mult
 from pypdf._text_extraction import set_custom_rtl
+from pypdf.errors import ParseError
 
 from . import get_data_from_url
 
@@ -156,3 +157,19 @@ def test_layout_mode_type0_font_widths():
  encoding="utf-8"
  )
  assert expected == reader.pages[0].extract_text(extraction_mode="layout")
+
+
+@pytest.mark.enable_socket()
+def test_layout_mode_indirect_sequence_font_widths():
+ # Cover the situation where the sequence for font widths is an IndirectObject
+ # ref https://github.com/py-pdf/pypdf/pull/2788
+ url = "https://github.com/user-attachments/files/16491621/2788_example.pdf"
+ name ="2788_example.pdf"
+ reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+ assert reader.pages[0].extract_text(extraction_mode="layout") == ""
+ url = "https://github.com/user-attachments/files/16491619/2788_example_malformed.pdf"
+ name = "2788_example_malformed.pdf"
+ reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+ with pytest.raises(ParseError) as exc:
+ reader.pages[0].extract_text(extraction_mode="layout")
+ assert str(exc.value).startswith("Invalid font width definition")