CodSpeedHQ · coco-speed · Jul 28, 2024 · Jul 31, 2024 · Aug 2, 2024 · Aug 2, 2024
diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml
@@ -57,12 +57,12 @@ jobs:
  runs-on: ubuntu-20.04
  strategy:
  matrix:
- python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
+ python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13-dev"]
  use-crypto-lib: ["cryptography"]
  include:
- - python-version: "3.7"
+ - python-version: "3.8"
  use-crypto-lib: "pycryptodome"
- - python-version: "3.7"
+ - python-version: "3.8"
  use-crypto-lib: "none"
  steps:
  - name: Update APT packages
@@ -83,14 +83,14 @@ jobs:
  key: cache-downloaded-files
  - name: Setup Python
  uses: actions/setup-python@v5
- if: matrix.python-version == '3.7' || matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
+ if: matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
  with:
  python-version: ${{ matrix.python-version }}
  cache: 'pip'
  cache-dependency-path: '**/requirements/ci.txt'
  - name: Setup Python (3.11+)
  uses: actions/setup-python@v5
- if: matrix.python-version == '3.11' || matrix.python-version == '3.12'
+ if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev'
  with:
  python-version: ${{ matrix.python-version }}
  allow-prereleases: true
@@ -102,11 +102,11 @@ jobs:
  - name: Install requirements (Python 3)
  run: |
  pip install -r requirements/ci.txt
- if: matrix.python-version == '3.7' || matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
+ if: matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
  - name: Install requirements (Python 3.11+)
  run: |
  pip install -r requirements/ci-3.11.txt
- if: matrix.python-version == '3.11' || matrix.python-version == '3.12'
+ if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev'
  - name: Remove pycryptodome and cryptography
  run: |
  pip uninstall pycryptodome cryptography -y

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
@@ -12,6 +12,9 @@ on:
 permissions:
  contents: write
 
+env:
+ HEAD_COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
+
 jobs:
  build_and_publish:
  name: Publish a new version
@@ -24,15 +27,15 @@ jobs:
  - name: Extract version from commit message
  id: extract_version
  run: |
- VERSION=$(echo "${{ github.event.head_commit.message }}" | grep -oP '(?<=REL: )\d+\.\d+\.\d+')
+ VERSION=$(echo "$HEAD_COMMIT_MESSAGE" | grep -oP '(?<=REL: )\d+\.\d+\.\d+')
  echo "version=$VERSION" >> $GITHUB_OUTPUT
 
  - name: Extract tag message from commit message
  id: extract_message
  run: |
  VERSION="${{ steps.extract_version.outputs.version }}"
  delimiter="$(openssl rand -hex 8)"
- MESSAGE=$(echo "${{ github.event.head_commit.message }}" | sed "0,/REL: $VERSION/s///" )
+ MESSAGE=$(echo "$HEAD_COMMIT_MESSAGE" | sed "0,/REL: $VERSION/s///" )
  echo "message<<${delimiter}" >> $GITHUB_OUTPUT
  echo "$MESSAGE" >> $GITHUB_OUTPUT
  echo "${delimiter}" >> $GITHUB_OUTPUT

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
@@ -19,6 +19,7 @@ history and [GitHub's 'Contributors' feature](https://github.com/py-pdf/pypdf/gr
 * [ediamondscience](https://github.com/ediamondscience)
 * [Ermeson, Felipe](https://github.com/FelipeErmeson)
 * [Freitag, François](https://github.com/francoisfreitag)
+* [Gagnon, William G.](https://github.com/williamgagnon)
 * [Górny, Michał](https://github.com/mgorny)
 * [Grillo, Miguel](https://github.com/Ineffable22)
 * [Gutteridge, David H.](https://github.com/dhgutteridge)

diff --git a/docs/user/file-size.md b/docs/user/file-size.md
@@ -9,23 +9,17 @@ Some PDF documents contain the same object multiple times. For example, if an
 image appears three times in a PDF it could be embedded three times. Or it can
 be embedded once and referenced twice.
 
-This can be done by reading and writing the file:
+When adding data to a PdfWriter, the data is copied while respecting the original format.
+For example, if two pages include the same image which is duplicated in the source document, the object will be duplicated in the PdfWriter object.
 
-```python
-from pypdf import PdfReader, PdfWriter
-
-reader = PdfReader("big-old-file.pdf")
-writer = PdfWriter()
+Additionally, when you delete objects in a document, pypdf cannot easily identify whether the objects are used elsewhere or whether the user wants to keep them. When writing the PDF file, these objects remain hidden within it (part of the file, but not displayed).
 
-for page in reader.pages:
- writer.add_page(page)
+In order to reduce the file size, use a compression call: `writer.compress_identical_objects(remove_identicals=True, remove_orphans=True)`
 
-if reader.metadata is not None:
- writer.add_metadata(reader.metadata)
+* `remove_identicals` enables/disables compression by merging identical objects.
+* `remove_orphans` enables/disables removal of unused objects.
 
-with open("smaller-new-file.pdf", "wb") as fp:
- writer.write(fp)
-```
+It is recommended to apply this process just before writing to the file/stream.
 
 It depends on the PDF how well this works, but we have seen an 86% file
 reduction (from 5.7 MB to 0.8 MB) within a real PDF.

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
@@ -3,11 +3,10 @@
 from typing import Any, Dict, List, Tuple, Union, cast
 
 from ._codecs import adobe_glyphs, charset_encoding
-from ._utils import b_, logger_error, logger_warning
+from ._utils import logger_error, logger_warning
 from .generic import (
  DecodedStreamObject,
  DictionaryObject,
- IndirectObject,
  NullObject,
  StreamObject,
 )
@@ -258,8 +257,8 @@ def prepare_cm(ft: DictionaryObject) -> bytes:
  tu = ft["/ToUnicode"]
  cm: bytes
  if isinstance(tu, StreamObject):
- cm = b_(cast(DecodedStreamObject, ft["/ToUnicode"]).get_data())
- elif isinstance(tu, str) and tu.startswith("/Identity"):
+ cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
+ else: # if (tu is None) or cast(str, tu).startswith("/Identity"):
  # the full range 0000-FFFF will be processed
  cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
  if isinstance(cm, str):
@@ -448,34 +447,27 @@ def compute_space_width(
  en: int = cast(int, ft["/LastChar"])
  if st > space_code or en < space_code:
  raise Exception("Not in range")
- if w[space_code - st] == 0:
+ if w[space_code - st].get_object() == 0:
  raise Exception("null width")
- sp_width = w[space_code - st]
+ sp_width = w[space_code - st].get_object()
  except Exception:
  if "/FontDescriptor" in ft and "/MissingWidth" in cast(
  DictionaryObject, ft["/FontDescriptor"]
  ):
- sp_width = ft["/FontDescriptor"]["/MissingWidth"] # type: ignore
+ sp_width = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore
  else:
  # will consider width of char as avg(width)/2
  m = 0
  cpt = 0
- for x in w:
- if x > 0:
- m += x
+ for xx in w:
+ xx = xx.get_object()
+ if xx > 0:
+ m += xx
  cpt += 1
  sp_width = m / max(1, cpt) / 2
 
- if isinstance(sp_width, IndirectObject):
- # According to
- # 'Table 122 - Entries common to all font descriptors (continued)'
- # the MissingWidth should be a number, but according to #2286 it can
- # be an indirect object
- obj = sp_width.get_object()
- if obj is None or isinstance(obj, NullObject):
- return 0.0
- return obj # type: ignore
-
+ if sp_width is None or isinstance(sp_width, NullObject):
+ sp_width = 0.0
  return sp_width
 
 

diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
@@ -49,7 +49,6 @@
 from ._page import PageObject, _VirtualList
 from ._page_labels import index2label as page_index2page_label
 from ._utils import (
- b_,
  deprecate_with_replacement,
  logger_warning,
  parse_iso8824_date,
@@ -1122,7 +1121,12 @@ def _flatten(
  obj = page.get_object()
  if obj:
  # damaged file may have invalid child in /Pages
- self._flatten(obj, inherit, **addt)
+ try:
+ self._flatten(obj, inherit, **addt)
+ except RecursionError:
+ raise PdfReadError(
+ "Maximum recursion depth reached during page flattening."
+ )
  elif t == "/Page":
  for attr_in, value in list(inherit.items()):
  # if the page has it's own value, it does not inherit the
@@ -1258,7 +1262,7 @@ def xfa(self) -> Optional[Dict[str, Any]]:
  if isinstance(f, IndirectObject):
  field = cast(Optional[EncodedStreamObject], f.get_object())
  if field:
- es = zlib.decompress(b_(field._data))
+ es = zlib.decompress(field._data)
  retval[tag] = es
  return retval
 

diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py
@@ -43,7 +43,7 @@
  rc4_encrypt,
 )
 
-from ._utils import b_, logger_warning
+from ._utils import logger_warning
 from .generic import (
  ArrayObject,
  ByteStringObject,
@@ -78,7 +78,7 @@ def encrypt_object(self, obj: PdfObject) -> PdfObject:
  elif isinstance(obj, StreamObject):
  obj2 = StreamObject()
  obj2.update(obj)
- obj2.set_data(self.stm_crypt.encrypt(b_(obj._data)))
+ obj2.set_data(self.stm_crypt.encrypt(obj._data))
  for key, value in obj.items(): # Dont forget the Stream dict.
  obj2[key] = self.encrypt_object(value)
  obj = obj2
@@ -96,7 +96,7 @@ def decrypt_object(self, obj: PdfObject) -> PdfObject:
  data = self.str_crypt.decrypt(obj.original_bytes)
  obj = create_string_object(data)
  elif isinstance(obj, StreamObject):
- obj._data = self.stm_crypt.decrypt(b_(obj._data))
+ obj._data = self.stm_crypt.decrypt(obj._data)
  for key, value in obj.items(): # Dont forget the Stream dict.
  obj[key] = self.decrypt_object(value)
  elif isinstance(obj, DictionaryObject):

diff --git a/pypdf/_merger.py b/pypdf/_merger.py
@@ -46,7 +46,6 @@
 from ._utils import (
  StrByteType,
  deprecate_with_replacement,
- str_,
 )
 from ._writer import PdfWriter
 from .constants import GoToActionArguments, TypArguments, TypFitArguments
@@ -82,6 +81,15 @@ def __init__(self, pagedata: PageObject, src: PdfReader, id: int) -> None:
  self.id = id
 
 
+# transferred from _utils, as this function is only required here
+# and merger will soon be deprecated
+def str_(b: Any) -> str: # pragma: no cover
+ if isinstance(b, bytes):
+ return b.decode("latin-1")
+ else:
+ return str(b) # will return b.__str__() if defined
+
+
 class PdfMerger:
  """
  Use :class:`PdfWriter` instead.

diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -28,7 +28,6 @@
 # POSSIBILITY OF SUCH DAMAGE.
 
 import math
-import sys
 from decimal import Decimal
 from pathlib import Path
 from typing import (
@@ -38,6 +37,7 @@
  Iterable,
  Iterator,
  List,
+ Literal,
  Optional,
  Sequence,
  Set,
@@ -85,12 +85,6 @@
  StreamObject,
 )
 
-if sys.version_info >= (3, 8):
- from typing import Literal
-else:
- from typing_extensions import Literal
-
-
 MERGE_CROP_BOX = "cropbox" # pypdf<=3.4.0 used 'trimbox'
 
 
@@ -852,7 +846,7 @@ def _add_transformation_matrix(
  FloatObject(e),
  FloatObject(f),
  ],
- " cm",
+ b"cm",
  ],
  )
  return contents
@@ -870,7 +864,7 @@ def _get_contents_as_bytes(self) -> Optional[bytes]:
  if isinstance(obj, list):
  return b"".join(x.get_object().get_data() for x in obj)
  else:
- return cast(bytes, cast(EncodedStreamObject, obj).get_data())
+ return cast(EncodedStreamObject, obj).get_data()
  else:
  return None
 
@@ -1063,11 +1057,11 @@ def _merge_page(
  rect.height,
  ],
  ),
- "re",
+ b"re",
  ),
  )
- page2content.operations.insert(1, ([], "W"))
- page2content.operations.insert(2, ([], "n"))
+ page2content.operations.insert(1, ([], b"W"))
+ page2content.operations.insert(2, ([], b"n"))
  if page2transformation is not None:
  page2content = page2transformation(page2content)
  page2content = PageObject._content_stream_rename(
@@ -1201,11 +1195,11 @@ def _merge_page_writer(
  rect.height,
  ],
  ),
- "re",
+ b"re",
  ),
  )
- page2content.operations.insert(1, ([], "W"))
- page2content.operations.insert(2, ([], "n"))
+ page2content.operations.insert(1, ([], b"W"))
+ page2content.operations.insert(2, ([], b"n"))
  if page2transformation is not None:
  page2content = page2transformation(page2content)
  page2content = PageObject._content_stream_rename(

diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py
@@ -2,13 +2,7 @@
 
 from abc import abstractmethod
 from pathlib import Path
-from typing import IO, Any, Dict, List, Optional, Tuple, Union
-
-try:
- # Python 3.8+: https://peps.python.org/pep-0586
- from typing import Protocol
-except ImportError:
- from typing_extensions import Protocol # type: ignore[assignment]
+from typing import IO, Any, Dict, List, Optional, Protocol, Tuple, Union
 
 from ._utils import StrByteType, StreamType
 

diff --git a/pypdf/_reader.py b/pypdf/_reader.py
@@ -51,7 +51,6 @@
 from ._utils import (
  StrByteType,
  StreamType,
- b_,
  logger_warning,
  read_non_whitespace,
  read_previous_line,
@@ -328,7 +327,7 @@ def _get_object_from_stream(
  assert cast(str, obj_stm["/Type"]) == "/ObjStm"
  # /N is the number of indirect objects in the stream
  assert idx < obj_stm["/N"]
- stream_data = BytesIO(b_(obj_stm.get_data()))
+ stream_data = BytesIO(obj_stm.get_data())
  for i in range(obj_stm["/N"]): # type: ignore
  read_non_whitespace(stream_data)
  stream_data.seek(-1, 1)
@@ -542,7 +541,10 @@ def read_object_header(self, stream: StreamType) -> Tuple[int, int]:
  def cache_get_indirect_object(
  self, generation: int, idnum: int
  ) -> Optional[PdfObject]:
- return self.resolved_objects.get((generation, idnum))
+ try:
+ return self.resolved_objects.get((generation, idnum))
+ except RecursionError:
+ raise PdfReadError("Maximum recursion depth reached.")
 
  def cache_indirect_object(
  self, generation: int, idnum: int, obj: Optional[PdfObject]
@@ -932,7 +934,7 @@ def _read_pdf15_xref_stream(
  xrefstream = cast(ContentStream, read_object(stream, self))
  assert cast(str, xrefstream["/Type"]) == "/XRef"
  self.cache_indirect_object(generation, idnum, xrefstream)
- stream_data = BytesIO(b_(xrefstream.get_data()))
+ stream_data = BytesIO(xrefstream.get_data())
  # Index pairs specify the subsections in the dictionary. If
  # none create one subsection that spans everything.
  idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])