py-pdf · MartinThoma · Jan 2, 2024 · Dec 28, 2023
diff --git a/docs/user/suppress-warnings.md b/docs/user/suppress-warnings.md
@@ -5,7 +5,8 @@ pypdf makes use of 3 mechanisms to show that something went wrong:
 * **Log messages** are informative messages that can be used for post-mortem
   analysis. Most of the time, users can ignore them. They come in different
   *levels*, such as info / warning / error indicating the severity.
-  Examples are non-standard compliant PDF files which pypdf can deal with.
+  Examples are standard-violating PDF files that pypdf can still handle, or a
+  missing implementation that causes part of the text not to be extracted.
 * **Warnings** are avoidable issues, such as using deprecated classes /
   functions / parameters. Another example is missing capabilities of pypdf.
   In those cases, pypdf users should adjust their code. Warnings

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
@@ -1,11 +1,9 @@
-import warnings
 from binascii import unhexlify
 from math import ceil
 from typing import Any, Dict, List, Tuple, Union, cast
 
 from ._codecs import adobe_glyphs, charset_encoding
-from ._utils import b_, logger_warning
-from .errors import PdfReadWarning
+from ._utils import b_, logger_error, logger_warning
 from .generic import (
     DecodedStreamObject,
     DictionaryObject,
@@ -180,18 +178,15 @@
             else:
                 raise Exception("not found")
         except Exception:
-            warnings.warn(
-                f"Advanced encoding {enc} not implemented yet",
-                PdfReadWarning,
-            )
+            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
             encoding = enc
     elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
         try:
             encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
         except Exception:
-            warnings.warn(
+            logger_error(
                 f"Advanced encoding {encoding} not implemented yet",
-                PdfReadWarning,
+                __name__,
             )
             encoding = charset_encoding["/StandardCoding"].copy()
     else:

diff --git a/pypdf/_utils.py b/pypdf/_utils.py
@@ -446,6 +446,18 @@ def deprecation_no_replacement(name: str, removed_in: str) -> None:
     deprecation(DEPR_MSG_NO_REPLACEMENT_HAPPENED.format(name, removed_in))
 
 
+def logger_error(msg: str, src: str) -> None:
+    """
+    Use this instead of logger.error directly.
+
+    That allows people to overwrite it more easily.
+
+    See the docs on when to use which:
+    https://pypdf.readthedocs.io/en/latest/user/suppress-warnings.html
+    """
+    logging.getLogger(src).error(msg)
+
+
 def logger_warning(msg: str, src: str) -> None:
     """
     Use this instead of logger.warning directly.

diff --git a/tests/test_cmap.py b/tests/test_cmap.py
@@ -5,7 +5,6 @@
 
 from pypdf import PdfReader
 from pypdf._cmap import build_char_map
-from pypdf.errors import PdfReadWarning
 
 from . import get_data_from_url
 
@@ -85,11 +84,11 @@ def test_text_extraction_fast(caplog, url: str, name: str, strict: bool):
 
 
 @pytest.mark.enable_socket()
-def test_parse_encoding_advanced_encoding_not_implemented():
+def test_parse_encoding_advanced_encoding_not_implemented(caplog):
     reader = PdfReader(BytesIO(get_data_from_url(name="tika-957144.pdf")))
-    with pytest.warns(PdfReadWarning, match="Advanced encoding .* not implemented yet"):
-        for page in reader.pages:
-            page.extract_text()
+    for page in reader.pages:
+        page.extract_text()
+    assert "Advanced encoding /WinAnsEncoding not implemented yet" in caplog.text
 
 
 @pytest.mark.enable_socket()