From 1a49d15e588d08122b48258ad13d0adfe40d23ed Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Tue, 28 Sep 2021 10:58:51 +0300
Subject: [PATCH 1/5] bpo-36819: Fix crashes in built-in encoders with weird
 error handlers

If the error handler returns a position less than or equal to the starting
position of the non-encodable characters, most of the built-in encoders
didn't properly resize the output buffer. This led to out-of-bounds writes
and segfaults.
---
 Lib/test/test_codeccallbacks.py               | 52 ++++++++++++++++
 .../2021-09-28-10-58-30.bpo-36819.cyV50C.rst  |  2 +
 Objects/stringlib/codecs.h                    | 15 ++++-
 Objects/unicodeobject.c                       | 60 ++++++++++++-------
 4 files changed, 106 insertions(+), 23 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-09-28-10-58-30.bpo-36819.cyV50C.rst

diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index 243f002c4ecadd..29be9405d23960 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -22,6 +22,18 @@ def handle(self, exc):
             self.pos = len(exc.object)
         return ("<?>", oldpos)
 
+class RepeatedPosReturn:
+    def __init__(self, repl="<?>"):
+        self.repl = repl
+        self.pos = 0
+        self.count = 0
+
+    def handle(self, exc):
+        if self.count > 0:
+            self.count -= 1
+            return (self.repl, self.pos)
+        return (self.repl, exc.end)
+
 # A UnicodeEncodeError object with a bad start attribute
 class BadStartUnicodeEncodeError(UnicodeEncodeError):
     def __init__(self):
@@ -940,6 +952,46 @@ def __getitem__(self, key):
             self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D())
             self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300})
 
+    def test_decodehelper_bug36819(self):
+        handler = RepeatedPosReturn("x")
+        codecs.register_error("test.bug36819", handler.handle)
+
+        testcases = [
+            ("ascii", b"abcd\xff"),
+            ("utf-8", b"abcd\xff"),
+            ("utf-16be", b'\x00a\x00b\x00c\x00d\xdc\xff'),
+            ("utf-32be", b'\x00\x00\x00a\x00\x00\x00b\x00\x00\x00c\x00\x00\x00d\x00\x00\xdc\xff'),
+            ("iso-8859-6", b"abcd\xff"),
+        ]
+        for enc, data in testcases:
+            with self.subTest(encoding=enc):
+                handler.count = 50
+                decoded = data.decode(enc, "test.bug36819")
+                self.assertEqual(decoded, 'abcdx' * 51)
+
+    def test_encodehelper_bug36819(self):
+        handler = RepeatedPosReturn("x")
+        codecs.register_error("test.bug36819", handler.handle)
+
+        string = "abcd\udcff"
+        encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"]  # built-in
+        encodings += ["iso-8859-15"]  # charmap codec
+        if sys.platform == 'win32':
+            encodings = ["mbcs", "oem"]  # code page codecs
+        for enc in encodings:
+            with self.subTest(encoding=enc):
+                # The interpreter should segfault after a handful of attempts.
+                # 50 was chosen to try to ensure a segfault without a fix,
+                # but not OOM a machine with one.
+                handler.count = 50
+                encoded = string.encode(enc, "test.bug36819")
+                self.assertEqual(encoded.decode(enc), "abcdx" * 51)
+        if sys.platform == "win32":
+            handler.count = 50
+            encoded = codecs.code_page_encode(437, string, "test.bug36819")
+            self.assertEqual(encoded[0].decode(), "abcdx" * 51)
+            self.assertEqual(encoded[1], len(string))
+
     def test_translatehelper(self):
         # enhance coverage of:
         # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-09-28-10-58-30.bpo-36819.cyV50C.rst b/Misc/NEWS.d/next/Core and Builtins/2021-09-28-10-58-30.bpo-36819.cyV50C.rst
new file mode 100644
index 00000000000000..32bb55a90e6c4d
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2021-09-28-10-58-30.bpo-36819.cyV50C.rst	
@@ -0,0 +1,2 @@
+Fix crashes in built-in encoders with error handlers that return a position
+less than or equal to the starting position of non-encodable characters.
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
index b17cda18f54b3e..958cc86147815d 100644
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -387,8 +387,19 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
                 if (!rep)
                     goto error;
 
-                /* subtract preallocated bytes */
-                writer->min_size -= max_char_size * (newpos - startpos);
+                if (newpos < startpos) {
+                    writer->overallocate = 1;
+                    p = _PyBytesWriter_Prepare(writer, p,
+                                               max_char_size * (startpos - newpos));
+                    if (p == NULL)
+                        goto error;
+                }
+                else {
+                    /* subtract preallocated bytes */
+                    writer->min_size -= max_char_size * (newpos - startpos);
+                    /* Only overallocate the buffer if it's not the last write */
+                    writer->overallocate = (newpos < size);
+                }
 
                 if (PyBytes_Check(rep)) {
                     p = _PyBytesWriter_WriteBytes(writer, p,
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 02bf56e681e56e..66994eb9dfdbd0 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -5906,7 +5906,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
 
     pos = 0;
     while (pos < len) {
-        Py_ssize_t repsize, moreunits;
+        Py_ssize_t newpos, repsize, moreunits;
 
         if (kind == PyUnicode_2BYTE_KIND) {
             pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
@@ -5923,7 +5923,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
         rep = unicode_encode_call_errorhandler(
                 errors, &errorHandler,
                 encoding, "surrogates not allowed",
-                str, &exc, pos, pos + 1, &pos);
+                str, &exc, pos, pos + 1, &newpos);
         if (!rep)
             goto error;
 
@@ -5931,7 +5931,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
             repsize = PyBytes_GET_SIZE(rep);
             if (repsize & 3) {
                 raise_encode_exception(&exc, encoding,
-                                       str, pos - 1, pos,
+                                       str, pos, pos + 1,
                                        "surrogates not allowed");
                 goto error;
             }
@@ -5944,28 +5944,30 @@ _PyUnicode_EncodeUTF32(PyObject *str,
             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
             if (!PyUnicode_IS_ASCII(rep)) {
                 raise_encode_exception(&exc, encoding,
-                                       str, pos - 1, pos,
+                                       str, pos, pos + 1,
                                        "surrogates not allowed");
                 goto error;
             }
         }
+        moreunits += pos - newpos;
+        pos = newpos;
 
         /* four bytes are reserved for each surrogate */
-        if (moreunits > 1) {
+        if (moreunits > 0) {
             Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
                 /* integer overflow */
                 PyErr_NoMemory();
                 goto error;
             }
-            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
+            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
                 goto error;
             out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
         }
 
         if (PyBytes_Check(rep)) {
             memcpy(out, PyBytes_AS_STRING(rep), repsize);
-            out += moreunits;
+            out += repsize / 4;
         } else /* rep is unicode */ {
             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
             ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
@@ -6243,7 +6245,7 @@ _PyUnicode_EncodeUTF16(PyObject *str,
 
     pos = 0;
     while (pos < len) {
-        Py_ssize_t repsize, moreunits;
+        Py_ssize_t newpos, repsize, moreunits;
 
         if (kind == PyUnicode_2BYTE_KIND) {
             pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
@@ -6260,7 +6262,7 @@ _PyUnicode_EncodeUTF16(PyObject *str,
         rep = unicode_encode_call_errorhandler(
                 errors, &errorHandler,
                 encoding, "surrogates not allowed",
-                str, &exc, pos, pos + 1, &pos);
+                str, &exc, pos, pos + 1, &newpos);
         if (!rep)
             goto error;
 
@@ -6268,7 +6270,7 @@ _PyUnicode_EncodeUTF16(PyObject *str,
             repsize = PyBytes_GET_SIZE(rep);
             if (repsize & 1) {
                 raise_encode_exception(&exc, encoding,
-                                       str, pos - 1, pos,
+                                       str, pos, pos + 1,
                                        "surrogates not allowed");
                 goto error;
             }
@@ -6281,28 +6283,30 @@ _PyUnicode_EncodeUTF16(PyObject *str,
             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
             if (!PyUnicode_IS_ASCII(rep)) {
                 raise_encode_exception(&exc, encoding,
-                                       str, pos - 1, pos,
+                                       str, pos, pos + 1,
                                        "surrogates not allowed");
                 goto error;
             }
         }
+        moreunits += pos - newpos;
+        pos = newpos;
 
         /* two bytes are reserved for each surrogate */
-        if (moreunits > 1) {
+        if (moreunits > 0) {
             Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
                 /* integer overflow */
                 PyErr_NoMemory();
                 goto error;
             }
-            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
+            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
                 goto error;
             out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
         }
 
         if (PyBytes_Check(rep)) {
             memcpy(out, PyBytes_AS_STRING(rep), repsize);
-            out += moreunits;
+            out += repsize / 2;
         } else /* rep is unicode */ {
             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
             ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
@@ -7137,8 +7141,19 @@ unicode_encode_ucs1(PyObject *unicode,
                 if (rep == NULL)
                     goto onError;
 
-                /* subtract preallocated bytes */
-                writer.min_size -= newpos - collstart;
+                if (newpos < collstart) {
+                    writer.overallocate = 1;
+                    str = _PyBytesWriter_Prepare(&writer, str,
+                                                 collstart - newpos);
+                    if (str == NULL)
+                        goto onError;
+                }
+                else {
+                    /* subtract preallocated bytes */
+                    writer.min_size -= newpos - collstart;
+                    /* Only overallocate the buffer if it's not the last write */
+                    writer.overallocate = (newpos < size);
+                }
 
                 if (PyBytes_Check(rep)) {
                     /* Directly copy bytes result to output. */
@@ -7914,13 +7929,14 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
                   pos, pos + 1, &newpos);
         if (rep == NULL)
             goto error;
-        pos = newpos;
 
+        Py_ssize_t morebytes = pos - newpos;
         if (PyBytes_Check(rep)) {
             outsize = PyBytes_GET_SIZE(rep);
-            if (outsize != 1) {
+            morebytes += outsize;
+            if (morebytes > 0) {
                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
-                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
+                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                     Py_DECREF(rep);
                     goto error;
@@ -7941,9 +7957,10 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
             }
 
             outsize = PyUnicode_GET_LENGTH(rep);
-            if (outsize != 1) {
+            morebytes += outsize;
+            if (morebytes > 0) {
                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
-                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
+                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                     Py_DECREF(rep);
                     goto error;
@@ -7966,6 +7983,7 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
                 out++;
             }
         }
+        pos = newpos;
         Py_DECREF(rep);
     }
     /* write a NUL byte */

From 6d8cb3085aa6f3cc9bea7fd3f2c5f3a667e45021 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Fri, 15 Oct 2021 19:13:46 +0300
Subject: [PATCH 2/5] Add tests for various replacement strings

---
 Lib/test/test_codeccallbacks.py | 111 +++++++++++++++++++++++++++-----
 1 file changed, 96 insertions(+), 15 deletions(-)

diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index 29be9405d23960..49c98704d418a8 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -1,5 +1,6 @@
 import codecs
 import html.entities
+import itertools
 import sys
 import unicodedata
 import unittest
@@ -795,20 +796,100 @@ def test_lookup(self):
             codecs.lookup_error("namereplace")
         )
 
-    def test_unencodablereplacement(self):
+    def test_encode_nonascii_replacement(self):
+        def handle(exc):
+            if isinstance(exc, UnicodeEncodeError):
+                return (repl, exc.end)
+            raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error("test.replacing", handle)
+
+        for enc, input, repl in (
+                ("ascii", "[¤]", "abc"),
+                ("iso-8859-1", "[€]", "½¾"),
+                ("iso-8859-15", "[¤]", "œŸ"),
+        ):
+            res = input.encode(enc, "test.replacing")
+            self.assertEqual(res, ("[" + repl + "]").encode(enc))
+
+        for enc, input, repl in (
+                ("utf-8", "[\udc80]", "\U0001f40d"),
+                ("utf-16", "[\udc80]", "\U0001f40d"),
+                ("utf-32", "[\udc80]", "\U0001f40d"),
+        ):
+            with self.assertRaises(UnicodeEncodeError) as cm:
+                input.encode(enc, "test.replacing")
+            exc = cm.exception
+            self.assertEqual(exc.start, 1)
+            self.assertEqual(exc.end, 2)
+            self.assertEqual(exc.object, input)
+
+    def test_encode_unencodable_replacement(self):
         def unencrepl(exc):
             if isinstance(exc, UnicodeEncodeError):
-                return ("\u4242", exc.end)
+                return (repl, exc.end)
             else:
                 raise TypeError("don't know how to handle %r" % exc)
         codecs.register_error("test.unencreplhandler", unencrepl)
-        for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
-            self.assertRaises(
-                UnicodeEncodeError,
-                "\u4242".encode,
-                enc,
-                "test.unencreplhandler"
-            )
+
+        for enc, input, repl in (
+                ("ascii", "[¤]", "½"),
+                ("iso-8859-1", "[€]", "œ"),
+                ("iso-8859-15", "[¤]", "½"),
+                ("utf-8", "[\udc80]", "\udcff"),
+                ("utf-16", "[\udc80]", "\udcff"),
+                ("utf-32", "[\udc80]", "\udcff"),
+        ):
+            with self.assertRaises(UnicodeEncodeError) as cm:
+                input.encode(enc, "test.unencreplhandler")
+            exc = cm.exception
+            self.assertEqual(exc.start, 1)
+            self.assertEqual(exc.end, 2)
+            self.assertEqual(exc.object, input)
+
+    def test_encode_bytes_replacement(self):
+        def handle(exc):
+            if isinstance(exc, UnicodeEncodeError):
+                return (repl, exc.end)
+            raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error("test.replacing", handle)
+
+        # It works even if the bytes sequence is not decodable.
+        for enc, input, repl in (
+                ("ascii", "[¤]", b"\xbd\xbe"),
+                ("iso-8859-1", "[€]", b"\xbd\xbe"),
+                ("iso-8859-15", "[¤]", b"\xbd\xbe"),
+                ("utf-8", "[\udc80]", b"\xbd\xbe"),
+                ("utf-16le", "[\udc80]", b"\xbd\xbe"),
+                ("utf-16be", "[\udc80]", b"\xbd\xbe"),
+                ("utf-32le", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
+                ("utf-32be", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
+        ):
+            res = input.encode(enc, "test.replacing")
+            self.assertEqual(res, "[".encode(enc) + repl + "]".encode(enc))
+
+    def test_encode_odd_bytes_replacement(self):
+        def handle(exc):
+            if isinstance(exc, UnicodeEncodeError):
+                return (repl, exc.end)
+            raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error("test.replacing", handle)
+
+        input = "[\udc80]"
+        # Tests in which the replacement bytestring does not contain a whole
+        # number of code units.
+        for enc, repl in (
+            *itertools.product(("utf-16le", "utf-16be"),
+                               [b"a", b"abc"]),
+            *itertools.product(("utf-32le", "utf-32be"),
+                               [b"a", b"ab", b"abc", b"abcde"]),
+        ):
+            with self.assertRaises(UnicodeEncodeError) as cm:
+                input.encode(enc, "test.replacing")
+            exc = cm.exception
+            self.assertEqual(exc.start, 1)
+            self.assertEqual(exc.end, 2)
+            self.assertEqual(exc.object, input)
+            self.assertEqual(exc.reason, "surrogates not allowed")
 
     def test_badregistercall(self):
         # enhance coverage of:
@@ -963,17 +1044,17 @@ def test_decodehelper_bug36819(self):
             ("utf-32be", b'\x00\x00\x00a\x00\x00\x00b\x00\x00\x00c\x00\x00\x00d\x00\x00\xdc\xff'),
             ("iso-8859-6", b"abcd\xff"),
         ]
-        for enc, data in testcases:
+        for enc, input in testcases:
             with self.subTest(encoding=enc):
                 handler.count = 50
-                decoded = data.decode(enc, "test.bug36819")
+                decoded = input.decode(enc, "test.bug36819")
                 self.assertEqual(decoded, 'abcdx' * 51)
 
     def test_encodehelper_bug36819(self):
         handler = RepeatedPosReturn("x")
         codecs.register_error("test.bug36819", handler.handle)
 
-        string = "abcd\udcff"
+        input = "abcd\udcff"
         encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"]  # built-in
         encodings += ["iso-8859-15"]  # charmap codec
         if sys.platform == 'win32':
@@ -984,13 +1065,13 @@ def test_encodehelper_bug36819(self):
                 # 50 was chosen to try to ensure a segfault without a fix,
                 # but not OOM a machine with one.
                 handler.count = 50
-                encoded = string.encode(enc, "test.bug36819")
+                encoded = input.encode(enc, "test.bug36819")
                 self.assertEqual(encoded.decode(enc), "abcdx" * 51)
         if sys.platform == "win32":
             handler.count = 50
-            encoded = codecs.code_page_encode(437, string, "test.bug36819")
+            encoded = codecs.code_page_encode(437, input, "test.bug36819")
             self.assertEqual(encoded[0].decode(), "abcdx" * 51)
-            self.assertEqual(encoded[1], len(string))
+            self.assertEqual(encoded[1], len(input))
 
     def test_translatehelper(self):
         # enhance coverage of:

From c13946745e7f23e3adb2381a12ffd8d7e5e51efa Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Fri, 15 Oct 2021 19:32:49 +0300
Subject: [PATCH 3/5] Refactor and more tests.

---
 Lib/test/test_codeccallbacks.py | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index 49c98704d418a8..81fd004830ec74 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -1038,13 +1038,14 @@ def test_decodehelper_bug36819(self):
         codecs.register_error("test.bug36819", handler.handle)
 
         testcases = [
-            ("ascii", b"abcd\xff"),
-            ("utf-8", b"abcd\xff"),
-            ("utf-16be", b'\x00a\x00b\x00c\x00d\xdc\xff'),
-            ("utf-32be", b'\x00\x00\x00a\x00\x00\x00b\x00\x00\x00c\x00\x00\x00d\x00\x00\xdc\xff'),
-            ("iso-8859-6", b"abcd\xff"),
+            ("ascii", b"\xff"),
+            ("utf-8", b"\xff"),
+            ("utf-16be", b'\xdc\x80'),
+            ("utf-32be", b'\x00\x00\xdc\x80'),
+            ("iso-8859-6", b"\xff"),
         ]
-        for enc, input in testcases:
+        for enc, bad in testcases:
+            input = "abcd".encode(enc) + bad
             with self.subTest(encoding=enc):
                 handler.count = 50
                 decoded = input.decode(enc, "test.bug36819")
@@ -1054,7 +1055,7 @@ def test_encodehelper_bug36819(self):
         handler = RepeatedPosReturn("x")
         codecs.register_error("test.bug36819", handler.handle)
 
-        input = "abcd\udcff"
+        input = "abcd\udc80"
         encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"]  # built-in
         encodings += ["iso-8859-15"]  # charmap codec
         if sys.platform == 'win32':
@@ -1073,6 +1074,18 @@ def test_encodehelper_bug36819(self):
             self.assertEqual(encoded[0].decode(), "abcdx" * 51)
             self.assertEqual(encoded[1], len(input))
 
+        handler.repl = "\udcff"
+        for enc in encodings:
+            with self.subTest(encoding=enc):
+                handler.count = 50
+                with self.assertRaises(UnicodeEncodeError) as cm:
+                    input.encode(enc, "test.bug36819")
+                exc = cm.exception
+                self.assertEqual(exc.start, 4)
+                self.assertEqual(exc.end, 5)
+                self.assertEqual(exc.object, input)
+
+
     def test_translatehelper(self):
         # enhance coverage of:
         # Objects/unicodeobject.c::unicode_encode_call_errorhandler()

From 7cc768d899d8f553ae9e2087efb122e03d0ff021 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Fri, 15 Oct 2021 19:35:43 +0300
Subject: [PATCH 4/5] Add more subTests.

---
 Lib/test/test_codeccallbacks.py | 46 ++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index 81fd004830ec74..be123eefc68f7c 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -816,12 +816,13 @@ def handle(exc):
                 ("utf-16", "[\udc80]", "\U0001f40d"),
                 ("utf-32", "[\udc80]", "\U0001f40d"),
         ):
-            with self.assertRaises(UnicodeEncodeError) as cm:
-                input.encode(enc, "test.replacing")
-            exc = cm.exception
-            self.assertEqual(exc.start, 1)
-            self.assertEqual(exc.end, 2)
-            self.assertEqual(exc.object, input)
+            with self.subTest(encoding=enc):
+                with self.assertRaises(UnicodeEncodeError) as cm:
+                    input.encode(enc, "test.replacing")
+                exc = cm.exception
+                self.assertEqual(exc.start, 1)
+                self.assertEqual(exc.end, 2)
+                self.assertEqual(exc.object, input)
 
     def test_encode_unencodable_replacement(self):
         def unencrepl(exc):
@@ -839,12 +840,13 @@ def unencrepl(exc):
                 ("utf-16", "[\udc80]", "\udcff"),
                 ("utf-32", "[\udc80]", "\udcff"),
         ):
-            with self.assertRaises(UnicodeEncodeError) as cm:
-                input.encode(enc, "test.unencreplhandler")
-            exc = cm.exception
-            self.assertEqual(exc.start, 1)
-            self.assertEqual(exc.end, 2)
-            self.assertEqual(exc.object, input)
+            with self.subTest(encoding=enc):
+                with self.assertRaises(UnicodeEncodeError) as cm:
+                    input.encode(enc, "test.unencreplhandler")
+                exc = cm.exception
+                self.assertEqual(exc.start, 1)
+                self.assertEqual(exc.end, 2)
+                self.assertEqual(exc.object, input)
 
     def test_encode_bytes_replacement(self):
         def handle(exc):
@@ -864,8 +866,9 @@ def handle(exc):
                 ("utf-32le", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
                 ("utf-32be", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
         ):
-            res = input.encode(enc, "test.replacing")
-            self.assertEqual(res, "[".encode(enc) + repl + "]".encode(enc))
+            with self.subTest(encoding=enc):
+                res = input.encode(enc, "test.replacing")
+                self.assertEqual(res, "[".encode(enc) + repl + "]".encode(enc))
 
     def test_encode_odd_bytes_replacement(self):
         def handle(exc):
@@ -883,13 +886,14 @@ def handle(exc):
             *itertools.product(("utf-32le", "utf-32be"),
                                [b"a", b"ab", b"abc", b"abcde"]),
         ):
-            with self.assertRaises(UnicodeEncodeError) as cm:
-                input.encode(enc, "test.replacing")
-            exc = cm.exception
-            self.assertEqual(exc.start, 1)
-            self.assertEqual(exc.end, 2)
-            self.assertEqual(exc.object, input)
-            self.assertEqual(exc.reason, "surrogates not allowed")
+            with self.subTest(encoding=enc, repl=repl):
+                with self.assertRaises(UnicodeEncodeError) as cm:
+                    input.encode(enc, "test.replacing")
+                exc = cm.exception
+                self.assertEqual(exc.start, 1)
+                self.assertEqual(exc.end, 2)
+                self.assertEqual(exc.object, input)
+                self.assertEqual(exc.reason, "surrogates not allowed")
 
     def test_badregistercall(self):
         # enhance coverage of:

From 7bdc61193ca6e384b6ed82f7b0ae510e49a1b678 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Fri, 15 Oct 2021 19:39:52 +0300
Subject: [PATCH 5/5] Add more tests.

---
 Lib/test/test_codeccallbacks.py | 35 +++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index be123eefc68f7c..4991330489d139 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -1056,7 +1056,7 @@ def test_decodehelper_bug36819(self):
                 self.assertEqual(decoded, 'abcdx' * 51)
 
     def test_encodehelper_bug36819(self):
-        handler = RepeatedPosReturn("x")
+        handler = RepeatedPosReturn()
         codecs.register_error("test.bug36819", handler.handle)
 
         input = "abcd\udc80"
@@ -1064,6 +1064,27 @@ def test_encodehelper_bug36819(self):
         encodings += ["iso-8859-15"]  # charmap codec
         if sys.platform == 'win32':
             encodings = ["mbcs", "oem"]  # code page codecs
+
+        handler.repl = "\udcff"
+        for enc in encodings:
+            with self.subTest(encoding=enc):
+                handler.count = 50
+                with self.assertRaises(UnicodeEncodeError) as cm:
+                    input.encode(enc, "test.bug36819")
+                exc = cm.exception
+                self.assertEqual(exc.start, 4)
+                self.assertEqual(exc.end, 5)
+                self.assertEqual(exc.object, input)
+        if sys.platform == "win32":
+            handler.count = 50
+            with self.assertRaises(UnicodeEncodeError) as cm:
+                codecs.code_page_encode(437, input, "test.bug36819")
+            exc = cm.exception
+            self.assertEqual(exc.start, 4)
+            self.assertEqual(exc.end, 5)
+            self.assertEqual(exc.object, input)
+
+        handler.repl = "x"
         for enc in encodings:
             with self.subTest(encoding=enc):
                 # The interpreter should segfault after a handful of attempts.
@@ -1078,18 +1099,6 @@ def test_encodehelper_bug36819(self):
             self.assertEqual(encoded[0].decode(), "abcdx" * 51)
             self.assertEqual(encoded[1], len(input))
 
-        handler.repl = "\udcff"
-        for enc in encodings:
-            with self.subTest(encoding=enc):
-                handler.count = 50
-                with self.assertRaises(UnicodeEncodeError) as cm:
-                    input.encode(enc, "test.bug36819")
-                exc = cm.exception
-                self.assertEqual(exc.start, 4)
-                self.assertEqual(exc.end, 5)
-                self.assertEqual(exc.object, input)
-
-
     def test_translatehelper(self):
         # enhance coverage of:
         # Objects/unicodeobject.c::unicode_encode_call_errorhandler()