From 1a49d15e588d08122b48258ad13d0adfe40d23ed Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 28 Sep 2021 10:58:51 +0300 Subject: [PATCH 1/5] bpo-36819: Fix crashes in built-in encoders with weird error handlers If the error handler returns position less or equal than the starting position of non-encodable characters, most of built-in encoders didn't properly re-size the output buffer. This led to out-of-bounds writes, and segfaults. --- Lib/test/test_codeccallbacks.py | 52 ++++++++++++++++ .../2021-09-28-10-58-30.bpo-36819.cyV50C.rst | 2 + Objects/stringlib/codecs.h | 15 ++++- Objects/unicodeobject.c | 60 ++++++++++++------- 4 files changed, 106 insertions(+), 23 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-09-28-10-58-30.bpo-36819.cyV50C.rst diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index 243f002c4ecadd..29be9405d23960 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -22,6 +22,18 @@ def handle(self, exc): self.pos = len(exc.object) return ("", oldpos) +class RepeatedPosReturn: + def __init__(self, repl=""): + self.repl = repl + self.pos = 0 + self.count = 0 + + def handle(self, exc): + if self.count > 0: + self.count -= 1 + return (self.repl, self.pos) + return (self.repl, exc.end) + # A UnicodeEncodeError object with a bad start attribute class BadStartUnicodeEncodeError(UnicodeEncodeError): def __init__(self): @@ -940,6 +952,46 @@ def __getitem__(self, key): self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D()) self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300}) + def test_decodehelper_bug36819(self): + handler = RepeatedPosReturn("x") + codecs.register_error("test.bug36819", handler.handle) + + testcases = [ + ("ascii", b"abcd\xff"), + ("utf-8", b"abcd\xff"), + ("utf-16be", b'\x00a\x00b\x00c\x00d\xdc\xff'), + ("utf-32be", b'\x00\x00\x00a\x00\x00\x00b\x00\x00\x00c\x00\x00\x00d\x00\x00\xdc\xff'), + ("iso-8859-6", b"abcd\xff"), + ] + for enc, data in testcases: + with self.subTest(encoding=enc): + handler.count = 50 + decoded = data.decode(enc, "test.bug36819") + self.assertEqual(decoded, 'abcdx' * 51) + + def test_encodehelper_bug36819(self): + handler = RepeatedPosReturn("x") + codecs.register_error("test.bug36819", handler.handle) + + string = "abcd\udcff" + encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"] # built-in + encodings += ["iso-8859-15"] # charmap codec + if sys.platform == 'win32': + encodings = ["mbcs", "oem"] # code page codecs + for enc in encodings: + with self.subTest(encoding=enc): + # The interpreter should segfault after a handful of attempts. + # 50 was chosen to try to ensure a segfault without a fix, + # but not OOM a machine with one. + handler.count = 50 + encoded = string.encode(enc, "test.bug36819") + self.assertEqual(encoded.decode(enc), "abcdx" * 51) + if sys.platform == "win32": + handler.count = 50 + encoded = codecs.code_page_encode(437, string, "test.bug36819") + self.assertEqual(encoded[0].decode(), "abcdx" * 51) + self.assertEqual(encoded[1], len(string)) + def test_translatehelper(self): # enhance coverage of: # Objects/unicodeobject.c::unicode_encode_call_errorhandler() diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-09-28-10-58-30.bpo-36819.cyV50C.rst b/Misc/NEWS.d/next/Core and Builtins/2021-09-28-10-58-30.bpo-36819.cyV50C.rst new file mode 100644 index 00000000000000..32bb55a90e6c4d --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2021-09-28-10-58-30.bpo-36819.cyV50C.rst @@ -0,0 +1,2 @@ +Fix crashes in built-in encoders with error handlers that return position +less or equal than the starting position of non-encodable characters. diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index b17cda18f54b3e..958cc86147815d 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -387,8 +387,19 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer, if (!rep) goto error; - /* subtract preallocated bytes */ - writer->min_size -= max_char_size * (newpos - startpos); + if (newpos < startpos) { + writer->overallocate = 1; + p = _PyBytesWriter_Prepare(writer, p, + max_char_size * (startpos - newpos)); + if (p == NULL) + goto error; + } + else { + /* subtract preallocated bytes */ + writer->min_size -= max_char_size * (newpos - startpos); + /* Only overallocate the buffer if it's not the last write */ + writer->overallocate = (newpos < size); + } if (PyBytes_Check(rep)) { p = _PyBytesWriter_WriteBytes(writer, p, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 02bf56e681e56e..66994eb9dfdbd0 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -5906,7 +5906,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, pos = 0; while (pos < len) { - Py_ssize_t repsize, moreunits; + Py_ssize_t newpos, repsize, moreunits; if (kind == PyUnicode_2BYTE_KIND) { pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos, @@ -5923,7 +5923,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, rep = unicode_encode_call_errorhandler( errors, &errorHandler, encoding, "surrogates not allowed", - str, &exc, pos, pos + 1, &pos); + str, &exc, pos, pos + 1, &newpos); if (!rep) goto error; @@ -5931,7 +5931,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, repsize = PyBytes_GET_SIZE(rep); if (repsize & 3) { raise_encode_exception(&exc, encoding, - str, pos - 1, pos, + str, pos, pos + 1, "surrogates not allowed"); goto error; } @@ -5944,28 +5944,30 @@ _PyUnicode_EncodeUTF32(PyObject *str, moreunits = repsize = PyUnicode_GET_LENGTH(rep); if (!PyUnicode_IS_ASCII(rep)) { raise_encode_exception(&exc, encoding, - str, pos - 1, pos, + str, pos, pos + 1, "surrogates not allowed"); goto error; } } + moreunits += pos - newpos; + pos = newpos; /* four bytes are reserved for each surrogate */ - if (moreunits > 1) { + if (moreunits > 0) { Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v); if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) { /* integer overflow */ PyErr_NoMemory(); goto error; } - if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0) + if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0) goto error; out = (uint32_t*) PyBytes_AS_STRING(v) + outpos; } if (PyBytes_Check(rep)) { memcpy(out, PyBytes_AS_STRING(rep), repsize); - out += moreunits; + out += repsize / 4; } else /* rep is unicode */ { assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize, @@ -6243,7 +6245,7 @@ _PyUnicode_EncodeUTF16(PyObject *str, pos = 0; while (pos < len) { - Py_ssize_t repsize, moreunits; + Py_ssize_t newpos, repsize, moreunits; if (kind == PyUnicode_2BYTE_KIND) { pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos, @@ -6260,7 +6262,7 @@ _PyUnicode_EncodeUTF16(PyObject *str, rep = unicode_encode_call_errorhandler( errors, &errorHandler, encoding, "surrogates not allowed", - str, &exc, pos, pos + 1, &pos); + str, &exc, pos, pos + 1, &newpos); if (!rep) goto error; @@ -6268,7 +6270,7 @@ _PyUnicode_EncodeUTF16(PyObject *str, repsize = PyBytes_GET_SIZE(rep); if (repsize & 1) { raise_encode_exception(&exc, encoding, - str, pos - 1, pos, + str, pos, pos + 1, "surrogates not allowed"); goto error; } @@ -6281,28 +6283,30 @@ _PyUnicode_EncodeUTF16(PyObject *str, moreunits = repsize = PyUnicode_GET_LENGTH(rep); if (!PyUnicode_IS_ASCII(rep)) { raise_encode_exception(&exc, encoding, - str, pos - 1, pos, + str, pos, pos + 1, "surrogates not allowed"); goto error; } } + moreunits += pos - newpos; + pos = newpos; /* two bytes are reserved for each surrogate */ - if (moreunits > 1) { + if (moreunits > 0) { Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v); if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) { /* integer overflow */ PyErr_NoMemory(); goto error; } - if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0) + if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0) goto error; out = (unsigned short*) PyBytes_AS_STRING(v) + outpos; } if (PyBytes_Check(rep)) { memcpy(out, PyBytes_AS_STRING(rep), repsize); - out += moreunits; + out += repsize / 2; } else /* rep is unicode */ { assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize, @@ -7137,8 +7141,19 @@ unicode_encode_ucs1(PyObject *unicode, if (rep == NULL) goto onError; - /* subtract preallocated bytes */ - writer.min_size -= newpos - collstart; + if (newpos < collstart) { + writer.overallocate = 1; + str = _PyBytesWriter_Prepare(&writer, str, + collstart - newpos); + if (str == NULL) + goto onError; + } + else { + /* subtract preallocated bytes */ + writer.min_size -= newpos - collstart; + /* Only overallocate the buffer if it's not the last write */ + writer.overallocate = (newpos < size); + } if (PyBytes_Check(rep)) { /* Directly copy bytes result to output. */ @@ -7914,13 +7929,14 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes, pos, pos + 1, &newpos); if (rep == NULL) goto error; - pos = newpos; + Py_ssize_t morebytes = pos - newpos; if (PyBytes_Check(rep)) { outsize = PyBytes_GET_SIZE(rep); - if (outsize != 1) { + morebytes += outsize; + if (morebytes > 0) { Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); - newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); + newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes; if (_PyBytes_Resize(outbytes, newoutsize) < 0) { Py_DECREF(rep); goto error; @@ -7941,9 +7957,10 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes, } outsize = PyUnicode_GET_LENGTH(rep); - if (outsize != 1) { + morebytes += outsize; + if (morebytes > 0) { Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); - newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); + newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes; if (_PyBytes_Resize(outbytes, newoutsize) < 0) { Py_DECREF(rep); goto error; @@ -7966,6 +7983,7 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes, out++; } } + pos = newpos; Py_DECREF(rep); } /* write a NUL byte */ From 6d8cb3085aa6f3cc9bea7fd3f2c5f3a667e45021 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 15 Oct 2021 19:13:46 +0300 Subject: [PATCH 2/5] Add tests for various replacement strings --- Lib/test/test_codeccallbacks.py | 111 +++++++++++++++++++++++++++----- 1 file changed, 96 insertions(+), 15 deletions(-) diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index 29be9405d23960..49c98704d418a8 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -1,5 +1,6 @@ import codecs import html.entities +import itertools import sys import unicodedata import unittest @@ -795,20 +796,100 @@ def test_lookup(self): codecs.lookup_error("namereplace") ) - def test_unencodablereplacement(self): + def test_encode_nonascii_replacement(self): + def handle(exc): + if isinstance(exc, UnicodeEncodeError): + return (repl, exc.end) + raise TypeError("don't know how to handle %r" % exc) + codecs.register_error("test.replacing", handle) + + for enc, input, repl in ( + ("ascii", "[¤]", "abc"), + ("iso-8859-1", "[€]", "½¾"), + ("iso-8859-15", "[¤]", "œŸ"), + ): + res = input.encode(enc, "test.replacing") + self.assertEqual(res, ("[" + repl + "]").encode(enc)) + + for enc, input, repl in ( + ("utf-8", "[\udc80]", "\U0001f40d"), + ("utf-16", "[\udc80]", "\U0001f40d"), + ("utf-32", "[\udc80]", "\U0001f40d"), + ): + with self.assertRaises(UnicodeEncodeError) as cm: + input.encode(enc, "test.replacing") + exc = cm.exception + self.assertEqual(exc.start, 1) + self.assertEqual(exc.end, 2) + self.assertEqual(exc.object, input) + + def test_encode_unencodable_replacement(self): def unencrepl(exc): if isinstance(exc, UnicodeEncodeError): - return ("\u4242", exc.end) + return (repl, exc.end) else: raise TypeError("don't know how to handle %r" % exc) codecs.register_error("test.unencreplhandler", unencrepl) - for enc in ("ascii", "iso-8859-1", "iso-8859-15"): - self.assertRaises( - UnicodeEncodeError, - "\u4242".encode, - enc, - "test.unencreplhandler" - ) + + for enc, input, repl in ( + ("ascii", "[¤]", "½"), + ("iso-8859-1", "[€]", "œ"), + ("iso-8859-15", "[¤]", "½"), + ("utf-8", "[\udc80]", "\udcff"), + ("utf-16", "[\udc80]", "\udcff"), + ("utf-32", "[\udc80]", "\udcff"), + ): + with self.assertRaises(UnicodeEncodeError) as cm: + input.encode(enc, "test.unencreplhandler") + exc = cm.exception + self.assertEqual(exc.start, 1) + self.assertEqual(exc.end, 2) + self.assertEqual(exc.object, input) + + def test_encode_bytes_replacement(self): + def handle(exc): + if isinstance(exc, UnicodeEncodeError): + return (repl, exc.end) + raise TypeError("don't know how to handle %r" % exc) + codecs.register_error("test.replacing", handle) + + # It works even if the bytes sequence is not decodable. + for enc, input, repl in ( + ("ascii", "[¤]", b"\xbd\xbe"), + ("iso-8859-1", "[€]", b"\xbd\xbe"), + ("iso-8859-15", "[¤]", b"\xbd\xbe"), + ("utf-8", "[\udc80]", b"\xbd\xbe"), + ("utf-16le", "[\udc80]", b"\xbd\xbe"), + ("utf-16be", "[\udc80]", b"\xbd\xbe"), + ("utf-32le", "[\udc80]", b"\xbc\xbd\xbe\xbf"), + ("utf-32be", "[\udc80]", b"\xbc\xbd\xbe\xbf"), + ): + res = input.encode(enc, "test.replacing") + self.assertEqual(res, "[".encode(enc) + repl + "]".encode(enc)) + + def test_encode_odd_bytes_replacement(self): + def handle(exc): + if isinstance(exc, UnicodeEncodeError): + return (repl, exc.end) + raise TypeError("don't know how to handle %r" % exc) + codecs.register_error("test.replacing", handle) + + input = "[\udc80]" + # Tests in which the replacement bytestring contains not whole number + # of code units. + for enc, repl in ( + *itertools.product(("utf-16le", "utf-16be"), + [b"a", b"abc"]), + *itertools.product(("utf-32le", "utf-32be"), + [b"a", b"ab", b"abc", b"abcde"]), + ): + with self.assertRaises(UnicodeEncodeError) as cm: + input.encode(enc, "test.replacing") + exc = cm.exception + self.assertEqual(exc.start, 1) + self.assertEqual(exc.end, 2) + self.assertEqual(exc.object, input) + self.assertEqual(exc.reason, "surrogates not allowed") def test_badregistercall(self): # enhance coverage of: @@ -963,17 +1044,17 @@ def test_decodehelper_bug36819(self): ("utf-32be", b'\x00\x00\x00a\x00\x00\x00b\x00\x00\x00c\x00\x00\x00d\x00\x00\xdc\xff'), ("iso-8859-6", b"abcd\xff"), ] - for enc, data in testcases: + for enc, input in testcases: with self.subTest(encoding=enc): handler.count = 50 - decoded = data.decode(enc, "test.bug36819") + decoded = input.decode(enc, "test.bug36819") self.assertEqual(decoded, 'abcdx' * 51) def test_encodehelper_bug36819(self): handler = RepeatedPosReturn("x") codecs.register_error("test.bug36819", handler.handle) - string = "abcd\udcff" + input = "abcd\udcff" encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"] # built-in encodings += ["iso-8859-15"] # charmap codec if sys.platform == 'win32': @@ -984,13 +1065,13 @@ def test_encodehelper_bug36819(self): # 50 was chosen to try to ensure a segfault without a fix, # but not OOM a machine with one. handler.count = 50 - encoded = string.encode(enc, "test.bug36819") + encoded = input.encode(enc, "test.bug36819") self.assertEqual(encoded.decode(enc), "abcdx" * 51) if sys.platform == "win32": handler.count = 50 - encoded = codecs.code_page_encode(437, string, "test.bug36819") + encoded = codecs.code_page_encode(437, input, "test.bug36819") self.assertEqual(encoded[0].decode(), "abcdx" * 51) - self.assertEqual(encoded[1], len(string)) + self.assertEqual(encoded[1], len(input)) def test_translatehelper(self): # enhance coverage of: From c13946745e7f23e3adb2381a12ffd8d7e5e51efa Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 15 Oct 2021 19:32:49 +0300 Subject: [PATCH 3/5] Refactor and more tests. --- Lib/test/test_codeccallbacks.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index 49c98704d418a8..81fd004830ec74 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -1038,13 +1038,14 @@ def test_decodehelper_bug36819(self): codecs.register_error("test.bug36819", handler.handle) testcases = [ - ("ascii", b"abcd\xff"), - ("utf-8", b"abcd\xff"), - ("utf-16be", b'\x00a\x00b\x00c\x00d\xdc\xff'), - ("utf-32be", b'\x00\x00\x00a\x00\x00\x00b\x00\x00\x00c\x00\x00\x00d\x00\x00\xdc\xff'), - ("iso-8859-6", b"abcd\xff"), + ("ascii", b"\xff"), + ("utf-8", b"\xff"), + ("utf-16be", b'\xdc\x80'), + ("utf-32be", b'\x00\x00\xdc\x80'), + ("iso-8859-6", b"\xff"), ] - for enc, input in testcases: + for enc, bad in testcases: + input = "abcd".encode(enc) + bad with self.subTest(encoding=enc): handler.count = 50 decoded = input.decode(enc, "test.bug36819") @@ -1054,7 +1055,7 @@ def test_encodehelper_bug36819(self): handler = RepeatedPosReturn("x") codecs.register_error("test.bug36819", handler.handle) - input = "abcd\udcff" + input = "abcd\udc80" encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"] # built-in encodings += ["iso-8859-15"] # charmap codec if sys.platform == 'win32': @@ -1073,6 +1074,18 @@ def test_encodehelper_bug36819(self): self.assertEqual(encoded[0].decode(), "abcdx" * 51) self.assertEqual(encoded[1], len(input)) + handler.repl = "\udcff" + for enc in encodings: + with self.subTest(encoding=enc): + handler.count = 50 + with self.assertRaises(UnicodeEncodeError) as cm: + input.encode(enc, "test.bug36819") + exc = cm.exception + self.assertEqual(exc.start, 4) + self.assertEqual(exc.end, 5) + self.assertEqual(exc.object, input) + + def test_translatehelper(self): # enhance coverage of: # Objects/unicodeobject.c::unicode_encode_call_errorhandler() From 7cc768d899d8f553ae9e2087efb122e03d0ff021 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 15 Oct 2021 19:35:43 +0300 Subject: [PATCH 4/5] Add more subTests. --- Lib/test/test_codeccallbacks.py | 46 ++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index 81fd004830ec74..be123eefc68f7c 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -816,12 +816,13 @@ def handle(exc): ("utf-16", "[\udc80]", "\U0001f40d"), ("utf-32", "[\udc80]", "\U0001f40d"), ): - with self.assertRaises(UnicodeEncodeError) as cm: - input.encode(enc, "test.replacing") - exc = cm.exception - self.assertEqual(exc.start, 1) - self.assertEqual(exc.end, 2) - self.assertEqual(exc.object, input) + with self.subTest(encoding=enc): + with self.assertRaises(UnicodeEncodeError) as cm: + input.encode(enc, "test.replacing") + exc = cm.exception + self.assertEqual(exc.start, 1) + self.assertEqual(exc.end, 2) + self.assertEqual(exc.object, input) def test_encode_unencodable_replacement(self): def unencrepl(exc): @@ -839,12 +840,13 @@ def unencrepl(exc): ("utf-16", "[\udc80]", "\udcff"), ("utf-32", "[\udc80]", "\udcff"), ): - with self.assertRaises(UnicodeEncodeError) as cm: - input.encode(enc, "test.unencreplhandler") - exc = cm.exception - self.assertEqual(exc.start, 1) - self.assertEqual(exc.end, 2) - self.assertEqual(exc.object, input) + with self.subTest(encoding=enc): + with self.assertRaises(UnicodeEncodeError) as cm: + input.encode(enc, "test.unencreplhandler") + exc = cm.exception + self.assertEqual(exc.start, 1) + self.assertEqual(exc.end, 2) + self.assertEqual(exc.object, input) def test_encode_bytes_replacement(self): def handle(exc): @@ -864,8 +866,9 @@ def handle(exc): ("utf-32le", "[\udc80]", b"\xbc\xbd\xbe\xbf"), ("utf-32be", "[\udc80]", b"\xbc\xbd\xbe\xbf"), ): - res = input.encode(enc, "test.replacing") - self.assertEqual(res, "[".encode(enc) + repl + "]".encode(enc)) + with self.subTest(encoding=enc): + res = input.encode(enc, "test.replacing") + self.assertEqual(res, "[".encode(enc) + repl + "]".encode(enc)) def test_encode_odd_bytes_replacement(self): def handle(exc): @@ -883,13 +886,14 @@ def handle(exc): *itertools.product(("utf-32le", "utf-32be"), [b"a", b"ab", b"abc", b"abcde"]), ): - with self.assertRaises(UnicodeEncodeError) as cm: - input.encode(enc, "test.replacing") - exc = cm.exception - self.assertEqual(exc.start, 1) - self.assertEqual(exc.end, 2) - self.assertEqual(exc.object, input) - self.assertEqual(exc.reason, "surrogates not allowed") + with self.subTest(encoding=enc, repl=repl): + with self.assertRaises(UnicodeEncodeError) as cm: + input.encode(enc, "test.replacing") + exc = cm.exception + self.assertEqual(exc.start, 1) + self.assertEqual(exc.end, 2) + self.assertEqual(exc.object, input) + self.assertEqual(exc.reason, "surrogates not allowed") def test_badregistercall(self): # enhance coverage of: From 7bdc61193ca6e384b6ed82f7b0ae510e49a1b678 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 15 Oct 2021 19:39:52 +0300 Subject: [PATCH 5/5] Add more tests. --- Lib/test/test_codeccallbacks.py | 35 +++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index be123eefc68f7c..4991330489d139 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -1056,7 +1056,7 @@ def test_decodehelper_bug36819(self): self.assertEqual(decoded, 'abcdx' * 51) def test_encodehelper_bug36819(self): - handler = RepeatedPosReturn("x") + handler = RepeatedPosReturn() codecs.register_error("test.bug36819", handler.handle) input = "abcd\udc80" @@ -1064,6 +1064,27 @@ def test_encodehelper_bug36819(self): encodings += ["iso-8859-15"] # charmap codec if sys.platform == 'win32': encodings = ["mbcs", "oem"] # code page codecs + + handler.repl = "\udcff" + for enc in encodings: + with self.subTest(encoding=enc): + handler.count = 50 + with self.assertRaises(UnicodeEncodeError) as cm: + input.encode(enc, "test.bug36819") + exc = cm.exception + self.assertEqual(exc.start, 4) + self.assertEqual(exc.end, 5) + self.assertEqual(exc.object, input) + if sys.platform == "win32": + handler.count = 50 + with self.assertRaises(UnicodeEncodeError) as cm: + codecs.code_page_encode(437, input, "test.bug36819") + exc = cm.exception + self.assertEqual(exc.start, 4) + self.assertEqual(exc.end, 5) + self.assertEqual(exc.object, input) + + handler.repl = "x" for enc in encodings: with self.subTest(encoding=enc): # The interpreter should segfault after a handful of attempts. @@ -1078,18 +1099,6 @@ def test_encodehelper_bug36819(self): self.assertEqual(encoded[0].decode(), "abcdx" * 51) self.assertEqual(encoded[1], len(input)) - handler.repl = "\udcff" - for enc in encodings: - with self.subTest(encoding=enc): - handler.count = 50 - with self.assertRaises(UnicodeEncodeError) as cm: - input.encode(enc, "test.bug36819") - exc = cm.exception - self.assertEqual(exc.start, 4) - self.assertEqual(exc.end, 5) - self.assertEqual(exc.object, input) - - def test_translatehelper(self): # enhance coverage of: # Objects/unicodeobject.c::unicode_encode_call_errorhandler()