bpo-36819: Fix crashes in built-in encoders with weird error handlers #28593

Merged
52 changes: 52 additions & 0 deletions Lib/test/test_codeccallbacks.py
@@ -22,6 +22,18 @@ def handle(self, exc):
            self.pos = len(exc.object)
        return ("<?>", oldpos)

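# An error handler which keeps reporting a stale (non-advancing) position for
# a fixed number of calls before finally skipping past the offending input;
# used by the bpo-36819 regression tests below.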
class RepeatedPosReturn:
    def __init__(self, repl="<?>"):
        self.repl = repl
        self.pos = 0
        self.count = 0

    def handle(self, exc):
        if self.count > 0:
            self.count -= 1
            return (self.repl, self.pos)
        return (self.repl, exc.end)

# A UnicodeEncodeError object with a bad start attribute
class BadStartUnicodeEncodeError(UnicodeEncodeError):
    def __init__(self):
@@ -940,6 +952,46 @@ def __getitem__(self, key):
        self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D())
        self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300})

    def test_decodehelper_bug36819(self):
        handler = RepeatedPosReturn("x")
        codecs.register_error("test.bug36819", handler.handle)

        testcases = [
            ("ascii", b"abcd\xff"),
            ("utf-8", b"abcd\xff"),
            ("utf-16be", b'\x00a\x00b\x00c\x00d\xdc\xff'),
            ("utf-32be", b'\x00\x00\x00a\x00\x00\x00b\x00\x00\x00c\x00\x00\x00d\x00\x00\xdc\xff'),
            ("iso-8859-6", b"abcd\xff"),
        ]
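        # Each stale position makes the decoder re-decode "abcd" and emit one
        # replacement; the final call consumes the bad byte, so the expected
        # result is "abcdx" repeated count + 1 == 51 times.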
        for enc, data in testcases:
            with self.subTest(encoding=enc):
                handler.count = 50
                decoded = data.decode(enc, "test.bug36819")
                self.assertEqual(decoded, 'abcdx' * 51)

    def test_encodehelper_bug36819(self):
        handler = RepeatedPosReturn("x")
        codecs.register_error("test.bug36819", handler.handle)

        string = "abcd\udcff"
        encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"] # built-in
        encodings += ["iso-8859-15"] # charmap codec
        if sys.platform == 'win32':
            encodings = ["mbcs", "oem"] # code page codecs
        for enc in encodings:
            with self.subTest(encoding=enc):
                # Without the fix, the interpreter segfaults after a handful
                # of attempts.  A count of 50 is enough to reliably trigger
                # the crash on an unfixed build without exhausting memory on
                # a fixed one.
                handler.count = 50
                encoded = string.encode(enc, "test.bug36819")
                self.assertEqual(encoded.decode(enc), "abcdx" * 51)
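        # codecs.code_page_encode() is only available on Windows; it returns a
        # (bytes, length_consumed) pair, hence the two assertions on encoded.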
if sys.platform == "win32":
handler.count = 50
encoded = codecs.code_page_encode(437, string, "test.bug36819")
self.assertEqual(encoded[0].decode(), "abcdx" * 51)
self.assertEqual(encoded[1], len(string))

    def test_translatehelper(self):
        # enhance coverage of:
        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
@@ -0,0 +1,2 @@
Fix crashes in built-in encoders with error handlers that return a position
less than or equal to the starting position of the non-encodable characters.
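
For context, here is a minimal sketch of the kind of error handler this entry
describes, modelled on the RepeatedPosReturn helper added in the tests above.
The handler class, the registered name "demo.bug36819" and the retry count are
illustrative only and are not part of the patch:

import codecs

class RewindingHandler:
    """Report a stale position a few times before skipping the bad character."""
    def __init__(self, repl="x", retries=3):
        self.repl = repl
        self.retries = retries

    def __call__(self, exc):
        if self.retries > 0:
            self.retries -= 1
            return (self.repl, 0)        # rewind to the start of the input
        return (self.repl, exc.end)      # finally move past the bad character

codecs.register_error("demo.bug36819", RewindingHandler())

# Each rewind re-encodes "abcd" and appends one replacement, so the result is
# b"abcdx" * 4.  Before the fix, rewinding like this could write beyond the
# preallocated output buffer and crash the interpreter.
print("abcd\udcff".encode("ascii", "demo.bug36819"))

The new tests apply the same pattern with 50 retries to the ASCII, Latin-1,
UTF-8, UTF-16, UTF-32, charmap and Windows code-page codecs.
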
15 changes: 13 additions & 2 deletions Objects/stringlib/codecs.h
@@ -387,8 +387,19 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
if (!rep)
goto error;

/* subtract preallocated bytes */
writer->min_size -= max_char_size * (newpos - startpos);
if (newpos < startpos) {
writer->overallocate = 1;
p = _PyBytesWriter_Prepare(writer, p,
max_char_size * (startpos - newpos));
if (p == NULL)
goto error;
}
else {
/* subtract preallocated bytes */
writer->min_size -= max_char_size * (newpos - startpos);
/* Only overallocate the buffer if it's not the last write */
writer->overallocate = (newpos < size);
}

if (PyBytes_Check(rep)) {
p = _PyBytesWriter_WriteBytes(writer, p,
60 changes: 39 additions & 21 deletions Objects/unicodeobject.c
@@ -5906,7 +5906,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,

pos = 0;
while (pos < len) {
Py_ssize_t repsize, moreunits;
Py_ssize_t newpos, repsize, moreunits;

if (kind == PyUnicode_2BYTE_KIND) {
pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
@@ -5923,15 +5923,15 @@ _PyUnicode_EncodeUTF32(PyObject *str,
rep = unicode_encode_call_errorhandler(
errors, &errorHandler,
encoding, "surrogates not allowed",
str, &exc, pos, pos + 1, &pos);
str, &exc, pos, pos + 1, &newpos);
if (!rep)
goto error;

if (PyBytes_Check(rep)) {
repsize = PyBytes_GET_SIZE(rep);
if (repsize & 3) {
raise_encode_exception(&exc, encoding,
str, pos - 1, pos,
str, pos, pos + 1,
"surrogates not allowed");
goto error;
}
@@ -5944,28 +5944,30 @@ _PyUnicode_EncodeUTF32(PyObject *str,
moreunits = repsize = PyUnicode_GET_LENGTH(rep);
if (!PyUnicode_IS_ASCII(rep)) {
raise_encode_exception(&exc, encoding,
str, pos - 1, pos,
str, pos, pos + 1,
"surrogates not allowed");
goto error;
}
}
moreunits += pos - newpos;
pos = newpos;

/* four bytes are reserved for each surrogate */
if (moreunits > 1) {
if (moreunits > 0) {
Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
/* integer overflow */
PyErr_NoMemory();
goto error;
}
if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
goto error;
out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
}

if (PyBytes_Check(rep)) {
memcpy(out, PyBytes_AS_STRING(rep), repsize);
out += moreunits;
out += repsize / 4;
} else /* rep is unicode */ {
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
@@ -6243,7 +6245,7 @@ _PyUnicode_EncodeUTF16(PyObject *str,

pos = 0;
while (pos < len) {
Py_ssize_t repsize, moreunits;
Py_ssize_t newpos, repsize, moreunits;

if (kind == PyUnicode_2BYTE_KIND) {
pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
@@ -6260,15 +6262,15 @@ _PyUnicode_EncodeUTF16(PyObject *str,
rep = unicode_encode_call_errorhandler(
errors, &errorHandler,
encoding, "surrogates not allowed",
str, &exc, pos, pos + 1, &pos);
str, &exc, pos, pos + 1, &newpos);
if (!rep)
goto error;

if (PyBytes_Check(rep)) {
repsize = PyBytes_GET_SIZE(rep);
if (repsize & 1) {
raise_encode_exception(&exc, encoding,
str, pos - 1, pos,
str, pos, pos + 1,
"surrogates not allowed");
goto error;
}
@@ -6281,28 +6283,30 @@ _PyUnicode_EncodeUTF16(PyObject *str,
moreunits = repsize = PyUnicode_GET_LENGTH(rep);
if (!PyUnicode_IS_ASCII(rep)) {
raise_encode_exception(&exc, encoding,
str, pos - 1, pos,
str, pos, pos + 1,
"surrogates not allowed");
goto error;
}
}
moreunits += pos - newpos;
pos = newpos;

/* two bytes are reserved for each surrogate */
if (moreunits > 1) {
if (moreunits > 0) {
Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
/* integer overflow */
PyErr_NoMemory();
goto error;
}
if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
goto error;
out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
}

if (PyBytes_Check(rep)) {
memcpy(out, PyBytes_AS_STRING(rep), repsize);
out += moreunits;
out += repsize / 2;
} else /* rep is unicode */ {
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
@@ -7137,8 +7141,19 @@ unicode_encode_ucs1(PyObject *unicode,
if (rep == NULL)
goto onError;

/* subtract preallocated bytes */
writer.min_size -= newpos - collstart;
if (newpos < collstart) {
writer.overallocate = 1;
str = _PyBytesWriter_Prepare(&writer, str,
collstart - newpos);
if (str == NULL)
goto onError;
}
else {
/* subtract preallocated bytes */
writer.min_size -= newpos - collstart;
/* Only overallocate the buffer if it's not the last write */
writer.overallocate = (newpos < size);
}

if (PyBytes_Check(rep)) {
/* Directly copy bytes result to output. */
@@ -7914,13 +7929,14 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
pos, pos + 1, &newpos);
if (rep == NULL)
goto error;
pos = newpos;

Py_ssize_t morebytes = pos - newpos;
if (PyBytes_Check(rep)) {
outsize = PyBytes_GET_SIZE(rep);
if (outsize != 1) {
morebytes += outsize;
if (morebytes > 0) {
Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
Py_DECREF(rep);
goto error;
@@ -7941,9 +7957,10 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
}

outsize = PyUnicode_GET_LENGTH(rep);
if (outsize != 1) {
morebytes += outsize;
if (morebytes > 0) {
Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
Py_DECREF(rep);
goto error;
Expand All @@ -7966,6 +7983,7 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
out++;
}
}
pos = newpos;
Py_DECREF(rep);
}
/* write a NUL byte */