From b239ae5762604f77fcaac7cb81263a31dac560d0 Mon Sep 17 00:00:00 2001 From: sobolevn Date: Fri, 7 Oct 2022 13:10:24 +0300 Subject: [PATCH 1/9] gh-97982: Reuse `PyUnicode_Count` in `unicode_count` --- ...2-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst | 2 + Objects/unicodeobject.c | 51 +------------------ 2 files changed, 3 insertions(+), 50 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst b/Misc/NEWS.d/next/Core and Builtins/2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst new file mode 100644 index 00000000000000..0084d23af6fc39 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst @@ -0,0 +1,2 @@ +Reuse ``PyUnicode_Count`` function in ``unicode_count`` in +``unicodeobject.c``. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index bd169ed714212d..cb465a0e22a8da 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10858,60 +10858,11 @@ unicode_count(PyObject *self, PyObject *args) Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; PyObject *result; - int kind1, kind2; - const void *buf1, *buf2; - Py_ssize_t len1, len2, iresult; if (!parse_args_finds_unicode("count", args, &substring, &start, &end)) return NULL; - kind1 = PyUnicode_KIND(self); - kind2 = PyUnicode_KIND(substring); - if (kind1 < kind2) - return PyLong_FromLong(0); - - len1 = PyUnicode_GET_LENGTH(self); - len2 = PyUnicode_GET_LENGTH(substring); - ADJUST_INDICES(start, end, len1); - if (end - start < len2) - return PyLong_FromLong(0); - - buf1 = PyUnicode_DATA(self); - buf2 = PyUnicode_DATA(substring); - if (kind2 != kind1) { - buf2 = unicode_askind(kind2, buf2, len2, kind1); - if (!buf2) - return NULL; - } - switch (kind1) { - case PyUnicode_1BYTE_KIND: - iresult = ucs1lib_count( - ((const Py_UCS1*)buf1) + start, end - start, - buf2, len2, PY_SSIZE_T_MAX - ); - break; - case PyUnicode_2BYTE_KIND: - iresult = ucs2lib_count( - ((const Py_UCS2*)buf1) + start, end - start, - buf2, len2, PY_SSIZE_T_MAX - ); - break; - case PyUnicode_4BYTE_KIND: - iresult = ucs4lib_count( - ((const Py_UCS4*)buf1) + start, end - start, - buf2, len2, PY_SSIZE_T_MAX - ); - break; - default: - Py_UNREACHABLE(); - } - - result = PyLong_FromSsize_t(iresult); - - assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring))); - if (kind2 != kind1) - PyMem_Free((void *)buf2); - + result = PyLong_FromSsize_t(PyUnicode_Count(self, substring, start, end)); return result; } From e9d63589165daee34efaf76b33d31b297fdaa484 Mon Sep 17 00:00:00 2001 From: sobolevn Date: Fri, 7 Oct 2022 13:29:38 +0300 Subject: [PATCH 2/9] Address review --- Objects/unicodeobject.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index cb465a0e22a8da..7bf77525112deb 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -8964,21 +8964,19 @@ _PyUnicode_InsertThousandsGrouping( return count; } - -Py_ssize_t -PyUnicode_Count(PyObject *str, - PyObject *substr, - Py_ssize_t start, - Py_ssize_t end) +static Py_ssize_t +any_unicode_count(PyObject *str, + PyObject *substr, + Py_ssize_t start, + Py_ssize_t end) { + // You must ensure that `str` and `substr` are both unicode objects + // before calling this function. Py_ssize_t result; int kind1, kind2; const void *buf1 = NULL, *buf2 = NULL; Py_ssize_t len1, len2; - if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) - return -1; - kind1 = PyUnicode_KIND(str); kind2 = PyUnicode_KIND(substr); if (kind1 < kind2) @@ -9039,6 +9037,18 @@ PyUnicode_Count(PyObject *str, return -1; } +Py_ssize_t +PyUnicode_Count(PyObject *str, + PyObject *substr, + Py_ssize_t start, + Py_ssize_t end) +{ + if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) + return -1; + + return any_unicode_count(str, substr, start, end); +} + Py_ssize_t PyUnicode_Find(PyObject *str, PyObject *substr, @@ -10858,11 +10868,16 @@ unicode_count(PyObject *self, PyObject *args) Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; PyObject *result; + Py_ssize_t iresult; if (!parse_args_finds_unicode("count", args, &substring, &start, &end)) return NULL; - result = PyLong_FromSsize_t(PyUnicode_Count(self, substring, start, end)); + iresult = any_unicode_count(self, substring, start, end); + if (iresult == -1) + return NULL; + + result = PyLong_FromSsize_t(iresult); return result; } From 886d6e388127c7d33b7b4e2027571c573ed35e28 Mon Sep 17 00:00:00 2001 From: Nikita Sobolev Date: Fri, 7 Oct 2022 13:44:56 +0300 Subject: [PATCH 3/9] Update 2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst --- .../2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst b/Misc/NEWS.d/next/Core and Builtins/2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst index 0084d23af6fc39..69654a8b1431c0 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst @@ -1,2 +1,3 @@ -Reuse ``PyUnicode_Count`` function in ``unicode_count`` in +Create ``any_unicode_count`` private helper for +both ``PyUnicode_Count`` and ``unicode_count`` in ``unicodeobject.c``. From 482010ac48f42a89619f585fb69c589be47a005a Mon Sep 17 00:00:00 2001 From: Nikita Sobolev Date: Fri, 7 Oct 2022 14:07:23 +0300 Subject: [PATCH 4/9] Update 2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst --- .../2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst b/Misc/NEWS.d/next/Core and Builtins/2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst index 69654a8b1431c0..983ada814b1b20 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst @@ -1,3 +1,3 @@ -Create ``any_unicode_count`` private helper for +Create ``any_unicode_count`` private helper for both ``PyUnicode_Count`` and ``unicode_count`` in ``unicodeobject.c``. From 4b2447e7233d714b0462cf7944aee349a27b6059 Mon Sep 17 00:00:00 2001 From: sobolevn Date: Fri, 7 Oct 2022 14:41:14 +0300 Subject: [PATCH 5/9] Use `assert()` for unicode objects --- Lib/test/test_unicode.py | 10 ++++++++++ Objects/unicodeobject.c | 5 +++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 30faaaf83bec96..edb92f106cf696 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -241,6 +241,10 @@ def test_count(self): self.checkequal(0, 'a' * 10, 'count', 'a\u0102') self.checkequal(0, 'a' * 10, 'count', 'a\U00100304') self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304') + # test subclass + class MyStr(str): + pass + self.checkequal(3, MyStr('aaa'), 'count', 'a') def test_find(self): string_tests.CommonTest.test_find(self) @@ -2983,6 +2987,12 @@ def test_count(self): self.assertEqual(unicode_count(uni, ch, 0, len(uni)), 1) self.assertEqual(unicode_count(st, ch, 0, len(st)), 0) + # subclasses should still work + class MyStr(str): + pass + + self.assertEqual(unicode_count(MyStr('aab'), 'a', 0, 3), 2) + # Test PyUnicode_FindChar() @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 7bf77525112deb..875de97a51c5a6 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -8970,8 +8970,9 @@ any_unicode_count(PyObject *str, Py_ssize_t start, Py_ssize_t end) { - // You must ensure that `str` and `substr` are both unicode objects - // before calling this function. + assert(PyUnicode_Check(str)); + assert(PyUnicode_Check(substr)); + Py_ssize_t result; int kind1, kind2; const void *buf1 = NULL, *buf2 = NULL; From 35ecc77deb4c38f082842ff777f1af67730236a7 Mon Sep 17 00:00:00 2001 From: sobolevn Date: Sat, 8 Oct 2022 12:03:50 +0300 Subject: [PATCH 6/9] Address review --- ...2-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst | 3 --- Objects/unicodeobject.c | 20 +++++++++---------- 2 files changed, 9 insertions(+), 14 deletions(-) delete mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst b/Misc/NEWS.d/next/Core and Builtins/2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst deleted file mode 100644 index 983ada814b1b20..00000000000000 --- a/Misc/NEWS.d/next/Core and Builtins/2022-10-07-13-09-57.gh-issue-97982.6qR7Qm.rst +++ /dev/null @@ -1,3 +0,0 @@ -Create ``any_unicode_count`` private helper for -both ``PyUnicode_Count`` and ``unicode_count`` in -``unicodeobject.c``. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 875de97a51c5a6..469c348eea68e5 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -8965,10 +8965,10 @@ _PyUnicode_InsertThousandsGrouping( } static Py_ssize_t -any_unicode_count(PyObject *str, - PyObject *substr, - Py_ssize_t start, - Py_ssize_t end) +unicode_count_impl(PyObject *str, + PyObject *substr, + Py_ssize_t start, + Py_ssize_t end) { assert(PyUnicode_Check(str)); assert(PyUnicode_Check(substr)); @@ -9047,7 +9047,7 @@ PyUnicode_Count(PyObject *str, if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) return -1; - return any_unicode_count(str, substr, start, end); + return unicode_count_impl(str, substr, start, end); } Py_ssize_t @@ -10868,18 +10868,16 @@ unicode_count(PyObject *self, PyObject *args) PyObject *substring = NULL; /* initialize to fix a compiler warning */ Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; - PyObject *result; - Py_ssize_t iresult; + Py_ssize_t result; if (!parse_args_finds_unicode("count", args, &substring, &start, &end)) return NULL; - iresult = any_unicode_count(self, substring, start, end); - if (iresult == -1) + result = unicode_count_impl(self, substring, start, end); + if (result == -1) return NULL; - result = PyLong_FromSsize_t(iresult); - return result; + return PyLong_FromSsize_t(result); } /*[clinic input] From 2dabc73cc3df589be13c827269f9ef66924ce569 Mon Sep 17 00:00:00 2001 From: sobolevn Date: Wed, 12 Oct 2022 18:19:20 +0300 Subject: [PATCH 7/9] Use `anylib_count` in `unicode_count_impl` --- Objects/unicodeobject.c | 56 ++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 37 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index ee7f87367897f5..93ac46985d3f2f 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -8964,6 +8964,21 @@ _PyUnicode_InsertThousandsGrouping( return count; } +static Py_ssize_t +anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen, + PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) +{ + switch (kind) { + case PyUnicode_1BYTE_KIND: + return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); + case PyUnicode_2BYTE_KIND: + return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); + case PyUnicode_4BYTE_KIND: + return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); + } + Py_UNREACHABLE(); +} + static Py_ssize_t unicode_count_impl(PyObject *str, PyObject *substr, @@ -8997,28 +9012,10 @@ unicode_count_impl(PyObject *str, goto onError; } - switch (kind1) { - case PyUnicode_1BYTE_KIND: - result = ucs1lib_count( - ((const Py_UCS1*)buf1) + start, end - start, - buf2, len2, PY_SSIZE_T_MAX - ); - break; - case PyUnicode_2BYTE_KIND: - result = ucs2lib_count( - ((const Py_UCS2*)buf1) + start, end - start, - buf2, len2, PY_SSIZE_T_MAX - ); - break; - case PyUnicode_4BYTE_KIND: - result = ucs4lib_count( - ((const Py_UCS4*)buf1) + start, end - start, - buf2, len2, PY_SSIZE_T_MAX - ); - break; - default: - Py_UNREACHABLE(); - } + result = anylib_count(kind1, + str, buf1 + start, end - start, + substr, buf2, len2, + PY_SSIZE_T_MAX); assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr))); if (kind2 != kind1) @@ -9903,21 +9900,6 @@ anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1, Py_UNREACHABLE(); } -static Py_ssize_t -anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen, - PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) -{ - switch (kind) { - case PyUnicode_1BYTE_KIND: - return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); - case PyUnicode_2BYTE_KIND: - return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); - case PyUnicode_4BYTE_KIND: - return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); - } - Py_UNREACHABLE(); -} - static void replace_1char_inplace(PyObject *u, Py_ssize_t pos, Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount) From 74f8c1c564bd3d3ac28dd05faa6053980b8e4f95 Mon Sep 17 00:00:00 2001 From: sobolevn Date: Wed, 12 Oct 2022 19:00:55 +0300 Subject: [PATCH 8/9] Revert "Use `anylib_count` in `unicode_count_impl`" This reverts commit 2dabc73cc3df589be13c827269f9ef66924ce569. --- Objects/unicodeobject.c | 56 +++++++++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 93ac46985d3f2f..ee7f87367897f5 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -8964,21 +8964,6 @@ _PyUnicode_InsertThousandsGrouping( return count; } -static Py_ssize_t -anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen, - PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) -{ - switch (kind) { - case PyUnicode_1BYTE_KIND: - return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); - case PyUnicode_2BYTE_KIND: - return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); - case PyUnicode_4BYTE_KIND: - return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); - } - Py_UNREACHABLE(); -} - static Py_ssize_t unicode_count_impl(PyObject *str, PyObject *substr, @@ -9012,10 +8997,28 @@ unicode_count_impl(PyObject *str, goto onError; } - result = anylib_count(kind1, - str, buf1 + start, end - start, - substr, buf2, len2, - PY_SSIZE_T_MAX); + switch (kind1) { + case PyUnicode_1BYTE_KIND: + result = ucs1lib_count( + ((const Py_UCS1*)buf1) + start, end - start, + buf2, len2, PY_SSIZE_T_MAX + ); + break; + case PyUnicode_2BYTE_KIND: + result = ucs2lib_count( + ((const Py_UCS2*)buf1) + start, end - start, + buf2, len2, PY_SSIZE_T_MAX + ); + break; + case PyUnicode_4BYTE_KIND: + result = ucs4lib_count( + ((const Py_UCS4*)buf1) + start, end - start, + buf2, len2, PY_SSIZE_T_MAX + ); + break; + default: + Py_UNREACHABLE(); + } assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr))); if (kind2 != kind1) @@ -9900,6 +9903,21 @@ anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1, Py_UNREACHABLE(); } +static Py_ssize_t +anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen, + PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) +{ + switch (kind) { + case PyUnicode_1BYTE_KIND: + return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); + case PyUnicode_2BYTE_KIND: + return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); + case PyUnicode_4BYTE_KIND: + return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); + } + Py_UNREACHABLE(); +} + static void replace_1char_inplace(PyObject *u, Py_ssize_t pos, Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount) From 69a94e2cc0d0a13de01e9acd2de69d49c7222d58 Mon Sep 17 00:00:00 2001 From: sobolevn Date: Wed, 12 Oct 2022 19:01:39 +0300 Subject: [PATCH 9/9] Add a comment about `anylib_count` --- Objects/unicodeobject.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index ee7f87367897f5..5b737f1b596e3e 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -8997,6 +8997,7 @@ unicode_count_impl(PyObject *str, goto onError; } + // We don't reuse `anylib_count` here because of the explicit casts. switch (kind1) { case PyUnicode_1BYTE_KIND: result = ucs1lib_count(