gh-94808: Cover `PyUnicode_Count` in CAPI #96929

sobolevn · 2022-09-19T09:22:04Z

It is heavily inspired by

Lines 99 to 161 in cbdeda8

    
           # Exercises the 'count' method in two parts: a table of fixed cases
           # routed through self.checkequal / self.checkraises, then an
           # exhaustive cross-check of count() against a replace()-based
           # computation over short strings drawn from the alphabet {a, b}.
           # NOTE(review): checkequal presumably takes (expected, obj,
           # method_name, *args) and checkraises (exc_type, obj, method_name,
           # *args) -- confirm against the helpers, which are not shown here.
           def test_count(self): 
        
               # Plain counts without start/end arguments (several of these
               # calls are repeated verbatim).
               self.checkequal(3, 'aaa', 'count', 'a') 
        
               self.checkequal(0, 'aaa', 'count', 'b') 
        
               self.checkequal(3, 'aaa', 'count', 'a') 
        
               self.checkequal(0, 'aaa', 'count', 'b') 
        
               self.checkequal(3, 'aaa', 'count', 'a') 
        
               self.checkequal(0, 'aaa', 'count', 'b') 
        
               self.checkequal(0, 'aaa', 'count', 'b') 
        
               # Start index only: in range, past the end, negative, and
               # negative beyond the beginning.
               self.checkequal(2, 'aaa', 'count', 'a', 1) 
        
               self.checkequal(0, 'aaa', 'count', 'a', 10) 
        
               self.checkequal(1, 'aaa', 'count', 'a', -1) 
        
               self.checkequal(3, 'aaa', 'count', 'a', -10) 
        
               # Start and end indices, with the same in-range / out-of-range /
               # negative variations applied to the end index.
               self.checkequal(1, 'aaa', 'count', 'a', 0, 1) 
        
               self.checkequal(3, 'aaa', 'count', 'a', 0, 10) 
        
               self.checkequal(2, 'aaa', 'count', 'a', 0, -1) 
        
               self.checkequal(0, 'aaa', 'count', 'a', 0, -10) 
        
               # Empty needle with various start indices.
               self.checkequal(3, 'aaa', 'count', '', 1) 
        
               self.checkequal(1, 'aaa', 'count', '', 3) 
        
               self.checkequal(0, 'aaa', 'count', '', 10) 
        
               self.checkequal(2, 'aaa', 'count', '', -1) 
        
               self.checkequal(4, 'aaa', 'count', '', -10) 
        
               # Empty haystack, empty needle; includes a start index larger
               # than the end index (sys.maxsize, 0).
               self.checkequal(1, '', 'count', '') 
        
               self.checkequal(0, '', 'count', '', 1, 1) 
        
               self.checkequal(0, '', 'count', '', sys.maxsize, 0) 
        
               # Empty haystack, non-empty needle.
               self.checkequal(0, '', 'count', 'xx') 
        
               self.checkequal(0, '', 'count', 'xx', 1, 1) 
        
               self.checkequal(0, '', 'count', 'xx', sys.maxsize, 0) 
        
               # Calling count with no needle at all must raise TypeError.
               self.checkraises(TypeError, 'hello', 'count') 
        
               # An integer needle is expected to yield 0 when
               # self.contains_bytes is true and to raise TypeError otherwise.
               if self.contains_bytes: 
        
                   self.checkequal(0, 'hello', 'count', 42) 
        
               else: 
        
                   self.checkraises(TypeError, 'hello', 'count', 42) 
        
               # For a variety of combinations, 
        
               #    verify that str.count() matches an equivalent function 
        
               #    replacing all occurrences and then differencing the string lengths 
        
               # '' is part of the charset, so joining 7 picks produces strings
               # of every length from 0 to 7; the set removes duplicates.
               charset = ['', 'a', 'b'] 
        
               digits = 7 
        
               base = len(charset) 
        
               teststrings = set() 
        
               for i in range(base ** digits): 
        
                   entry = [] 
        
                   # Read i as a `digits`-digit number in base `base`; each
                   # digit selects one charset element.
                   for j in range(digits): 
        
                       i, m = divmod(i, base) 
        
                       entry.append(charset[m]) 
        
                   teststrings.add(''.join(entry)) 
        
               # NOTE(review): fixtype presumably converts each str to the type
               # under test -- confirm against its definition.
               teststrings = [self.fixtype(ts) for ts in teststrings] 
        
               # Compare every haystack i against every needle j.
               for i in teststrings: 
        
                   n = len(i) 
        
                   for j in teststrings: 
        
                       r1 = i.count(j) 
        
                       if j: 
        
                           # Removing every occurrence of j shortens i by
                           # count * len(j); rem must therefore be 0.
                           r2, rem = divmod(n - len(i.replace(j, self.fixtype(''))), 
        
                                            len(j)) 
        
                       else: 
        
                           # Empty needle: the expected count is len(i) + 1.
                           r2, rem = len(i)+1, 0 
        
                       # The assert calls (and their message formatting) only
                       # run once a mismatch has already been detected.
                       if rem or r1 != r2: 
        
                           self.assertEqual(rem, 0, '%s != 0 for %s' % (rem, i)) 
        
                           self.assertEqual(r1, r2, '%s != %s for %s' % (r1, r2, i))

Question: what is the historical context on why PyUnicode_Count is not reused in unicode_count? They look pretty similar:

cpython/Objects/unicodeobject.c

Lines 8968 to 9040 in cbdeda8

    
           /* Count occurrences of substr inside the [start, end) slice of str.
            *
            * Returns the value computed by the kind-specific *_count helper;
            * 0 when substr's kind is larger than str's, or when the adjusted
            * slice is shorter than substr; and -1 when ensure_unicode() rejects
            * either argument or unicode_askind() returns NULL.
            */
           Py_ssize_t 
        
           PyUnicode_Count(PyObject *str, 
        
                           PyObject *substr, 
        
                           Py_ssize_t start, 
        
                           Py_ssize_t end) 
        
           { 
        
               Py_ssize_t result; 
        
               int kind1, kind2; 
        
               const void *buf1 = NULL, *buf2 = NULL; 
        
               Py_ssize_t len1, len2; 
        
               if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) 
        
                   return -1; 
        
               kind1 = PyUnicode_KIND(str); 
        
               kind2 = PyUnicode_KIND(substr); 
        
               /* NOTE(review): presumably a wider substr holds a character that
                  str's narrower storage cannot contain, so it cannot occur --
                  confirm. */
               if (kind1 < kind2) 
        
                   return 0; 
        
               len1 = PyUnicode_GET_LENGTH(str); 
        
               len2 = PyUnicode_GET_LENGTH(substr); 
        
               /* NOTE(review): ADJUST_INDICES is defined elsewhere; presumably
                  it normalizes start/end against len1 -- confirm. */
               ADJUST_INDICES(start, end, len1); 
        
               if (end - start < len2) 
        
                   return 0; 
        
               buf1 = PyUnicode_DATA(str); 
        
               buf2 = PyUnicode_DATA(substr); 
        
               /* When the kinds differ, buf2 is replaced by a buffer obtained
                  from unicode_askind(); that buffer is released with
                  PyMem_Free() on both exit paths below. */
               if (kind2 != kind1) { 
        
                   buf2 = unicode_askind(kind2, buf2, len2, kind1); 
        
                   if (!buf2) 
        
                       goto onError; 
        
               } 
        
               /* Dispatch on str's kind; every helper receives the slice
                  start pointer and the slice length (end - start). */
               switch (kind1) { 
        
               case PyUnicode_1BYTE_KIND: 
        
                   /* The ASCII-specific helper is used only when both strings
                      are ASCII. */
                   if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr)) 
        
                       result = asciilib_count( 
        
                           ((const Py_UCS1*)buf1) + start, end - start, 
        
                           buf2, len2, PY_SSIZE_T_MAX 
        
                           ); 
        
                   else 
        
                       result = ucs1lib_count( 
        
                           ((const Py_UCS1*)buf1) + start, end - start, 
        
                           buf2, len2, PY_SSIZE_T_MAX 
        
                           ); 
        
                   break; 
        
               case PyUnicode_2BYTE_KIND: 
        
                   result = ucs2lib_count( 
        
                       ((const Py_UCS2*)buf1) + start, end - start, 
        
                       buf2, len2, PY_SSIZE_T_MAX 
        
                       ); 
        
                   break; 
        
               case PyUnicode_4BYTE_KIND: 
        
                   result = ucs4lib_count( 
        
                       ((const Py_UCS4*)buf1) + start, end - start, 
        
                       buf2, len2, PY_SSIZE_T_MAX 
        
                       ); 
        
                   break; 
        
               default: 
        
                   Py_UNREACHABLE(); 
        
               } 
        
               /* buf2 differs from substr's own data exactly when the kinds
                  differ, i.e. exactly when there is a buffer to free. */
               assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr))); 
        
               if (kind2 != kind1) 
        
                   PyMem_Free((void *)buf2); 
        
               return result; 
        
             onError: 
        
               /* Only reached with kind2 != kind1 and buf2 == NULL, so the
                  PyMem_Free() call below receives NULL. */
               assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr))); 
        
               if (kind2 != kind1) 
        
                   PyMem_Free((void *)buf2); 
        
               return -1; 
        
           }

And

cpython/Objects/unicodeobject.c

Lines 10854 to 10916 in cbdeda8

    
           /* Count occurrences of a substring inside a slice of self.
            *
            * substring, start and end are extracted from args by
            * parse_args_finds_unicode(); start defaults to 0 and end to
            * PY_SSIZE_T_MAX.  Returns the count as a Python integer object, or
            * NULL when argument parsing fails or unicode_askind() returns NULL.
            *
            * The body mirrors PyUnicode_Count as quoted on this page, except
            * that the 1-byte case always calls ucs1lib_count (there is no
            * asciilib_count branch here).
            */
           static PyObject * 
        
           unicode_count(PyObject *self, PyObject *args) 
        
           { 
        
               PyObject *substring = NULL;   /* initialize to fix a compiler warning */ 
        
               Py_ssize_t start = 0; 
        
               Py_ssize_t end = PY_SSIZE_T_MAX; 
        
               PyObject *result; 
        
               int kind1, kind2; 
        
               const void *buf1, *buf2; 
        
               Py_ssize_t len1, len2, iresult; 
        
               if (!parse_args_finds_unicode("count", args, &substring, &start, &end)) 
        
                   return NULL; 
        
               kind1 = PyUnicode_KIND(self); 
        
               kind2 = PyUnicode_KIND(substring); 
        
               /* NOTE(review): presumably a wider substring cannot occur in a
                  narrower self -- confirm. */
               if (kind1 < kind2) 
        
                   return PyLong_FromLong(0); 
        
               len1 = PyUnicode_GET_LENGTH(self); 
        
               len2 = PyUnicode_GET_LENGTH(substring); 
        
               /* NOTE(review): ADJUST_INDICES is defined elsewhere; presumably
                  it normalizes start/end against len1 -- confirm. */
               ADJUST_INDICES(start, end, len1); 
        
               if (end - start < len2) 
        
                   return PyLong_FromLong(0); 
        
               buf1 = PyUnicode_DATA(self); 
        
               buf2 = PyUnicode_DATA(substring); 
        
               /* When the kinds differ, buf2 is replaced by a buffer obtained
                  from unicode_askind(); it is released with PyMem_Free() before
                  returning. */
               if (kind2 != kind1) { 
        
                   buf2 = unicode_askind(kind2, buf2, len2, kind1); 
        
                   if (!buf2) 
        
                       return NULL; 
        
               } 
        
               /* Dispatch on self's kind; every helper receives the slice
                  start pointer and the slice length (end - start). */
               switch (kind1) { 
        
               case PyUnicode_1BYTE_KIND: 
        
                   iresult = ucs1lib_count( 
        
                       ((const Py_UCS1*)buf1) + start, end - start, 
        
                       buf2, len2, PY_SSIZE_T_MAX 
        
                       ); 
        
                   break; 
        
               case PyUnicode_2BYTE_KIND: 
        
                   iresult = ucs2lib_count( 
        
                       ((const Py_UCS2*)buf1) + start, end - start, 
        
                       buf2, len2, PY_SSIZE_T_MAX 
        
                       ); 
        
                   break; 
        
               case PyUnicode_4BYTE_KIND: 
        
                   iresult = ucs4lib_count( 
        
                       ((const Py_UCS4*)buf1) + start, end - start, 
        
                       buf2, len2, PY_SSIZE_T_MAX 
        
                       ); 
        
                   break; 
        
               default: 
        
                   Py_UNREACHABLE(); 
        
               } 
        
               /* The result object is created before the temporary buffer is
                  freed; result is returned as-is, whatever PyLong_FromSsize_t
                  produced. */
               result = PyLong_FromSsize_t(iresult); 
        
               /* buf2 equals substring's own data exactly when the kinds match,
                  i.e. exactly when there is nothing to free. */
               assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring))); 
        
               if (kind2 != kind1) 
        
                   PyMem_Free((void *)buf2); 
        
               return result; 
        
           }

Issue: Metabug: Improving C-level coverage #94808

mdboom · 2022-09-22T19:45:02Z

Question: what is the historical context on why PyUnicode_Count is not reused in unicode_count?

It looks like these both date to the same commit d57fd91 from 2000-03-10. They were pretty different then, but are almost the same now. I see some benefit in making unicode_count call PyUnicode_Count to make sure they remain consistent, but I could also imagine someone viewing this as "churn for churn's sake".

Note there is also anylib_count which is a subset of unicode_count and PyUnicode_Count.

There are a few other instances of this kind of thing I've come across looking at coverage -- it would be good to get a core developer's take on whether merging internal and external functions where they are clearly wrappable like this would be welcome.

encukou · 2022-10-06T15:24:07Z

Apparently unicode_count missed an optimization in 2011 (the `asciilib_count` branch that PyUnicode_Count takes when both strings are ASCII); otherwise they're equivalent (except for argument parsing and converting the return value). Merging them could add the optimization to unicode_count.
If you want to work on that, note that there's also anylib_count that duplicates the main switch.

sobolevn · 2022-10-06T17:04:14Z

Thanks! Yes, I would like to do that! I will open a new issue for it.

* main: pythonGH-88050: fix race in closing subprocess pipe in asyncio (python#97951) pythongh-93738: Disallow pre-v3 syntax in the C domain (python#97962) pythongh-95986: Fix the example using match keyword (python#95989) pythongh-97897: Prevent os.mkfifo and os.mknod segfaults with macOS 13 SDK (pythonGH-97944) pythongh-94808: Cover `PyUnicode_Count` in CAPI (python#96929) pythongh-94808: Cover `PyObject_PyBytes` case with custom `__bytes__` method (python#96610) pythongh-95691: Doc BufferedWriter and BufferedReader (python#95703) pythonGH-88968: Add notes about socket ownership transfers (python#97936) pythongh-96865: [Enum] fix Flag to use CONFORM boundary (pythonGH-97528)

* main: (53 commits) pythongh-94808: Coverage: Test that maximum indentation level is handled (python#95926) pythonGH-88050: fix race in closing subprocess pipe in asyncio (python#97951) pythongh-93738: Disallow pre-v3 syntax in the C domain (python#97962) pythongh-95986: Fix the example using match keyword (python#95989) pythongh-97897: Prevent os.mkfifo and os.mknod segfaults with macOS 13 SDK (pythonGH-97944) pythongh-94808: Cover `PyUnicode_Count` in CAPI (python#96929) pythongh-94808: Cover `PyObject_PyBytes` case with custom `__bytes__` method (python#96610) pythongh-95691: Doc BufferedWriter and BufferedReader (python#95703) pythonGH-88968: Add notes about socket ownership transfers (python#97936) pythongh-96865: [Enum] fix Flag to use CONFORM boundary (pythonGH-97528) pythongh-65961: Raise `DeprecationWarning` when `__package__` differs from `__spec__.parent` (python#97879) docs(typing): add "see PEP 675" to LiteralString (python#97926) pythongh-97850: Remove all known instances of module_repr() (python#97876) I changed my surname early this year (python#96671) pythongh-93738: Documentation C syntax (:c:type:<C type> -> :c:expr:<C type>) (python#97768) pythongh-91539: improve performance of get_proxies_environment (python#91566) build(deps): bump actions/stale from 5 to 6 (python#97701) pythonGH-95172 Make the same version `versionadded` oneline (python#95172) pythongh-88050: Fix asyncio subprocess to kill process cleanly when process is blocked (python#32073) pythongh-93738: Documentation C syntax (Function glob patterns -> literal markup) (python#97774) ...

pythongh-94808: Cover PyUnicode_Count in CAPI

22aee9a

bedevere-bot added the awaiting review label Sep 19, 2022

sobolevn added tests Tests in the Lib/test dir skip news labels Sep 19, 2022

sobolevn requested a review from encukou September 19, 2022 09:49

encukou approved these changes Oct 6, 2022

View reviewed changes

bedevere-bot added awaiting merge and removed awaiting review labels Oct 6, 2022

encukou merged commit e63d7da into python:main Oct 6, 2022

bedevere-bot removed the awaiting merge label Oct 6, 2022

sobolevn mentioned this pull request Oct 6, 2022

Unify PyUnicode_Count and unicode_count #97982

Closed

mpage pushed a commit to mpage/cpython that referenced this pull request Oct 11, 2022

pythongh-94808: Cover PyUnicode_Count in CAPI (python#96929)

d458682

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

gh-94808: Cover `PyUnicode_Count` in CAPI #96929

gh-94808: Cover `PyUnicode_Count` in CAPI #96929

sobolevn commented Sep 19, 2022 •

edited by bedevere-bot

Loading

mdboom commented Sep 22, 2022

encukou commented Oct 6, 2022

sobolevn commented Oct 6, 2022

	def test_count(self):
	self.checkequal(3, 'aaa', 'count', 'a')
	self.checkequal(0, 'aaa', 'count', 'b')
	self.checkequal(3, 'aaa', 'count', 'a')
	self.checkequal(0, 'aaa', 'count', 'b')
	self.checkequal(3, 'aaa', 'count', 'a')
	self.checkequal(0, 'aaa', 'count', 'b')
	self.checkequal(0, 'aaa', 'count', 'b')
	self.checkequal(2, 'aaa', 'count', 'a', 1)
	self.checkequal(0, 'aaa', 'count', 'a', 10)
	self.checkequal(1, 'aaa', 'count', 'a', -1)
	self.checkequal(3, 'aaa', 'count', 'a', -10)
	self.checkequal(1, 'aaa', 'count', 'a', 0, 1)
	self.checkequal(3, 'aaa', 'count', 'a', 0, 10)
	self.checkequal(2, 'aaa', 'count', 'a', 0, -1)
	self.checkequal(0, 'aaa', 'count', 'a', 0, -10)
	self.checkequal(3, 'aaa', 'count', '', 1)
	self.checkequal(1, 'aaa', 'count', '', 3)
	self.checkequal(0, 'aaa', 'count', '', 10)
	self.checkequal(2, 'aaa', 'count', '', -1)
	self.checkequal(4, 'aaa', 'count', '', -10)

	self.checkequal(1, '', 'count', '')
	self.checkequal(0, '', 'count', '', 1, 1)
	self.checkequal(0, '', 'count', '', sys.maxsize, 0)

	self.checkequal(0, '', 'count', 'xx')
	self.checkequal(0, '', 'count', 'xx', 1, 1)
	self.checkequal(0, '', 'count', 'xx', sys.maxsize, 0)

	self.checkraises(TypeError, 'hello', 'count')

	if self.contains_bytes:
	self.checkequal(0, 'hello', 'count', 42)
	else:
	self.checkraises(TypeError, 'hello', 'count', 42)

	# For a variety of combinations,
	# verify that str.count() matches an equivalent function
	# replacing all occurrences and then differencing the string lengths
	charset = ['', 'a', 'b']
	digits = 7
	base = len(charset)
	teststrings = set()
	for i in range(base ** digits):
	entry = []
	for j in range(digits):
	i, m = divmod(i, base)
	entry.append(charset[m])
	teststrings.add(''.join(entry))
	teststrings = [self.fixtype(ts) for ts in teststrings]
	for i in teststrings:
	n = len(i)
	for j in teststrings:
	r1 = i.count(j)
	if j:
	r2, rem = divmod(n - len(i.replace(j, self.fixtype(''))),
	len(j))
	else:
	r2, rem = len(i)+1, 0
	if rem or r1 != r2:
	self.assertEqual(rem, 0, '%s != 0 for %s' % (rem, i))
	self.assertEqual(r1, r2, '%s != %s for %s' % (r1, r2, i))

	/* Count occurrences of substr inside the [start, end) slice of str.
	 *
	 * Returns the value computed by the kind-specific *_count helper;
	 * 0 when substr's kind is larger than str's, or when the adjusted
	 * slice is shorter than substr; and -1 when ensure_unicode() rejects
	 * either argument or unicode_askind() returns NULL.
	 */
	Py_ssize_t
	PyUnicode_Count(PyObject *str,
	                PyObject *substr,
	                Py_ssize_t start,
	                Py_ssize_t end)
	{
	    Py_ssize_t result;
	    int kind1, kind2;
	    const void *buf1 = NULL, *buf2 = NULL;
	    Py_ssize_t len1, len2;

	    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
	        return -1;

	    kind1 = PyUnicode_KIND(str);
	    kind2 = PyUnicode_KIND(substr);
	    /* NOTE(review): presumably a wider substr cannot occur in a
	       narrower str -- confirm. */
	    if (kind1 < kind2)
	        return 0;

	    len1 = PyUnicode_GET_LENGTH(str);
	    len2 = PyUnicode_GET_LENGTH(substr);
	    /* NOTE(review): ADJUST_INDICES is defined elsewhere; presumably it
	       normalizes start/end against len1 -- confirm. */
	    ADJUST_INDICES(start, end, len1);
	    if (end - start < len2)
	        return 0;

	    buf1 = PyUnicode_DATA(str);
	    buf2 = PyUnicode_DATA(substr);
	    /* When the kinds differ, buf2 is replaced by a buffer obtained from
	       unicode_askind(); it is released with PyMem_Free() on both exit
	       paths below. */
	    if (kind2 != kind1) {
	        buf2 = unicode_askind(kind2, buf2, len2, kind1);
	        if (!buf2)
	            goto onError;
	    }

	    /* Dispatch on str's kind; every helper receives the slice start
	       pointer and the slice length (end - start). */
	    switch (kind1) {
	    case PyUnicode_1BYTE_KIND:
	        /* The ASCII-specific helper is used only when both strings are
	           ASCII. */
	        if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
	            result = asciilib_count(
	                ((const Py_UCS1*)buf1) + start, end - start,
	                buf2, len2, PY_SSIZE_T_MAX
	                );
	        else
	            result = ucs1lib_count(
	                ((const Py_UCS1*)buf1) + start, end - start,
	                buf2, len2, PY_SSIZE_T_MAX
	                );
	        break;
	    case PyUnicode_2BYTE_KIND:
	        result = ucs2lib_count(
	            ((const Py_UCS2*)buf1) + start, end - start,
	            buf2, len2, PY_SSIZE_T_MAX
	            );
	        break;
	    case PyUnicode_4BYTE_KIND:
	        result = ucs4lib_count(
	            ((const Py_UCS4*)buf1) + start, end - start,
	            buf2, len2, PY_SSIZE_T_MAX
	            );
	        break;
	    default:
	        Py_UNREACHABLE();
	    }

	    /* buf2 differs from substr's own data exactly when the kinds differ,
	       i.e. exactly when there is a buffer to free. */
	    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
	    if (kind2 != kind1)
	        PyMem_Free((void *)buf2);

	    return result;
	  onError:
	    /* Only reached with kind2 != kind1 and buf2 == NULL, so the
	       PyMem_Free() call below receives NULL. */
	    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
	    if (kind2 != kind1)
	        PyMem_Free((void *)buf2);
	    return -1;
	}

	/* Count occurrences of a substring inside a slice of self.
	 *
	 * substring, start and end are extracted from args by
	 * parse_args_finds_unicode(); start defaults to 0 and end to
	 * PY_SSIZE_T_MAX.  Returns the count as a Python integer object, or
	 * NULL when argument parsing fails or unicode_askind() returns NULL.
	 */
	static PyObject *
	unicode_count(PyObject *self, PyObject *args)
	{
	    PyObject *substring = NULL;   /* initialize to fix a compiler warning */
	    Py_ssize_t start = 0;
	    Py_ssize_t end = PY_SSIZE_T_MAX;
	    PyObject *result;
	    int kind1, kind2;
	    const void *buf1, *buf2;
	    Py_ssize_t len1, len2, iresult;

	    if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
	        return NULL;

	    kind1 = PyUnicode_KIND(self);
	    kind2 = PyUnicode_KIND(substring);
	    /* NOTE(review): presumably a wider substring cannot occur in a
	       narrower self -- confirm. */
	    if (kind1 < kind2)
	        return PyLong_FromLong(0);

	    len1 = PyUnicode_GET_LENGTH(self);
	    len2 = PyUnicode_GET_LENGTH(substring);
	    /* NOTE(review): ADJUST_INDICES is defined elsewhere; presumably it
	       normalizes start/end against len1 -- confirm. */
	    ADJUST_INDICES(start, end, len1);
	    if (end - start < len2)
	        return PyLong_FromLong(0);

	    buf1 = PyUnicode_DATA(self);
	    buf2 = PyUnicode_DATA(substring);
	    /* When the kinds differ, buf2 is replaced by a buffer obtained from
	       unicode_askind(); it is released with PyMem_Free() before
	       returning. */
	    if (kind2 != kind1) {
	        buf2 = unicode_askind(kind2, buf2, len2, kind1);
	        if (!buf2)
	            return NULL;
	    }
	    /* Dispatch on self's kind; every helper receives the slice start
	       pointer and the slice length (end - start). */
	    switch (kind1) {
	    case PyUnicode_1BYTE_KIND:
	        iresult = ucs1lib_count(
	            ((const Py_UCS1*)buf1) + start, end - start,
	            buf2, len2, PY_SSIZE_T_MAX
	            );
	        break;
	    case PyUnicode_2BYTE_KIND:
	        iresult = ucs2lib_count(
	            ((const Py_UCS2*)buf1) + start, end - start,
	            buf2, len2, PY_SSIZE_T_MAX
	            );
	        break;
	    case PyUnicode_4BYTE_KIND:
	        iresult = ucs4lib_count(
	            ((const Py_UCS4*)buf1) + start, end - start,
	            buf2, len2, PY_SSIZE_T_MAX
	            );
	        break;
	    default:
	        Py_UNREACHABLE();
	    }

	    /* The result object is created before the temporary buffer is
	       freed; it is returned as-is, whatever PyLong_FromSsize_t
	       produced. */
	    result = PyLong_FromSsize_t(iresult);

	    /* buf2 equals substring's own data exactly when the kinds match,
	       i.e. exactly when there is nothing to free. */
	    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
	    if (kind2 != kind1)
	        PyMem_Free((void *)buf2);

	    return result;
	}

gh-94808: Cover PyUnicode_Count in CAPI #96929

gh-94808: Cover PyUnicode_Count in CAPI #96929

Conversation

sobolevn commented Sep 19, 2022 • edited by bedevere-bot Loading

mdboom commented Sep 22, 2022

encukou commented Oct 6, 2022

sobolevn commented Oct 6, 2022

gh-94808: Cover `PyUnicode_Count` in CAPI #96929

gh-94808: Cover `PyUnicode_Count` in CAPI #96929

sobolevn commented Sep 19, 2022 •

edited by bedevere-bot

Loading