From 8ea7744851048b032404f2400a7c7a070479e152 Mon Sep 17 00:00:00 2001
From: chris-b1 <cbartak@gmail.com>
Date: Wed, 5 Dec 2018 14:21:23 -0600
Subject: [PATCH] PERF: ascii c string functions (#23981)

---
 LICENSES/MUSL_LICENSE               | 132 ++++++++++++++++++++++++++++
 doc/source/whatsnew/v0.24.0.rst     |   1 +
 pandas/_libs/src/headers/portable.h |   6 ++
 pandas/_libs/src/parse_helper.h     |  14 +--
 pandas/_libs/src/parser/tokenizer.c |  56 ++++++------
 5 files changed, 175 insertions(+), 34 deletions(-)
 create mode 100644 LICENSES/MUSL_LICENSE

diff --git a/LICENSES/MUSL_LICENSE b/LICENSES/MUSL_LICENSE
new file mode 100644
index 0000000000000..a8833d4bc4744
--- /dev/null
+++ b/LICENSES/MUSL_LICENSE
@@ -0,0 +1,132 @@
+musl as a whole is licensed under the following standard MIT license:
+
+----------------------------------------------------------------------
+Copyright © 2005-2014 Rich Felker, et al.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+----------------------------------------------------------------------
+
+Authors/contributors include:
+
+Anthony G. Basile
+Arvid Picciani
+Bobby Bingham
+Boris Brezillon
+Brent Cook
+Chris Spiegel
+Clément Vasseur
+Emil Renner Berthing
+Hiltjo Posthuma
+Isaac Dunham
+Jens Gustedt
+Jeremy Huntwork
+John Spencer
+Justin Cormack
+Luca Barbato
+Luka Perkov
+M Farkas-Dyck (Strake)
+Michael Forney
+Nicholas J. Kain
+orc
+Pascal Cuoq
+Pierre Carrier
+Rich Felker
+Richard Pennington
+sin
+Solar Designer
+Stefan Kristiansson
+Szabolcs Nagy
+Timo Teräs
+Valentin Ochs
+William Haddon
+
+Portions of this software are derived from third-party works licensed
+under terms compatible with the above MIT license:
+
+The TRE regular expression implementation (src/regex/reg* and
+src/regex/tre*) is Copyright © 2001-2008 Ville Laurikari and licensed
+under a 2-clause BSD license (license text in the source files). The
+included version has been heavily modified by Rich Felker in 2012, in
+the interests of size, simplicity, and namespace cleanliness.
+
+Much of the math library code (src/math/* and src/complex/*) is
+Copyright © 1993,2004 Sun Microsystems or
+Copyright © 2003-2011 David Schultz or
+Copyright © 2003-2009 Steven G. Kargl or
+Copyright © 2003-2009 Bruce D. Evans or
+Copyright © 2008 Stephen L. Moshier
+and labelled as such in comments in the individual source files. All
+have been licensed under extremely permissive terms.
+
+The ARM memcpy code (src/string/armel/memcpy.s) is Copyright © 2008
+The Android Open Source Project and is licensed under a two-clause BSD
+license. It was taken from Bionic libc, used on Android.
+
+The implementation of DES for crypt (src/misc/crypt_des.c) is
+Copyright © 1994 David Burren. It is licensed under a BSD license.
+
+The implementation of blowfish crypt (src/misc/crypt_blowfish.c) was
+originally written by Solar Designer and placed into the public
+domain. The code also comes with a fallback permissive license for use
+in jurisdictions that may not recognize the public domain.
+
+The smoothsort implementation (src/stdlib/qsort.c) is Copyright © 2011
+Valentin Ochs and is licensed under an MIT-style license.
+
+The BSD PRNG implementation (src/prng/random.c) and XSI search API
+(src/search/*.c) functions are Copyright © 2011 Szabolcs Nagy and
+licensed under following terms: "Permission to use, copy, modify,
+and/or distribute this code for any purpose with or without fee is
+hereby granted. There is no warranty."
+
+The x86_64 port was written by Nicholas J. Kain. Several files (crt)
+were released into the public domain; others are licensed under the
+standard MIT license terms at the top of this file. See individual
+files for their copyright status.
+
+The mips and microblaze ports were originally written by Richard
+Pennington for use in the ellcc project. The original code was adapted
+by Rich Felker for build system and code conventions during upstream
+integration. It is licensed under the standard MIT terms.
+
+The powerpc port was also originally written by Richard Pennington,
+and later supplemented and integrated by John Spencer. It is licensed
+under the standard MIT terms.
+
+All other files which have no copyright comments are original works
+produced specifically for use as part of this library, written either
+by Rich Felker, the main author of the library, or by one or more
+contibutors listed above. Details on authorship of individual files
+can be found in the git version control history of the project. The
+omission of copyright and license comments in each file is in the
+interest of source tree size.
+
+All public header files (include/* and arch/*/bits/*) should be
+treated as Public Domain as they intentionally contain no content
+which can be covered by copyright. Some source modules may fall in
+this category as well. If you believe that a file is so trivial that
+it should be in the Public Domain, please contact the authors and
+request an explicit statement releasing it from copyright.
+
+The following files are trivial, believed not to be copyrightable in
+the first place, and hereby explicitly released to the Public Domain:
+
+All public headers: include/*, arch/*/bits/*
+Startup files: crt/*
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
index 090127f50c6c2..14bb7e5517370 100644
--- a/doc/source/whatsnew/v0.24.0.rst
+++ b/doc/source/whatsnew/v0.24.0.rst
@@ -1242,6 +1242,7 @@ Performance Improvements
 - Improved performance of :func:`pd.concat` for `Series` objects (:issue:`23404`)
 - Improved performance of :meth:`DatetimeIndex.normalize` and :meth:`Timestamp.normalize` for timezone naive or UTC datetimes (:issue:`23634`)
 - Improved performance of :meth:`DatetimeIndex.tz_localize` and various ``DatetimeIndex`` attributes with dateutil UTC timezone (:issue:`23772`)
+- Fixed a performance regression in :func:`pd.read_csv` on Windows with Python 3.7 (:issue:`23516`)
 - Improved performance of :class:`Categorical` constructor for `Series` objects (:issue:`23814`)
 
 .. _whatsnew_0240.docs:
diff --git a/pandas/_libs/src/headers/portable.h b/pandas/_libs/src/headers/portable.h
index b9868276ef6e6..9ac4ebc306baa 100644
--- a/pandas/_libs/src/headers/portable.h
+++ b/pandas/_libs/src/headers/portable.h
@@ -5,4 +5,21 @@
 #define strcasecmp( s1, s2 ) _stricmp( s1, s2 )
 #endif
 
+// GH-23516 - works around locale perf issues
+// from MUSL libc, MIT Licensed - see LICENSES
+//
+// ASCII-only stand-ins for isdigit/isspace/toupper/tolower from <ctype.h>.
+// They are pure integer arithmetic and never consult the C locale. Values
+// outside the tested ASCII ranges (including negative `char` values) are
+// not classified as digits/whitespace and are returned unchanged by the
+// case converters.
+//
+// These are macros: isspace_ascii, toupper_ascii and tolower_ascii evaluate
+// their argument more than once, so do not pass expressions with side
+// effects (e.g. `*p++`).
+#define isdigit_ascii(c) (((unsigned)(c) - '0') < 10u)
+#define isspace_ascii(c) (((c) == ' ') || (((unsigned)(c) - '\t') < 5u))
+#define toupper_ascii(c) ((((unsigned)(c) - 'a') < 26u) ? ((c) & 0x5f) : (c))
+#define tolower_ascii(c) ((((unsigned)(c) - 'A') < 26u) ? ((c) | 0x20) : (c))
+
 #endif
diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h
index 4f9f825b15ffe..b71131bee7008 100644
--- a/pandas/_libs/src/parse_helper.h
+++ b/pandas/_libs/src/parse_helper.h
@@ -138,11 +138,11 @@ int floatify(PyObject *str, double *result, int *maybe_int) {
 //
 
 PANDAS_INLINE void lowercase(char *p) {
-    for (; *p; ++p) *p = tolower(*p);
+    for (; *p; ++p) *p = tolower_ascii(*p);  // GH-23516: locale workaround
 }
 
 PANDAS_INLINE void uppercase(char *p) {
-    for (; *p; ++p) *p = toupper(*p);
+    for (; *p; ++p) *p = toupper_ascii(*p);  // GH-23516: locale workaround
 }
 
 static double xstrtod(const char *str, char **endptr, char decimal, char sci,
@@ -177,7 +177,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, char sci,
     num_decimals = 0;
 
     // Process string of digits
-    while (isdigit(*p)) {
+    while (isdigit_ascii(*p)) {
         number = number * 10. + (*p - '0');
         p++;
         num_digits++;
@@ -188,7 +188,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, char sci,
         *maybe_int = 0;
         p++;
 
-        while (isdigit(*p)) {
+        while (isdigit_ascii(*p)) {
             number = number * 10. + (*p - '0');
             p++;
             num_digits++;
@@ -207,7 +207,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, char sci,
     if (negative) number = -number;
 
     // Process an exponent string
-    if (toupper(*p) == toupper(sci)) {
+    if (toupper_ascii(*p) == toupper_ascii(sci)) {
         *maybe_int = 0;
 
         // Handle optional sign
@@ -222,7 +222,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, char sci,
         // Process string of digits
         num_digits = 0;
         n = 0;
-        while (isdigit(*p)) {
+        while (isdigit_ascii(*p)) {
             n = n * 10 + (*p - '0');
             num_digits++;
             p++;
@@ -263,7 +263,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, char sci,
 
     if (skip_trailing) {
         // Skip trailing whitespace
-        while (isspace(*p)) p++;
+        while (isspace_ascii(*p)) p++;
     }
 
     if (endptr) *endptr = p;
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index e46e1e85f1c81..3a4058f37efc7 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -23,6 +23,8 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
 #include <float.h>
 #include <math.h>
 
+#include "../headers/portable.h"
+
 static void *safe_realloc(void *buffer, size_t size) {
     void *result;
     // OSX is weird.
@@ -1411,7 +1413,7 @@ int tokenize_all_rows(parser_t *self) {
 }
 
 PANDAS_INLINE void uppercase(char *p) {
-    for (; *p; ++p) *p = toupper(*p);
+    for (; *p; ++p) *p = toupper_ascii(*p);  // GH-23516: locale workaround
 }
 
 int PANDAS_INLINE to_longlong(char *item, long long *p_value) {
@@ -1424,7 +1426,7 @@ int PANDAS_INLINE to_longlong(char *item, long long *p_value) {
     *p_value = strtoll(item, &p_end, 10);
 
     // Allow trailing spaces.
-    while (isspace(*p_end)) ++p_end;
+    while (isspace_ascii(*p_end)) ++p_end;
 
     return (errno == 0) && (!*p_end);
 }
@@ -1541,7 +1543,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci,
     errno = 0;
 
     // Skip leading whitespace.
-    while (isspace(*p)) p++;
+    while (isspace_ascii(*p)) p++;
 
     // Handle optional sign.
     negative = 0;
@@ -1558,7 +1560,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci,
     num_decimals = 0;
 
     // Process string of digits.
-    while (isdigit(*p)) {
+    while (isdigit_ascii(*p)) {
         number = number * 10. + (*p - '0');
         p++;
         num_digits++;
@@ -1570,7 +1572,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci,
     if (*p == decimal) {
         p++;
 
-        while (isdigit(*p)) {
+        while (isdigit_ascii(*p)) {
             number = number * 10. + (*p - '0');
             p++;
             num_digits++;
@@ -1589,7 +1591,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci,
     if (negative) number = -number;
 
     // Process an exponent string.
-    if (toupper(*p) == toupper(sci)) {
+    if (toupper_ascii(*p) == toupper_ascii(sci)) {
         // Handle optional sign.
         negative = 0;
         switch (*++p) {
@@ -1602,7 +1604,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci,
         // Process string of digits.
         num_digits = 0;
         n = 0;
-        while (isdigit(*p)) {
+        while (isdigit_ascii(*p)) {
             n = n * 10 + (*p - '0');
             num_digits++;
             p++;
@@ -1643,7 +1645,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci,
 
     if (skip_trailing) {
         // Skip trailing whitespace.
-        while (isspace(*p)) p++;
+        while (isspace_ascii(*p)) p++;
     }
 
     if (endptr) *endptr = p;
@@ -1697,7 +1699,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
     errno = 0;
 
     // Skip leading whitespace.
-    while (isspace(*p)) p++;
+    while (isspace_ascii(*p)) p++;
 
     // Handle optional sign.
     negative = 0;
@@ -1714,7 +1716,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
     num_decimals = 0;
 
     // Process string of digits.
-    while (isdigit(*p)) {
+    while (isdigit_ascii(*p)) {
         if (num_digits < max_digits) {
             number = number * 10. + (*p - '0');
             num_digits++;
@@ -1730,7 +1732,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
     if (*p == decimal) {
         p++;
 
-        while (num_digits < max_digits && isdigit(*p)) {
+        while (num_digits < max_digits && isdigit_ascii(*p)) {
             number = number * 10. + (*p - '0');
             p++;
             num_digits++;
@@ -1738,7 +1740,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
         }
 
         if (num_digits >= max_digits)  // Consume extra decimal digits.
-            while (isdigit(*p)) ++p;
+            while (isdigit_ascii(*p)) ++p;
 
         exponent -= num_decimals;
     }
@@ -1752,7 +1754,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
     if (negative) number = -number;
 
     // Process an exponent string.
-    if (toupper(*p) == toupper(sci)) {
+    if (toupper_ascii(*p) == toupper_ascii(sci)) {
         // Handle optional sign
         negative = 0;
         switch (*++p) {
@@ -1765,7 +1767,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
         // Process string of digits.
         num_digits = 0;
         n = 0;
-        while (isdigit(*p)) {
+        while (isdigit_ascii(*p)) {
             n = n * 10 + (*p - '0');
             num_digits++;
             p++;
@@ -1798,7 +1800,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
 
     if (skip_trailing) {
         // Skip trailing whitespace.
-        while (isspace(*p)) p++;
+        while (isspace_ascii(*p)) p++;
     }
 
     if (endptr) *endptr = p;
@@ -1833,7 +1835,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
     int d;
 
     // Skip leading spaces.
-    while (isspace(*p)) {
+    while (isspace_ascii(*p)) {
         ++p;
     }
 
@@ -1846,7 +1848,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
     }
 
     // Check that there is a first digit.
-    if (!isdigit(*p)) {
+    if (!isdigit_ascii(*p)) {
         // Error...
         *error = ERROR_NO_DIGITS;
         return 0;
@@ -1865,7 +1867,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                 if (d == tsep) {
                     d = *++p;
                     continue;
-                } else if (!isdigit(d)) {
+                } else if (!isdigit_ascii(d)) {
                     break;
                 }
                 if ((number > pre_min) ||
@@ -1878,7 +1880,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                 }
             }
         } else {
-            while (isdigit(d)) {
+            while (isdigit_ascii(d)) {
                 if ((number > pre_min) ||
                     ((number == pre_min) && (d - '0' <= dig_pre_min))) {
                     number = number * 10 - (d - '0');
@@ -1902,7 +1904,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                 if (d == tsep) {
                     d = *++p;
                     continue;
-                } else if (!isdigit(d)) {
+                } else if (!isdigit_ascii(d)) {
                     break;
                 }
                 if ((number < pre_max) ||
@@ -1916,7 +1918,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                 }
             }
         } else {
-            while (isdigit(d)) {
+            while (isdigit_ascii(d)) {
                 if ((number < pre_max) ||
                     ((number == pre_max) && (d - '0' <= dig_pre_max))) {
                     number = number * 10 + (d - '0');
@@ -1931,7 +1933,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
     }
 
     // Skip trailing spaces.
-    while (isspace(*p)) {
+    while (isspace_ascii(*p)) {
         ++p;
     }
 
@@ -1954,7 +1956,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
     int d;
 
     // Skip leading spaces.
-    while (isspace(*p)) {
+    while (isspace_ascii(*p)) {
         ++p;
     }
 
@@ -1968,7 +1970,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
     }
 
     // Check that there is a first digit.
-    if (!isdigit(*p)) {
+    if (!isdigit_ascii(*p)) {
         // Error...
         *error = ERROR_NO_DIGITS;
         return 0;
@@ -1984,7 +1986,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
             if (d == tsep) {
                 d = *++p;
                 continue;
-            } else if (!isdigit(d)) {
+            } else if (!isdigit_ascii(d)) {
                 break;
             }
             if ((number < pre_max) ||
@@ -1998,7 +2000,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
             }
         }
     } else {
-        while (isdigit(d)) {
+        while (isdigit_ascii(d)) {
             if ((number < pre_max) ||
                 ((number == pre_max) && (d - '0' <= dig_pre_max))) {
                 number = number * 10 + (d - '0');
@@ -2012,7 +2014,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
     }
 
     // Skip trailing spaces.
-    while (isspace(*p)) {
+    while (isspace_ascii(*p)) {
         ++p;
     }