From 8ea7744851048b032404f2400a7c7a070479e152 Mon Sep 17 00:00:00 2001 From: chris-b1 Date: Wed, 5 Dec 2018 14:21:23 -0600 Subject: [PATCH] PERF: ascii c string functions (#23981) --- LICENSES/MUSL_LICENSE | 132 ++++++++++++++++++++++++++++ doc/source/whatsnew/v0.24.0.rst | 1 + pandas/_libs/src/headers/portable.h | 6 ++ pandas/_libs/src/parse_helper.h | 14 +-- pandas/_libs/src/parser/tokenizer.c | 56 ++++++------ 5 files changed, 175 insertions(+), 34 deletions(-) create mode 100644 LICENSES/MUSL_LICENSE diff --git a/LICENSES/MUSL_LICENSE b/LICENSES/MUSL_LICENSE new file mode 100644 index 0000000000000..a8833d4bc4744 --- /dev/null +++ b/LICENSES/MUSL_LICENSE @@ -0,0 +1,132 @@ +musl as a whole is licensed under the following standard MIT license: + +---------------------------------------------------------------------- +Copyright © 2005-2014 Rich Felker, et al. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+---------------------------------------------------------------------- + +Authors/contributors include: + +Anthony G. Basile +Arvid Picciani +Bobby Bingham +Boris Brezillon +Brent Cook +Chris Spiegel +Clément Vasseur +Emil Renner Berthing +Hiltjo Posthuma +Isaac Dunham +Jens Gustedt +Jeremy Huntwork +John Spencer +Justin Cormack +Luca Barbato +Luka Perkov +M Farkas-Dyck (Strake) +Michael Forney +Nicholas J. Kain +orc +Pascal Cuoq +Pierre Carrier +Rich Felker +Richard Pennington +sin +Solar Designer +Stefan Kristiansson +Szabolcs Nagy +Timo Teräs +Valentin Ochs +William Haddon + +Portions of this software are derived from third-party works licensed +under terms compatible with the above MIT license: + +The TRE regular expression implementation (src/regex/reg* and +src/regex/tre*) is Copyright © 2001-2008 Ville Laurikari and licensed +under a 2-clause BSD license (license text in the source files). The +included version has been heavily modified by Rich Felker in 2012, in +the interests of size, simplicity, and namespace cleanliness. + +Much of the math library code (src/math/* and src/complex/*) is +Copyright © 1993,2004 Sun Microsystems or +Copyright © 2003-2011 David Schultz or +Copyright © 2003-2009 Steven G. Kargl or +Copyright © 2003-2009 Bruce D. Evans or +Copyright © 2008 Stephen L. Moshier +and labelled as such in comments in the individual source files. All +have been licensed under extremely permissive terms. + +The ARM memcpy code (src/string/armel/memcpy.s) is Copyright © 2008 +The Android Open Source Project and is licensed under a two-clause BSD +license. It was taken from Bionic libc, used on Android. + +The implementation of DES for crypt (src/misc/crypt_des.c) is +Copyright © 1994 David Burren. It is licensed under a BSD license. + +The implementation of blowfish crypt (src/misc/crypt_blowfish.c) was +originally written by Solar Designer and placed into the public +domain. 
The code also comes with a fallback permissive license for use +in jurisdictions that may not recognize the public domain. + +The smoothsort implementation (src/stdlib/qsort.c) is Copyright © 2011 +Valentin Ochs and is licensed under an MIT-style license. + +The BSD PRNG implementation (src/prng/random.c) and XSI search API +(src/search/*.c) functions are Copyright © 2011 Szabolcs Nagy and +licensed under following terms: "Permission to use, copy, modify, +and/or distribute this code for any purpose with or without fee is +hereby granted. There is no warranty." + +The x86_64 port was written by Nicholas J. Kain. Several files (crt) +were released into the public domain; others are licensed under the +standard MIT license terms at the top of this file. See individual +files for their copyright status. + +The mips and microblaze ports were originally written by Richard +Pennington for use in the ellcc project. The original code was adapted +by Rich Felker for build system and code conventions during upstream +integration. It is licensed under the standard MIT terms. + +The powerpc port was also originally written by Richard Pennington, +and later supplemented and integrated by John Spencer. It is licensed +under the standard MIT terms. + +All other files which have no copyright comments are original works +produced specifically for use as part of this library, written either +by Rich Felker, the main author of the library, or by one or more +contibutors listed above. Details on authorship of individual files +can be found in the git version control history of the project. The +omission of copyright and license comments in each file is in the +interest of source tree size. + +All public header files (include/* and arch/*/bits/*) should be +treated as Public Domain as they intentionally contain no content +which can be covered by copyright. Some source modules may fall in +this category as well. 
// GH-23516 - works around locale perf issues
// from MUSL libc, MIT Licensed - see LICENSES
//
// ASCII-only stand-ins for the <ctype.h> helpers. Each macro is plain integer
// arithmetic on its argument, so none of them consults the C locale.
//
// The (unsigned) cast makes a single "< N" comparison act as a range check:
// any value below the range start (including negative char values) wraps to a
// large unsigned number and fails the test.
//
// Every parameter is parenthesized so that arbitrary expressions (ternaries,
// bitwise-or, ...) expand correctly. NOTE: isspace_ascii, toupper_ascii and
// tolower_ascii evaluate their argument more than once - do not pass
// expressions with side effects such as *p++.

// Non-zero iff c is one of '0'..'9'.
#define isdigit_ascii(c) (((unsigned)(c) - '0') < 10u)

// Non-zero iff c is ' ' or one of '\t', '\n', '\v', '\f', '\r' (9..13).
#define isspace_ascii(c) (((c) == ' ') || (((unsigned)(c) - '\t') < 5u))

// Maps 'a'..'z' to 'A'..'Z'; every other value is returned unchanged.
#define toupper_ascii(c) ((((unsigned)(c) - 'a') < 26u) ? ((c) & 0x5f) : (c))

// Maps 'A'..'Z' to 'a'..'z'; every other value is returned unchanged.
// Needed by lowercase() in parse_helper.h, which calls tolower_ascii.
#define tolower_ascii(c) ((((unsigned)(c) - 'A') < 26u) ? ((c) | 0x20) : (c))
+ (*p - '0'); p++; num_digits++; @@ -207,7 +207,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, char sci, if (negative) number = -number; // Process an exponent string - if (toupper(*p) == toupper(sci)) { + if (toupper_ascii(*p) == toupper_ascii(sci)) { *maybe_int = 0; // Handle optional sign @@ -222,7 +222,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, char sci, // Process string of digits num_digits = 0; n = 0; - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { n = n * 10 + (*p - '0'); num_digits++; p++; @@ -263,7 +263,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, char sci, if (skip_trailing) { // Skip trailing whitespace - while (isspace(*p)) p++; + while (isspace_ascii(*p)) p++; } if (endptr) *endptr = p; diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index e46e1e85f1c81..3a4058f37efc7 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -23,6 +23,8 @@ GitHub. See Python Software Foundation License and BSD licenses for these. #include #include +#include "../headers/portable.h" + static void *safe_realloc(void *buffer, size_t size) { void *result; // OSX is weird. @@ -1411,7 +1413,7 @@ int tokenize_all_rows(parser_t *self) { } PANDAS_INLINE void uppercase(char *p) { - for (; *p; ++p) *p = toupper(*p); + for (; *p; ++p) *p = toupper_ascii(*p); } int PANDAS_INLINE to_longlong(char *item, long long *p_value) { @@ -1424,7 +1426,7 @@ int PANDAS_INLINE to_longlong(char *item, long long *p_value) { *p_value = strtoll(item, &p_end, 10); // Allow trailing spaces. - while (isspace(*p_end)) ++p_end; + while (isspace_ascii(*p_end)) ++p_end; return (errno == 0) && (!*p_end); } @@ -1541,7 +1543,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, errno = 0; // Skip leading whitespace. - while (isspace(*p)) p++; + while (isspace_ascii(*p)) p++; // Handle optional sign. 
negative = 0; @@ -1558,7 +1560,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, num_decimals = 0; // Process string of digits. - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { number = number * 10. + (*p - '0'); p++; num_digits++; @@ -1570,7 +1572,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, if (*p == decimal) { p++; - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { number = number * 10. + (*p - '0'); p++; num_digits++; @@ -1589,7 +1591,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, if (negative) number = -number; // Process an exponent string. - if (toupper(*p) == toupper(sci)) { + if (toupper_ascii(*p) == toupper_ascii(sci)) { // Handle optional sign. negative = 0; switch (*++p) { @@ -1602,7 +1604,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, // Process string of digits. num_digits = 0; n = 0; - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { n = n * 10 + (*p - '0'); num_digits++; p++; @@ -1643,7 +1645,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, if (skip_trailing) { // Skip trailing whitespace. - while (isspace(*p)) p++; + while (isspace_ascii(*p)) p++; } if (endptr) *endptr = p; @@ -1697,7 +1699,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, errno = 0; // Skip leading whitespace. - while (isspace(*p)) p++; + while (isspace_ascii(*p)) p++; // Handle optional sign. negative = 0; @@ -1714,7 +1716,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, num_decimals = 0; // Process string of digits. - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { if (num_digits < max_digits) { number = number * 10. 
+ (*p - '0'); num_digits++; @@ -1730,7 +1732,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, if (*p == decimal) { p++; - while (num_digits < max_digits && isdigit(*p)) { + while (num_digits < max_digits && isdigit_ascii(*p)) { number = number * 10. + (*p - '0'); p++; num_digits++; @@ -1738,7 +1740,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, } if (num_digits >= max_digits) // Consume extra decimal digits. - while (isdigit(*p)) ++p; + while (isdigit_ascii(*p)) ++p; exponent -= num_decimals; } @@ -1752,7 +1754,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, if (negative) number = -number; // Process an exponent string. - if (toupper(*p) == toupper(sci)) { + if (toupper_ascii(*p) == toupper_ascii(sci)) { // Handle optional sign negative = 0; switch (*++p) { @@ -1765,7 +1767,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, // Process string of digits. num_digits = 0; n = 0; - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { n = n * 10 + (*p - '0'); num_digits++; p++; @@ -1798,7 +1800,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, if (skip_trailing) { // Skip trailing whitespace. - while (isspace(*p)) p++; + while (isspace_ascii(*p)) p++; } if (endptr) *endptr = p; @@ -1833,7 +1835,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int d; // Skip leading spaces. - while (isspace(*p)) { + while (isspace_ascii(*p)) { ++p; } @@ -1846,7 +1848,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } // Check that there is a first digit. - if (!isdigit(*p)) { + if (!isdigit_ascii(*p)) { // Error... 
*error = ERROR_NO_DIGITS; return 0; @@ -1865,7 +1867,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, if (d == tsep) { d = *++p; continue; - } else if (!isdigit(d)) { + } else if (!isdigit_ascii(d)) { break; } if ((number > pre_min) || @@ -1878,7 +1880,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } } } else { - while (isdigit(d)) { + while (isdigit_ascii(d)) { if ((number > pre_min) || ((number == pre_min) && (d - '0' <= dig_pre_min))) { number = number * 10 - (d - '0'); @@ -1902,7 +1904,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, if (d == tsep) { d = *++p; continue; - } else if (!isdigit(d)) { + } else if (!isdigit_ascii(d)) { break; } if ((number < pre_max) || @@ -1916,7 +1918,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } } } else { - while (isdigit(d)) { + while (isdigit_ascii(d)) { if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { number = number * 10 + (d - '0'); @@ -1931,7 +1933,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } // Skip trailing spaces. - while (isspace(*p)) { + while (isspace_ascii(*p)) { ++p; } @@ -1954,7 +1956,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, int d; // Skip leading spaces. - while (isspace(*p)) { + while (isspace_ascii(*p)) { ++p; } @@ -1968,7 +1970,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } // Check that there is a first digit. - if (!isdigit(*p)) { + if (!isdigit_ascii(*p)) { // Error... 
*error = ERROR_NO_DIGITS; return 0; @@ -1984,7 +1986,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, if (d == tsep) { d = *++p; continue; - } else if (!isdigit(d)) { + } else if (!isdigit_ascii(d)) { break; } if ((number < pre_max) || @@ -1998,7 +2000,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } } } else { - while (isdigit(d)) { + while (isdigit_ascii(d)) { if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { number = number * 10 + (d - '0'); @@ -2012,7 +2014,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } // Skip trailing spaces. - while (isspace(*p)) { + while (isspace_ascii(*p)) { ++p; }