From a68fe9f854993dc498a1cf0ddf08ec294f76f59a Mon Sep 17 00:00:00 2001 From: Nathan Ford Date: Thu, 25 May 2017 09:40:31 -0700 Subject: [PATCH 1/5] added string processing --- doc/source/comparison_with_sas.rst | 136 +++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/doc/source/comparison_with_sas.rst b/doc/source/comparison_with_sas.rst index 875358521173a..fdd8247b23260 100644 --- a/doc/source/comparison_with_sas.rst +++ b/doc/source/comparison_with_sas.rst @@ -357,6 +357,142 @@ takes a list of columns to sort by. tips = tips.sort_values(['sex', 'total_bill']) tips.head() + +String Processing +----------------- + +Length +~~~~~~ + +SAS determines the length of a character string with the ``LENGTHN`` +and ``LENGTHC`` functions. ``LENGTHN`` excludes trailing blanks and +``LENGTHC`` includes trailing blanks. + +.. code-block:: none + + data _null_; + set tips; + put(LENGTHN(time)); + put(LENGTHC(time)); + run; + +Python determines the length of a character string with the ``len`` function. +``len`` includes trailing blanks. Use ``len`` and ``rstrip`` to exclude +trailing blanks. + +.. code-block:: none + + tips['time'].str.len() + tips['time'].str.rstrip().str.len() + + +Find +~~~~ + +SAS determines the position of a character in a string with the +``FINDW`` function. ``FINDW`` takes the string defined by +the first argument and searches for the first position of the substring +you supply as the second argument. + +.. code-block:: none + + data _null_; + set tips; + put(FINDW(sex,'ALE')); + run; + +Python determines the position of a character in a string with the +``find`` function. ``find`` searches for the first position of the +substring. If the substring is found, the function returns its +position. Keep in mind that Python indexes are zero-based and +the function will return -1 if it fails to find the substring. + +.. 
code-block:: none + + tips['sex'].str.find("ALE") + + +Substring +~~~~~~~~~ + +SAS extracts a substring from a string based on its position +with the ``SUBSTR`` function. + +.. code-block:: none + + data _null_; + set tips; + put(substr(sex,1,1)); + run; + +In Python, you can use ``[]`` notation to extract a substring +from a string by position locations. Keep in mind that Python +indexes are zero-based. + +.. code-block:: none + + tips['sex'].str[0:1] + + +Scan +~~~~ + +The SAS ``SCAN`` function returns the nth word from a string. +The first argument is the string you want to parse and the +second argument specifies which word you want to extract. + +.. code-block:: none + + data firstlast; + input String $60.; + First_Name = scan(string, 1); + Last_Name = scan(string, -1); + datalines2; + John Smith; + Jane Cook; + ;;; + run; + +Python extracts a substring from a string based on its text +by using regular expressions. There are much more powerful +approaches, but this just shows a simple approach. + +.. code-block:: none + + firstlast = pd.DataFrame({'String': ['John Smith', 'Jane Cook']}) + firstlast['First_Name'] = firstlast['String'].str.split(" ", expand=True)[0] + firstlast['Last_Name'] = firstlast['String'].str.rsplit(" ", expand=True)[0] + + +Upcase, Lowcase, and Propcase +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The SAS ``UPCASE``, ``LOWCASE``, and ``PROPCASE`` functions change +the case of the argument. + +.. code-block:: none + + data firstlast; + input String $60.; + string_up = UPCASE(string); + string_low = LOWCASE(string); + string_prop = PROPCASE(string); + datalines2; + John Smith; + Jane Cook; + ;;; + run; + +The equivalent Python functions are ``upper``, ``lower``, and ``title``. + +.. 
code-block:: none + + firstlast = pd.DataFrame({'String': ['John Smith', 'Jane Cook']}) + firstlast['string_up'] = firstlast['String'].str.upper() + firstlast['string_low'] = firstlast['String'].str.lower() + firstlast['string_prop'] = firstlast['String'].str.title() + + Merging ------- From 4e1f8dd3dfe1f8be873fb7c52ef136d127b23900 Mon Sep 17 00:00:00 2001 From: Nathan Ford Date: Thu, 25 May 2017 13:39:09 -0700 Subject: [PATCH 2/5] made code segments as ipython blocks changed python code segments from code-block:: none to ipython:: python --- doc/source/comparison_with_sas.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/comparison_with_sas.rst b/doc/source/comparison_with_sas.rst index fdd8247b23260..e9e4ca6981498 100644 --- a/doc/source/comparison_with_sas.rst +++ b/doc/source/comparison_with_sas.rst @@ -380,7 +380,7 @@ Python determines the length of a character string with the ``len`` function. ``len`` includes trailing blanks. Use ``len`` and ``rstrip`` to exclude trailing blanks. -.. code-block:: none +.. ipython:: python tips['time'].str.len() tips['time'].str.rstrip().str.len() @@ -407,7 +407,7 @@ substring. If the substring is found, the function returns its position. Keep in mind that Python indexes are zero-based and the function will return -1 if it fails to find the substring. -.. code-block:: none +.. ipython:: python tips['sex'].str.find("ALE") @@ -429,7 +429,7 @@ In Python, you can use ``[]`` notation to extract a substring from a string by position locations. Keep in mind that Python indexes are zero-based. -.. code-block:: none +.. ipython:: python tips['sex'].str[0:1] @@ -457,7 +457,7 @@ Python extracts a substring from a string based on its text by using regular expressions. There are much more powerful approaches, but this just shows a simple approach. -.. code-block:: none +.. 
ipython:: python firstlast = pd.DataFrame({'String': ['John Smith', 'Jane Cook']}) firstlast['First_Name'] = firstlast['String'].str.split(" ", expand=True)[0] @@ -485,7 +485,7 @@ the case of the argument. The equivalent Python functions are ``upper``, ``lower``, and ``title``. -.. code-block:: none +.. ipython:: python firstlast = pd.DataFrame({'String': ['John Smith', 'Jane Cook']}) firstlast['string_up'] = firstlast['String'].str.upper() From 0ec1854a1c24569f83b04c7d4d6f61e4b5a7f8ad Mon Sep 17 00:00:00 2001 From: Nathan Ford Date: Thu, 25 May 2017 13:57:49 -0700 Subject: [PATCH 3/5] limited output using .head() limited output of string processing examples using .head() --- doc/source/comparison_with_sas.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/comparison_with_sas.rst b/doc/source/comparison_with_sas.rst index e9e4ca6981498..f3be1cdedb91b 100644 --- a/doc/source/comparison_with_sas.rst +++ b/doc/source/comparison_with_sas.rst @@ -382,8 +382,8 @@ trailing blanks. .. ipython:: python - tips['time'].str.len() - tips['time'].str.rstrip().str.len() + tips['time'].str.len().head() + tips['time'].str.rstrip().str.len().head() Find @@ -398,7 +398,7 @@ you supply as the second argument. data _null_; set tips; - put(FINDW(sex,'ALE')); + put(FINDW(sex,'ale')); run; Python determines the position of a character in a string with the @@ -409,7 +409,7 @@ the function will return -1 if it fails to find the substring. .. ipython:: python - tips['sex'].str.find("ALE") + tips['sex'].str.find("ale").head() Substring @@ -431,7 +431,7 @@ indexes are zero-based. .. ipython:: python - tips['sex'].str[0:1] + tips['sex'].str[0:1].head() Scan @@ -451,7 +451,7 @@ second argument specifies which word you want to extract. John Smith; Jane Cook; ;;; - run; + run; Python extracts a substring from a string based on its text by using regular expressions. 
There are much more powerful From 04c9353523b708d57d4a90ffb26185c2f9d72681 Mon Sep 17 00:00:00 2001 From: Nathan Ford Date: Thu, 25 May 2017 14:45:10 -0700 Subject: [PATCH 4/5] added directives for SAS string functions --- doc/source/comparison_with_sas.rst | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/doc/source/comparison_with_sas.rst b/doc/source/comparison_with_sas.rst index f3be1cdedb91b..e5413cf751866 100644 --- a/doc/source/comparison_with_sas.rst +++ b/doc/source/comparison_with_sas.rst @@ -364,9 +364,10 @@ String Processing Length ~~~~~~ -SAS determines the length of a character string with the ``LENGTHN`` -and ``LENGTHC`` functions. ``LENGTHN`` excludes trailing blanks and -``LENGTHC`` includes trailing blanks. +SAS determines the length of a character string with the +`LENGTHN `__ +and `LENGTHC `__ +functions. ``LENGTHN`` excludes trailing blanks and ``LENGTHC`` includes trailing blanks. .. code-block:: none @@ -390,8 +391,8 @@ Find ~~~~ SAS determines the position of a character in a string with the -``FINDW`` function. ``FINDW`` takes the string defined by -the first argument and searches for the first position of the substring +`FINDW `__ function. +``FINDW`` takes the string defined by the first argument and searches for the first position of the substring you supply as the second argument. .. code-block:: none @@ -415,8 +416,8 @@ the function will return -1 if it fails to find the substring. Substring ~~~~~~~~~ -SAS extracts a substring from a string based on its position -with the ``SUBSTR`` function. +SAS extracts a substring from a string based on its position with the +`SUBSTR `__ function. .. code-block:: none @@ -437,8 +438,8 @@ indexes are zero-based. Scan ~~~~ -The SAS ``SCAN`` function returns the nth word from a string. -The first argument is the string you want to parse and the +The SAS `SCAN `__ +function returns the nth word from a string. 
The first argument is the string you want to parse and the second argument specifies which word you want to extract. .. code-block:: none @@ -467,8 +468,10 @@ approaches, but this just shows a simple approach. Upcase, Lowcase, and Propcase ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The SAS ``UPCASE``, ``LOWCASE``, and ``PROPCASE`` functions change -the case of the argument. +The SAS `UPCASE `__, +`LOWCASE `__, and +`PROPCASE `__ +functions change the case of the argument. .. code-block:: none From f4c9cd8323052b4d2d45876531f0acd54611ff74 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 7 Aug 2017 14:54:42 +0200 Subject: [PATCH 5/5] small edit to show result; fix Last_Name example --- doc/source/comparison_with_sas.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/comparison_with_sas.rst b/doc/source/comparison_with_sas.rst index e5413cf751866..02e0f46d36c84 100644 --- a/doc/source/comparison_with_sas.rst +++ b/doc/source/comparison_with_sas.rst @@ -426,7 +426,7 @@ SAS extracts a substring from a string based on its position with the put(substr(sex,1,1)); run; -In Python, you can use ``[]`` notation to extract a substring +With pandas you can use ``[]`` notation to extract a substring from a string by position locations. Keep in mind that Python indexes are zero-based. @@ -463,6 +463,7 @@ approaches, but this just shows a simple approach. firstlast = pd.DataFrame({'String': ['John Smith', 'Jane Cook']}) firstlast['First_Name'] = firstlast['String'].str.split(" ", expand=True)[0] - firstlast['Last_Name'] = firstlast['String'].str.rsplit(" ", expand=True)[0] + firstlast['Last_Name'] = firstlast['String'].str.rsplit(" ", expand=True)[1] + firstlast Upcase, Lowcase, and Propcase ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -494,7 +495,7 @@ The equivalent Python functions are ``upper``, ``lower``, and ``title``. firstlast['string_up'] = firstlast['String'].str.upper() firstlast['string_low'] = firstlast['String'].str.lower() firstlast['string_prop'] = firstlast['String'].str.title() - + firstlast Merging -------