/* This is the first in a series of examples provided to demonstrate the    */
/* use of SAS Viya Data Mining and Machine Learning procedures to compose   */
/* a program that follows a standard machine learning process of            */
/* - loading data,                                                          */
/* - preparing the data,                                                    */
/* - building models, and                                                   */
/* - assessing and comparing those models                                   */
/*                                                                          */
/* The programs are written to execute in the CAS in-memory distributed     */
/* computing engine in the SAS Viya environment.                            */
/*                                                                          */
/* This first example showcases how to load local data into CAS             */

/* Define a CAS engine libref for CAS in-memory data tables */
libname mycaslib cas caslib=casuser;

/* Load data into CAS                                                       */
/*                                                                          */
/* The data set used for this workflow is from a financial services company */
/* that offers a home equity line of credit. The company has extended       */
/* several thousand lines of credit in the past, and many of these accepted */
/* applicants have defaulted on their loans. Using demographic and          */
/* financial variables, the company wants to build a model to predict       */
/* whether an applicant will default.                                       */
/*                                                                          */
/* The target variable "BAD" indicates whether an applicant defaulted       */
/* on the home equity line of credit.                                       */
/*                                                                          */
/* For execution in the CAS engine, data must be loaded from the local      */
/* data set to a CAS table. This code first checks to see if the specified  */
/* CAS table exists and then loads data from local data sets in 2           */
/* different ways.  After executing this code, you will notice a new        */
/* "MYCASLIB" library reference under "Libraries" in the navigation panel   */
/* on the left side (note the special icon indicating it is a caslib).      */
/*                                                                          */
%if not %sysfunc(exist(mycaslib.hmeq)) %then %do;

  /* You can load data using a "load" statement in PROC CASUTIL.       */
  /* The step runs only when the CAS table MYCASLIB.HMEQ is not found. */
  proc casutil;
    load data=sampsio.hmeq casout="hmeq" outcaslib=casuser;
  quit;   /* end the CASUTIL step before leaving the %do block */

%end;


%if not %sysfunc(exist(mycaslib.hmeq)) %then %do;

  /* You can also load data using a data step.                         */
  /* Writing to the CAS engine libref creates the in-memory table.     */
  data mycaslib.hmeq;
    set sampsio.hmeq;
  run;

%end;

/* This example illustrates various tools for assaying, assessing,      */
/* modifying and preparing data prior to modeling. It uses HMEQ         */
/* dataset as input and produces HMEQ_PREPPED dataset. The HMEQ_PREPPED */
/* dataset is used in subsequent examples.                              */
/*                                                                      */
/* The steps include:                                                   */
/*                                                                      */
/* (1) PREPARE AND EXPLORE                                              */
/*     a) Load data set into CAS                                        */
/*     b) Explore                                                       */
/*     c) Impute                                                        */
/*     d) Identify variables that explain variance                      */
/*     e) Perform a cluster analysis to identify homogeneous            */
/*        groups in the data                                            */
/*     f) Perform principal components analysis to assess collinearity  */
/*        among candidate, interval valued inputs                       */

/* Setup and initialize for later use in the program                    */
/* Define a CAS engine libref for CAS in-memory data tables */
libname mycaslib cas caslib=casuser;

/* Specify the data set names */
%let sasdata          = sampsio.hmeq;                     
%let casdata          = mycaslib.hmeq;

/* Specify the data set inputs and target */
%let class_inputs    = reason job;
%let interval_inputs = clage clno debtinc loan mortdue value yoj derog delinq ninq;
%let target          = bad;

%let im_class_inputs    = reason job;
%let im_interval_inputs = im_clage clno im_debtinc loan mortdue value im_yoj im_ninq derog im_delinq;
%let cluster_inputs     = im_clage im_debtinc value;

/* Specify a folder path to write the temporary output files */
%let outdir = &_SASWORKINGDIR;

/* Load data into CAS if needed. The data should already have been      */
/* loaded in step 1; it is loaded here only if it is not found in CAS.  */
%if not %sysfunc(exist(&casdata)) %then %do;
  /* Table is not in CAS yet: load the local data set as "hmeq" */
  proc casutil;
    load data=&sasdata casout="hmeq" outcaslib=casuser;
  quit;
%end;

/* Explore the data and plot missing values                             */
/* Summarize every variable; the summary is written to a CAS table */
proc cardinality data=&casdata outcard=mycaslib.data_card;
run;

/* Print only the variables that have at least one missing value */
proc print data=mycaslib.data_card(where=(_nmiss_>0));
  title "Data Summary";
run;

/* Compute the percentage of missing values for each such variable */
data data_missing;
  set mycaslib.data_card (where=(_nmiss_>0) keep=_varname_ _nmiss_ _nobs_);
  _percentmiss_ = (_nmiss_/_nobs_)*100;
  label _percentmiss_ = 'Percent Missing';
run;

/* Bar chart of percent missing, largest first */
proc sgplot data=data_missing;
  title "Percentage of Missing Values";
  vbar _varname_ / response=_percentmiss_ datalabel categoryorder=respdesc;
run;

/* Impute missing values                                                */
/* Each INPUT statement applies one imputation technique to its variables. */
proc varimpute data=&casdata;
  input clage /ctech=mean;
  input delinq /ctech=median;
  input ninq /ctech=random;
  input debtinc yoj /ctech=value cvalues=50,100;
  output out=mycaslib.hmeq_prepped copyvars=(_ALL_);
  /* CODE FILE= must name a file, not a directory */
  code file="&outdir./impute_score.sas";
run;

/* Identify variables that explain variance in the target               */
/* Discriminant analysis for class target */
proc varreduce data=mycaslib.hmeq_prepped technique=discriminantanalysis;
  class &target &im_class_inputs.;
  reduce supervised &target=&im_class_inputs. &im_interval_inputs. / maxeffects=8;
  ods output selectionsummary=summary;
run;

/* Split the variance explained at each iteration into the part already */
/* explained (Base) and the part added by this iteration (Increment).   */
data out_iter (keep=Iteration VarExp Base Increment Parameter);
  set summary;
  if Increment=. then Increment=0;
  Base=VarExp - Increment;
run;

/* Turn Base and Increment into rows so they can be stacked in one bar */
proc transpose data=out_iter out=out_iter_trans;
  by Iteration VarExp Parameter;
run;

proc sort data=out_iter_trans;
  label _NAME_='Group';
  by _NAME_;
run;

/* Variance explained by Iteration plot */
proc sgplot data=out_iter_trans;
  title "Variance Explained by Iteration";
  yaxis label="Variance Explained";
  vbar Iteration / response=COL1 group=_NAME_;
run;

/* Perform a cluster analysis based on demographic inputs               */
/* Cluster on the standardized inputs, allowing at most 6 clusters */
proc kclus data=mycaslib.hmeq_prepped standardize=std distance=euclidean maxclusters=6;
  input &cluster_inputs. / level=interval;
run;

/* Perform a principal components analysis on the interval valued       */
/* input variables                                                      */
proc pca data=mycaslib.hmeq_prepped plots=(scree);
  var &im_interval_inputs;
run;
/* This example illustrates fitting and comparing several Machine       */
/* Learning algorithms for predicting the binary target in the          */
/* HMEQ data set. The steps include:                                    */
/*                                                                      */
/* (1) PREPARE AND EXPLORE                                              */
/*     a) Check data is loaded into CAS                                 */
/*                                                                      */
/* (2) PERFORM SUPERVISED LEARNING                                      */
/*     a) Fit a model using a Random Forest                             */
/*     b) Fit a model using Gradient Boosting                           */
/*     c) Fit a model using a Neural Network                            */
/*     d) Fit a model using a Support Vector Machine                    */   
/*                                                                      */
/* (3) EVALUATE AND IMPLEMENT                                           */
/*     a) Score the data                                                */
/*     b) Assess model performance                                      */
/*     c) Generate ROC and Lift charts                                  */

/* Setup and initialize for later use in the program                    */

/* Define a CAS engine libref for CAS in-memory data tables */
libname mycaslib cas caslib=casuser;

/* Specify the data set names */
%let casdata          = mycaslib.hmeq_prepped;            
%let partitioned_data = mycaslib.hmeq_part;  

/* Specify the data set inputs and target */
%let class_inputs    = reason job;
%let interval_inputs = im_clage clno im_debtinc loan mortdue value im_yoj im_ninq derog im_delinq;
%let target          = bad;

/* Check if HMEQ_PREPPED data created in the Prepare and Explore Data   */
/* snippet exists.  If not, print error message to run the program.     */
%if not %sysfunc(exist(&casdata)) %then %do;
  /* Messages are written to the log only; execution is not stopped here */
  %put ERROR: The input dataset HMEQ_PREPPED is not loaded into CAS.;
  %put ERROR: Remember to run the Prepare and Explore Data snippet to load necessary data before executing this example.;
%end;

/* Partition the data into training and validation                      */
/* Sample 70% of the rows within each level of the target. The later   */
/* steps read the result through the _partind_ column (1 / 0).         */
proc partition data=&casdata partition samppct=70;
  by &target;
  output out=&partitioned_data copyvars=(_ALL_);
run;

/* RANDOM FOREST predictive model                                       */
/* Train the forest and save it to a CAS table so that the scoring step */
/* below can read it back through INMODEL=.                             */
proc forest data=&partitioned_data ntrees=50 intervalbins=20 minleafsize=5
            outmodel=mycaslib.forest_model;
  input &interval_inputs. / level = interval;
  input &class_inputs. / level = nominal;
  target &target / level=nominal;
  partition rolevar=_partind_(train='1' validate='0');
run;

/* Score the data using the generated RF model                          */
proc forest data=&partitioned_data inmodel=mycaslib.forest_model noprint;
  output out=mycaslib._scored_RF copyvars=(_ALL_);
run;

/* GRADIENT BOOSTING MACHINES predictive model                          */
/* Train the boosting model and save it to a CAS table so that the      */
/* scoring step below can read it back through INMODEL=.                */
proc gradboost data=&partitioned_data ntrees=10 intervalbins=20 maxdepth=5
               outmodel=mycaslib.gb_model;
  input &interval_inputs. / level = interval;
  input &class_inputs. / level = nominal;
  target &target / level=nominal;
  partition rolevar=_partind_(train='1' validate='0');
run;

/* Score the data using the generated GBM model                         */
proc gradboost  data=&partitioned_data inmodel=mycaslib.gb_model noprint;
  output out=mycaslib._scored_GB copyvars=(_ALL_);
run;

/* NEURAL NETWORK predictive model                                      */
/* Train a network with one hidden layer of 2 units; save the model */
proc nnet data=&partitioned_data;
  target &target / level=nom;
  input &interval_inputs. / level=int;
  input &class_inputs. / level=nom;
  hidden 2;
  train outmodel=mycaslib.nnet_model;
  partition rolevar=_partind_(train='1' validate='0');
  ods exclude OptIterHistory;
run;

/* Score the data using the generated NN model                          */
proc nnet data=&partitioned_data inmodel=mycaslib.nnet_model noprint;
  output out=mycaslib._scored_NN copyvars=(_ALL_);
run;

/* SUPPORT VECTOR MACHINE predictive model                              */
/* Train on the training partition only (_partind_=1) and save the      */
/* model as an analytic store for the scoring step below.               */
proc svmachine data=&partitioned_data(where=(_partind_=1));
  kernel polynom / deg=2;
  target &target;
  input &interval_inputs. / level=interval;
  input &class_inputs. / level=nominal;
  id bad _partind_;
  savestate rstore=mycaslib.svm_astore_model;
  ods exclude IterHistory;
run;

/* Score data using ASTORE code generated for the SVM model             */
/* The target and partition indicator are copied to the output because  */
/* the assessment step filters on _partind_ and compares to the target. */
proc astore;
  score data=&partitioned_data out=mycaslib._scored_SVM
        rstore=mycaslib.svm_astore_model
        copyvars=(&target _partind_);
run;

/* Assess                                                               */
/* Assess one scored table on its validation rows (_partind_=0).        */
/*   prefix   - model tag; reads mycaslib._scored_<prefix> and writes   */
/*              <prefix>_fitstat, <prefix>_rocinfo, <prefix>_liftinfo   */
/*   var_evt  - predicted-probability variable for the event ('1')      */
/*   var_nevt - predicted-probability variable for the non-event ('0')  */
%macro assess_model(prefix=, var_evt=, var_nevt=);
  proc assess data=mycaslib._scored_&prefix.(where=(_partind_=0));
    input &var_evt.;
    target &target / level=nominal event='1';
    fitstat pvar=&var_nevt. / pevent='0';

    ods output
      fitstat  = &prefix._fitstat
      rocinfo  = &prefix._rocinfo
      liftinfo = &prefix._liftinfo;
  run;
%mend assess_model;

ods exclude all;
%assess_model(prefix=RF, var_evt=p_&target.1, var_nevt=p_&target.0);
%assess_model(prefix=SVM, var_evt=p_&target.1, var_nevt=p_&target.0);
%assess_model(prefix=GB, var_evt=p_&target.1, var_nevt=p_&target.0);
%assess_model(prefix=NN, var_evt=p_&target.1, var_nevt=p_&target.0);
ods exclude none;

/* ROC and Lift Charts using validation data                            */
ods graphics on;

/* Stack the ROC tables of all four models and tag each row with the    */
/* model it came from; the IN= flags identify the source table.         */
data all_rocinfo;
  set SVM_rocinfo(in=s)
      RF_rocinfo(in=f)
      GB_rocinfo(in=g)
      NN_rocinfo(in=n);

  length model $ 16;
  select;
    when (s) model='SVM';
    when (f) model='Forest';
    when (g) model='GradientBoosting';
    when (n) model='NeuralNetwork';
  end;
run;

/* Same stacking for the lift tables */
data all_liftinfo;
  set SVM_liftinfo(in=s)
      RF_liftinfo(in=f)
      GB_liftinfo(in=g)
      NN_liftinfo(in=n);

  length model $ 16;
  select;
    when (s) model='SVM';
    when (f) model='Forest';
    when (g) model='GradientBoosting';
    when (n) model='NeuralNetwork';
  end;
run;

/* Print AUC (Area Under the ROC Curve) */
title "AUC (using validation data) ";
/* Column c of the ROC table is reported as the area under the curve */
proc sql;
  select distinct model, c from all_rocinfo order by c desc;
quit;

/* Draw ROC charts */
proc sgplot data=all_rocinfo aspect=1;
  title "ROC Curve (using validation data)";
  xaxis values=(0 to 1 by 0.25) grid offsetmin=.05 offsetmax=.05;
  yaxis values=(0 to 1 by 0.25) grid offsetmin=.05 offsetmax=.05;
  lineparm x=0 y=0 slope=1 / transparency=.7;   /* diagonal reference line */
  series x=fpr y=sensitivity / group=model;
run;

/* Draw lift charts */
proc sgplot data=all_liftinfo;
  title "Lift Chart (using validation data)";
  yaxis label=' ' grid;
  series x=depth y=lift / group=model markers markerattrs=(symbol=circlefilled);
run;

ods graphics off;
/* Sample output:
AUC (using validation data)
model	Area Under ROC
Forest	0.926052
GradientBoosting	0.890352
SVM	0.859655
NeuralNetwork	0.514079
*/


/* This example illustrates fitting and comparing two Machine           */   
/* Learning algorithms for predicting the binary target in the          */
/* HMEQ data set. The steps include:                                    */
/*                                                                      */
/* (1) PREPARE AND EXPLORE                                              */
/*     a) Check data is loaded into CAS                                 */
/*                                                                      */
/* (2) PERFORM SUPERVISED LEARNING                                      */
/*     a) Fit model using Logistic Regression                           */
/*     b) Fit a model using a Decision Tree                             */
/*                                                                      */
/* (3) EVALUATE AND IMPLEMENT                                           */
/*     a) Score the data                                                */
/*     b) Assess model performance                                      */
/*     c) Generate ROC and Lift charts                                  */

/* Setup and initialize for later use in the program                    */

/* Define a CAS engine libref for CAS in-memory data tables */
libname mycaslib cas caslib=casuser;

/* Specify the data set names */
%let casdata          = mycaslib.hmeq_prepped;            
%let partitioned_data = mycaslib.hmeq_part;  

/* Specify the data set inputs and target */
%let class_inputs    = reason job;
%let interval_inputs = im_clage clno im_debtinc loan mortdue value im_yoj im_ninq derog im_delinq;
%let target          = bad;

/* Specify a folder path to write the temporary output files */
%let outdir = &_SASWORKINGDIR;

/* Check if HMEQ_PREPPED data created in the Prepare and Explore Data   */
/* snippet exists.  If not, print error message to run the program.     */
%if not %sysfunc(exist(&casdata)) %then %do;
  /* Messages are written to the log only; execution is not stopped here */
  %put ERROR: The input dataset HMEQ_PREPPED is not loaded into CAS.;
  %put ERROR: Remember to run the Prepare and Explore Data snippet to load necessary data before executing this example.;
%end;

/* Partition the data into training and validation                      */
/* Sample 70% of the rows within each level of the target. The later   */
/* steps read the result through the _partind_ column (1 / 0).         */
proc partition data=&casdata partition samppct=70;
  by &target;
  output out=&partitioned_data copyvars=(_ALL_);
run;

/* LOGISTIC REGRESSION predictive model                                 */
/* ALL data used for training model */
/* Fit with backward selection and write DATA step score code to a file */
proc logselect data=&partitioned_data;
  class &target &class_inputs.;
  model &target.(event='1')=&class_inputs. &interval_inputs.;
  selection method=backward;
  /* CODE FILE= must name a file; the same file is %included below */
  code file="&outdir./logselect_score.sas" pcatall;
run;

/* Score the data using the generated logistic model score code         */
data mycaslib._scored_logistic;
  set &partitioned_data;
  %include "&outdir./logselect_score.sas";
run;

/* Assess model performance (LOGISTIC REGRESSION)                       */
/* Only the validation rows (_partind_=0) are assessed.                 */
ods exclude all;
proc assess data=mycaslib._scored_logistic(where=(_partind_=0));
  input p_&target.1;
  target &target / level=nominal event='1';
  fitstat pvar=p_&target.0/ pevent='0';
  ods output fitstat  = logit_fitstat
             rocinfo  = logit_rocinfo
             liftinfo = logit_liftinfo;
run;
ods exclude none;

/* DECISION TREE predictive model                                       */
/* Grow with the entropy criterion, prune with C4.5, and write DATA     */
/* step score code to a file.                                           */
proc treesplit data=&partitioned_data;
  input &interval_inputs. / level=interval;
  input &class_inputs. / level=nominal;
  target &target / level=nominal;
  partition rolevar=_partind_(train='1' validate='0');
  grow entropy;
  prune c45;
  /* CODE FILE= must name a file; the same file is %included below */
  code file="&outdir./treesplit_score.sas";
run;

/* Score the data using the generated tree model score code             */
data mycaslib._scored_tree;
  set &partitioned_data;
  %include "&outdir./treesplit_score.sas";
run;

/* Assess tree model performance (DECISION TREE)                        */
/* Only the validation rows (_partind_=0) are assessed.                 */
ods exclude all;
proc assess data=mycaslib._scored_tree(where=(_partind_=0));
  input p_&target.1;
  target &target / level=nominal event='1';
  fitstat pvar=p_&target.0/ pevent='0';
  ods output fitstat  = tree_fitstat
             rocinfo  = tree_rocinfo
             liftinfo = tree_liftinfo;
run;
ods exclude none;

/*  Create ROC and Lift plots (both models) using validation data        */
ods graphics on;

/* Stack the ROC tables of both models and tag each row with the model  */
/* it came from; the IN= flags identify the source table.               */
data all_rocinfo;
  set logit_rocinfo(in=l)
      tree_rocinfo(in=t);

  length model $ 16;
  select;
    when (l) model='Logistic';
    when (t) model='Tree';
  end;
run;

/* Same stacking for the lift tables */
data all_liftinfo;
  set logit_liftinfo(in=l)
      tree_liftinfo(in=t);

  length model $ 16;
  select;
    when (l) model='Logistic';
    when (t) model='Tree';
  end;
run;

/* Print AUC (Area Under the ROC Curve) */
title "AUC (using validation data)";
/* Column c of the ROC table is reported as the area under the curve */
proc sql;
  select distinct model, c from all_rocinfo order by c desc;
quit;

/* Draw ROC charts */
proc sgplot data=all_rocinfo aspect=1;
  title "ROC Curve (using validation data)";
  xaxis values=(0 to 1 by 0.25) grid offsetmin=.05 offsetmax=.05;
  yaxis values=(0 to 1 by 0.25) grid offsetmin=.05 offsetmax=.05;
  lineparm x=0 y=0 slope=1 / transparency=.7;   /* diagonal reference line */
  series x=fpr y=sensitivity / group=model;
run;

/* Draw lift charts */
proc sgplot data=all_liftinfo;
  title "Lift Chart (using validation data)";
  yaxis label=' ' grid;
  series x=depth y=lift / group=model markers markerattrs=(symbol=circlefilled);
run;

ods graphics off;
/* Sample output:
The LOGSELECT Procedure

Parameter Estimates
Parameter	DF	Estimate	Standard Error	Chi-Square	Pr > ChiSq
Intercept	1	-7.082694	0.321028	486.7571	<.0001
IM_CLAGE	1	-0.005552	0.000701	62.6437	<.0001
CLNO	1	-0.022179	0.005139	18.6257	<.0001
IM_DEBTINC	1	0.160982	0.006591	596.5662	<.0001
LOAN	1	-0.000017558	0.000005047	12.1040	0.0005
VALUE	1	0.000001266	0.000000981	1.6647	0.1970
IM_NINQ	1	0.122803	0.023566	27.1554	<.0001
DEROG	1	0.600609	0.069674	74.3098	<.0001
IM_DELINQ	1	0.735456	0.054082	184.9287	<.0001

The TREESPLIT Procedure
Variable Importance
Variable	Importance	Std Dev Importance	Relative Importance	Count
IM_DEBTINC	193.65	0	1.0000	4
IM_DELINQ	37.0480	0	0.1913	5
IM_CLAGE	19.6441	0	0.1014	12
VALUE	9.9259	0	0.0513	11
LOAN	4.7259	0	0.0244	10
REASON	1.0646	0	0.0055	1
JOB	-0.1088	0	-56E-5	9
IM_NINQ	-0.8289	0	-0.004	4
CLNO	-1.0401	0	-0.005	9
DEROG	-2.9749	0	-0.015	5
IM_YOJ	-8.8294	0	-0.046	16
MORTDUE	-9.9475	0	-0.051	8

AUC (using validation data)
model	Area Under ROC
Logistic	0.883336
Tree	0.873168
*/


/* This example showcases fitting and assessing Generalized Linear      */
/* Models using the GENSELECT Procedure                                 */
/* The steps include:                                                   */
/*                                                                      */
/* (1) PREPARE                                                          */
/*     a) Check data is loaded into CAS                                 */
/*                                                                      */
/* (2) Perform Modeling on the Binary Target                            */
/*     a) Assuming binary distribution, using logit link                */
/*     b) Plot ROC curve                                                */   

/* Setup and initialize for later use in the program                    */

/* Define a CAS engine libref for CAS in-memory data tables */
libname mycaslib cas caslib=casuser;

/* Specify the data set names */
%let casdata          = mycaslib.hmeq_prepped;            
%let partitioned_data = mycaslib.hmeq_part;    

/* Specify the data set inputs and target */
%let class_inputs    = reason job;
%let interval_inputs = im_clage clno im_debtinc loan mortdue value im_yoj im_ninq derog im_delinq;
%let target          = bad;

/* Specify a folder path to write the temporary output files */
%let outdir = &_SASWORKINGDIR;

/* Check if HMEQ_PREPPED data created in the Prepare and Explore Data   */
/* snippet exists.  If not, print error message to run the program.     */
%if not %sysfunc(exist(&casdata)) %then %do;
  /* Messages are written to the log only; execution is not stopped here */
  %put ERROR: The input dataset HMEQ_PREPPED is not loaded into CAS.;
  %put ERROR: Remember to run the Prepare and Explore Data snippet to load necessary data before executing this example.;
%end;

/* Partition the data into training and validation                      */
/* Sample 70% of the rows within each level of the target. The later   */
/* steps read the result through the _partind_ column (1 / 0).         */
proc partition data=&casdata partition samppct=70;
  by &target;
  output out=&partitioned_data copyvars=(_ALL_);
run;

/* The binary target model using LOGIT link                             */
/* Assuming binary distribution, using logit link, fit a GLM using the Genselect procedure */
/* Forward selection; the final model is chosen on the validation rows */
proc genselect data=&partitioned_data;
  class &class_inputs.;
  model &target.(event='1')=&interval_inputs. &class_inputs. / dist=binary link=logit;
  selection method=forward(select=sbc stop=sbc choose=validate);
  partition rolevar=_partind_(train='1' validate='0');
  /* CODE FILE= must name a file; the same file is %included below */
  code file="&outdir./genselect_score.sas" pcatall;
run;

/* Score the data using the generated GLM score code                    */
data mycaslib._scored_glm;
  set &partitioned_data;
  %include "&outdir./genselect_score.sas";
run;

/* Assess model performance (GLM)                                       */
/* Only the validation rows (_partind_=0) are assessed.                 */
ods exclude all;
proc assess data=mycaslib._scored_glm(where=(_partind_=0));
  input p_&target.1;
  target &target / level=nominal event='1';
  fitstat pvar=p_&target.0/ pevent='0';
  ods output fitstat  = glm_fitstat
             rocinfo  = glm_rocinfo
             liftinfo = glm_liftinfo;
run;
ods exclude none;

/*  Create ROC and Lift plots using validation data                      */
ods graphics on;

/* Print AUC (Area Under the ROC Curve) */
/* Column c of the ROC table is reported as the area under the curve */
title "AUC (using validation data)";
proc sql;
  select distinct 'GLM', c from glm_rocinfo order by c desc;
quit;

/* Draw ROC charts */
proc sgplot data=glm_rocinfo aspect=1 noautolegend;
  title "ROC Curve (using validation data)";
  xaxis values=(0 to 1 by 0.25) grid offsetmin=.05 offsetmax=.05;
  yaxis values=(0 to 1 by 0.25) grid offsetmin=.05 offsetmax=.05;
  lineparm x=0 y=0 slope=1 / transparency=.7;   /* diagonal reference line */
  series x=fpr y=sensitivity;
run;

/* Draw lift charts */
proc sgplot data=glm_liftinfo;
  title "Lift Chart (using validation data)";
  yaxis label=' ' grid;
  series x=depth y=lift / markers markerattrs=(symbol=circlefilled);
run;

ods graphics off;
/* Sample output:
Parameter Estimates
Parameter	DF	Estimate	Standard Error	Chi-Square	Pr > ChiSq
Intercept	1	-7.499193	0.392688	364.6975	<.0001
IM_CLAGE	1	-0.004314	0.000837	26.5508	<.0001
CLNO	1	-0.022570	0.006300	12.8341	0.0003
IM_DEBTINC	1	0.169074	0.008056	440.5204	<.0001
LOAN	1	-0.000017455	0.000006161	8.0263	0.0046
MORTDUE	1	-0.000001187	0.000002772	0.1833	0.6686
VALUE	1	0.000001785	0.000002303	0.6007	0.4383
IM_YOJ	1	-0.005669	0.002911	3.7929	0.0515
IM_NINQ	1	0.133431	0.028314	22.2079	<.0001
DEROG	1	0.552326	0.079905	47.7797	<.0001
IM_DELINQ	1	0.722498	0.066444	118.2374	<.0001

AUC (using validation data)
 	Area Under ROC
GLM	0.876721
*/


/* This snippet showcases a sample Machine Learning workflow for            */
/* unsupervised learning using SASHELP.IRIS data set. The steps include:    */
/*                                                                          */
/* (1) PREPARE DATA                                                         */
/*     a) Load data set into CAS                                            */
/*                                                                          */
/* (2) PERFORM UNSUPERVISED LEARNING                                        */
/*     a) Generate Principal Components                                     */
/*     b) Analyze Clusters                                                  */
/*                                                                          */
/* (3) VISUALIZE THE RESULTS                                                */
/*     a) Examine the clustering plot                                       */
/*     b) Identify clusters in a PCA plot                                   */

/* Define the macro variables for later use in the program                  */
/* Specify a folder path to write the temporary output files */
%let outdir = &_SASWORKINGDIR;

/* Create a CAS engine libref to save the output data sets */
%let caslibname = mycas;     
libname &caslibname cas caslib=casuser;   

/* Specify the data set names */
%let sasdata = sashelp.iris;
%let casdata = &caslibname..iris;

/* Specify the data set inputs */
%let interval_vars=sepallength sepalwidth petallength petalwidth;

/* Load data into CAS                                                       */
/* Copy the local table into CAS and add a row identifier */
data &casdata;
  set &sasdata;
  rowid = _n_;
run;

/* Unsupervised Learning: Principal Component Analysis                      */
proc pca data=&casdata prefix=PC method=EIG plots=all;
  var &interval_vars;
  output out=&casdata._scored_pca copyvars=(_all_) score=PC_;
  /* CODE FILE= must name a file, not a directory */
  code file="&outdir/pca.sas";
run;

/* Unsupervised Learning: Cluster Analysis                                  */
/* Clusters are built on the original inputs; the PCA scores are carried    */
/* along through copyvars for the plot that follows.                        */
proc kclus data=&casdata._scored_pca standardize=STD impute=MEAN
        distance=EUCLIDEAN maxiters=50 maxclusters=3;
  input &interval_vars;
  score out=&casdata._scored_kclus copyvars=(_all_);
  ods output clustersum=clus_clustersum;
  code file="&outdir/cluster.sas";
run;

/* Visualize the results using a clustering plot for segment frequency      */
/* Build a readable label ("Cluster 1", ...) for each cluster */
data clus_clustersum;
  set clus_clustersum;
  clusterLabel = catx(' ', 'Cluster', cluster);
run;

/* Define a pie-chart template. A STATGRAPH definition needs its own     */
/* BEGINGRAPH/ENDGRAPH block, and the layout and definition are closed.  */
proc template;
  define statgraph simplepie;
    begingraph;
      entrytitle "Segment Frequency";
      layout region;
        piechart category=clusterLabel response=frequency;
      endlayout;
    endgraph;
  end;
run;

/* Render the cluster frequencies with the template */
proc sgrender data=clus_clustersum template=simplepie;
run;

/* Visualize the results by identifying clusters in a PCA plot              */
/* Plot the first two principal-component scores, colored by cluster */
proc sgplot data=&casdata._scored_kclus(keep=PC_1 PC_2 _cluster_id_);
  title "Identify Clusters in a PCA Plot";
  scatter x=PC_1 y=PC_2 / group=_cluster_id_;
run;
/* This snippet showcases a sample Machine Learning workflow for        */
/* supervised learning using SAMPSIO.HMEQ data set. The steps include:  */
/*                                                                      */
/* (1) PREPARE AND EXPLORE                                              */
/*     a) Load data set into CAS                                        */
/*     b) Explore                                                       */
/*     c) Partition                                                     */
/*     d) Impute                                                        */
/*     e) Identify variables that explain variance                      */
/*                                                                      */
/* (2) PERFORM SUPERVISED LEARNING                                      */
/*     a) Fit model using random forest                                 */
/*                                                                      */
/* (3) EVALUATE AND IMPLEMENT                                           */
/*     a) Score the data                                                */
/*     b) Assess model performance                                      */
/*     c) Generate ROC and Lift charts                                  */

/* Define the macro variables for later use in the program              */
/* Specify a folder path to write the temporary output files */
%let outdir = &_SASWORKINGDIR;

/* Create a CAS engine libref to save the output data sets */
%let caslibname = mycas;     
libname &caslibname cas caslib=casuser;

/* Specify the data set names */
%let sasdata          = sampsio.hmeq;                     
%let casdata          = &caslibname..hmeq;            
%let partitioned_data = &caslibname.._part;  

/* Specify the data set inputs and target */
%let class_inputs    = reason job derog delinq ninq IM_clno IM_yoj;
%let interval_inputs = loan value IM_clage IM_debtinc IM_mortdue;
%let target          = bad;

/* Load data into CAS                                                   */
/* Copy the local table into CAS */
data &casdata;
  set &sasdata;
run;

/* Explore the data and look for missing value                          */
proc cardinality data=&casdata outcard=&caslibname..data_card;
run;

/* Print only the variables that have at least one missing value */
proc print data=&caslibname..data_card(where=(_nmiss_>0));
  title "Data Summary";
run;

/* Compute the percentage of missing values for each such variable */
data data_missing;
  set &caslibname..data_card
        (where=(_nmiss_>0) keep=_varname_ _nmiss_ _nobs_);
  _percentmiss_ = (_nmiss_/_nobs_)*100;
  label _percentmiss_ = 'Percent Missing';
run;

/* Bar chart of percent missing, largest first */
proc sgplot data=data_missing;
  title "Percentage of Missing Values";
  vbar _varname_ / response=_percentmiss_
                   datalabel categoryorder=respdesc;
run;

/* Partition the data into training and validation                      */
/* Sample 70% of the rows within each level of BAD. The later steps     */
/* read the result through the _partind_ column (1 / 0).                */
proc partition data=&casdata partition samppct=70;
  by bad;
  output out=&partitioned_data copyvars=(_ALL_);
run;

/* Impute missing values                                                */
proc varimpute data=&partitioned_data;
  input mortdue yoj clno /ctech=median;
  input clage debtinc /ctech=mean;
  /* CODE FILE= must name a file, not a directory */
  code file="&outdir./impute1.sas";
  output out=&caslibname.._prepped copyvars=(_ALL_);
run;

/* Identify variables that explain variance in the target               */
/* Discriminant analysis for class target */
proc varreduce data=&caslibname.._prepped technique=discriminantanalysis;
  class &target &class_inputs.;
  /* The statement was cut off after "/"; maxeffects=8 matches the same */
  /* analysis in the Prepare and Explore example.                       */
  reduce supervised &target=&class_inputs. &interval_inputs. / maxeffects=8;
  ods output selectionsummary=summary;
run;

/* Split the variance explained at each iteration into the part already */
/* explained (Base) and the part added by this iteration (Increment).   */
data out_iter (keep=Iteration VarExp Base Increment Parameter);
  set summary;
  if Increment=. then Increment=0;
  Base=VarExp - Increment;
run;

/* Turn Base and Increment into rows so they can be stacked in one bar */
proc transpose data=out_iter out=out_iter_trans;
  by Iteration VarExp Parameter;
run;

proc sort data=out_iter_trans;
  label _NAME_='Group';
  by _NAME_;
run;

/* Variance explained by Iteration plot */
proc sgplot data=out_iter_trans;
  title "Variance Explained by Iteration";
  yaxis label="Variance Explained";
  vbar Iteration / response=COL1 group=_NAME_;
run;

/* Build a predictive model using Random Forest                         */
/* Train the forest, write DATA step score code, and capture the        */
/* per-tree fit statistics for the plot below.                          */
proc forest data=&caslibname.._prepped ntrees=50 numbin=20 minleafsize=5;
  input &interval_inputs. / level = interval;
  input &class_inputs.    / level = nominal;
  target &target          / level = nominal;
  partition rolevar=_partind_(train='1' validate='0');
  /* CODE FILE= must name a file; the same file is %included below */
  code file="&outdir./forest.sas";
  ods output FitStatistics=fitstats;
run;

/* Score the data using the generated model                             */
data &caslibname.._scored_forest;
  set &caslibname.._prepped;
  %include "&outdir./forest.sas";
run;

/* create data set from forest stats output */
data fitstats;
  set fitstats;
  label Trees     = 'Number of Trees';
  label MiscTrain   = 'Training';
  label MiscValid = 'Validation';
run;

/* plot misclassification as function of number of trees */
proc sgplot data=fitstats;
  title "Training vs Validation";
  series x=Trees y=MiscTrain;
  series x=Trees y=MiscValid/
         lineattrs=(pattern=shortdash thickness=2);
  yaxis label='Misclassification Rate';
run;

/* Assess model performance                                             */
/* Assess training and validation rows separately (BY _partind_) */
proc assess data=&caslibname.._scored_forest;
  input p_bad1;
  target &target / level=nominal event='1';
  fitstat pvar=p_bad0 / pevent='0';
  by _partind_;
  ods output fitstat  = forest_fitstat
             rocinfo  = forest_rocinfo
             liftinfo = forest_liftinfo;
run;

/* Analyze model using ROC and Lift charts                              */
ods graphics on;

/* Display labels for the partition indicator in the chart legends */
proc format;
  value partindlbl 0 = 'Validation' 1 = 'Training';
run;

/* Construct a ROC chart */
proc sgplot data=forest_rocinfo aspect=1;
  title "ROC Curve";
  xaxis label="False positive rate" values=(0 to 1 by 0.1);
  yaxis label="True positive rate"  values=(0 to 1 by 0.1);
  lineparm x=0 y=0 slope=1 / transparency=.7 LINEATTRS=(Pattern=34);
  series x=fpr y=sensitivity /group=_partind_;
  format _partind_ partindlbl.;
run;

/* Construct a Lift chart */
proc sgplot data=forest_liftinfo;
  title "Lift Chart";
  xaxis label="Population Percentage";
  yaxis label="Lift";
  series x=depth y=lift /
         group=_partind_ markers markerattrs=(symbol=circlefilled);
  format _partind_ partindlbl.;
run;

ods graphics off;
