From b39f1283352bfafbe4b590388ea74688d2d4cd4a Mon Sep 17 00:00:00 2001
From: Thomas Bury
Date: Tue, 23 May 2023 17:48:27 -0400
Subject: [PATCH] fix issue caused by groupby over non-numeric values, spotted
 by jjvaldes

---
 ewstools/core.py | 843 +++++++++++++++++++++++------------------------
 1 file changed, 420 insertions(+), 423 deletions(-)

diff --git a/ewstools/core.py b/ewstools/core.py
index 8875e16..6c5a91f 100644
--- a/ewstools/core.py
+++ b/ewstools/core.py
@@ -59,11 +59,11 @@ import deprecation
 
-
 # ---------------
 # Classes
 # --------------
 
+
 class TimeSeries:
     """
     Univariate time series data on which to compute early warning signals.
@@ -83,26 +83,28 @@ def __init__(self, data, transition=None):
 
         # Put data into a pandas DataFrame
         if type(data) in [list, np.ndarray]:
-            df_state = pd.DataFrame({'time': np.arange(len(data)), 'state': data})
-            df_state.set_index('time', inplace=True)
-
+            df_state = pd.DataFrame({"time": np.arange(len(data)), "state": data})
+            df_state.set_index("time", inplace=True)
+
         # If given as pandas series, carry index forward
         elif type(data) == pd.Series:
-            df_state = pd.DataFrame({'state': data.values})
+            df_state = pd.DataFrame({"state": data.values})
             df_state.index = data.index
             # Rename index if no name given
             if not df_state.index.name:
-                df_state.index.name = 'time'
-
+                df_state.index.name = "time"
+
         # If data is not provided as either of these, flag error
         else:
-            print('\nERROR: data has been provided as type {}'.format(type(data)))
-            print('Make sure to provide data as either a list, np.ndarray or pd.Series\n')
-
+            print("\nERROR: data has been provided as type {}".format(type(data)))
+            print(
+                "Make sure to provide data as either a list, np.ndarray or pd.Series\n"
+            )
+
         # Set state and transition attributes
         self.state = df_state
         self.transition = float(transition) if transition else transition
-
+
         # Initialise other attributes
         self.ews = pd.DataFrame(index=df_state.index)
         self.dl_preds = pd.DataFrame()
@@ -110,17 +112,16 @@ def __init__(self, data, transition=None):
         self.pspec = None
         self.pspec_fits = None
         self.ews_spec = pd.DataFrame()
 
-
-    def detrend(self, method='Gaussian', bandwidth=0.2, span=0.2):
-        '''
+    def detrend(self, method="Gaussian", bandwidth=0.2, span=0.2):
+        """
         Detrend the time series using a chosen method.
         Add column to the dataframe for 'smoothing' and 'residuals'
 
         Parameters
         ----------
         method : str, optional
-            Method of detrending to use. 
+            Method of detrending to use.
             Select from ['Gaussian', 'Lowess']
             The default is 'Gaussian'.
         bandwidth : float, optional
@@ -130,15 +131,15 @@ def detrend(self, method='Gaussian', bandwidth=0.2, span=0.2):
             such that the kernel has its quartiles at +/- 0.25*bandwidth.
             The default is 0.2.
         span : float, optional
-            Span of time-series data used for Lowess filtering. Provide as a 
-            proportion of data length or as a number of data points. 
+            Span of time-series data used for Lowess filtering. Provide as a
+            proportion of data length or as a number of data points.
             The default is 0.2.
 
         Returns
         -------
         None.
 
-        '''
+        """
 
         # Get time series data prior to transition
         if self.transition:
@@ -146,8 +147,7 @@ def detrend(self, method='Gaussian', bandwidth=0.2, span=0.2):
         else:
             df_pre = self.state
 
-
-        if method == 'Gaussian':
+        if method == "Gaussian":
             # Get size of bandwidth in terms of num. datapoints if given as a proportion
             if 0 < bandwidth <= 1:
                 bw_num = bandwidth * len(df_pre)
@@ -158,35 +158,31 @@ def detrend(self, method='Gaussian', bandwidth=0.2, span=0.2):
             # Standard deviation of kernel given bandwidth
             # Note that for a Gaussian, quartiles are at +/- 0.675*sigma
             sigma = (0.25 / 0.675) * bw_num
-            smooth_values = gf(df_pre['state'].values, sigma=sigma, mode='reflect')
+            smooth_values = gf(df_pre["state"].values, sigma=sigma, mode="reflect")
             smooth_series = pd.Series(smooth_values, index=df_pre.index)
 
-
-        if method == 'Lowess':
+        if method == "Lowess":
             # Convert span to a proportion of the length of the data
             if not 0 < span <= 1:
                 span_prop = span / len(df_pre)
             else:
                 span_prop = span
 
-            smooth_values = lowess(df_pre['state'].values,
-                                   df_pre.index.values,
-                                   frac=span_prop)[:,1]
+            smooth_values = lowess(
+                df_pre["state"].values, df_pre.index.values, frac=span_prop
+            )[:, 1]
             smooth_series = pd.Series(smooth_values, index=df_pre.index)
 
-
         # Add smoothed data and residuals to the 'state' DataFrame
         self.state["smoothing"] = smooth_series
         self.state["residuals"] = self.state["state"] - self.state["smoothing"]
-
-
     def compute_var(self, rolling_window=0.25):
-        '''
+        """
         Compute variance over a rolling window.
         If residuals have not been computed, computation will be performed
         over state variable.
-        
+
         Put into 'ews' dataframe
 
         Parameters
@@ -195,19 +191,19 @@ def compute_var(self, rolling_window=0.25):
             Length of rolling window used to compute variance. Can be specified
             as an absolute value or as a proportion of the length of the
             data being analysed. Default is 0.25.
-        
+
         Returns
         -------
         None.
 
-        '''
+        """
 
         # Get time series data prior to transition
         if self.transition:
             df_pre = self.state[self.state.index <= self.transition]
         else:
             df_pre = self.state
-        
+
         # Get absolute size of rollling window if given as a proportion
         if 0 < rolling_window <= 1:
             rw_absolute = int(rolling_window * len(df_pre))
@@ -215,22 +211,20 @@ def compute_var(self, rolling_window=0.25):
         else:
             rw_absolute = rolling_window
 
         # If residuals column exists, compute over residuals.
-        if 'residuals' in df_pre.columns:
-            var_values = df_pre['residuals'].rolling(window=rw_absolute).var()
+        if "residuals" in df_pre.columns:
+            var_values = df_pre["residuals"].rolling(window=rw_absolute).var()
         # Else, compute over state variable
         else:
-            var_values = df_pre['state'].rolling(window=rw_absolute).var()
-
-        self.ews['variance'] = var_values
-
-
-
+            var_values = df_pre["state"].rolling(window=rw_absolute).var()
+
+        self.ews["variance"] = var_values
+
     def compute_std(self, rolling_window=0.25):
-        '''
+        """
         Compute standard deviation over a rolling window.
         If residuals have not been computed, computation will be performed
         over state variable.
-        
+
         Put into 'ews' dataframe
 
         Parameters
@@ -239,19 +233,19 @@ def compute_std(self, rolling_window=0.25):
             Length of rolling window used to compute variance. Can be specified
             as an absolute value or as a proportion of the length of the
             data being analysed. Default is 0.25.
-        
+
         Returns
        -------
         None.
 
-        '''
+        """
 
         # Get time series data prior to transition
         if self.transition:
             df_pre = self.state[self.state.index <= self.transition]
         else:
             df_pre = self.state
-        
+
         # Get absolute size of rollling window if given as a proportion
         if 0 < rolling_window <= 1:
             rw_absolute = int(rolling_window * len(df_pre))
@@ -259,23 +253,22 @@ def compute_std(self, rolling_window=0.25):
         else:
             rw_absolute = rolling_window
 
         # If residuals column exists, compute over residuals.
-        if 'residuals' in df_pre.columns:
-            std_values = df_pre['residuals'].rolling(window=rw_absolute).std()
+        if "residuals" in df_pre.columns:
+            std_values = df_pre["residuals"].rolling(window=rw_absolute).std()
         # Else, compute over state variable
         else:
-            std_values = df_pre['state'].rolling(window=rw_absolute).std()
-
-        self.ews['std'] = std_values
+            std_values = df_pre["state"].rolling(window=rw_absolute).std()
 
+        self.ews["std"] = std_values
 
     def compute_cv(self, rolling_window=0.25):
-        '''
+        """
         Compute coefficient of variation over a rolling window.
         This is the standard deviation of the residuals divided by the
         mean of the state variable.
         If residuals have not been computed, computation will be performed
         over state variable.
-        
+
         Put into 'ews' dataframe
 
         Parameters
@@ -284,19 +277,19 @@ def compute_cv(self, rolling_window=0.25):
             Length of rolling window used to compute variance. Can be specified
             as an absolute value or as a proportion of the length of the
             data being analysed. Default is 0.25.
-        
+
         Returns
         -------
         None.
 
-        '''
+        """
 
         # Get time series data prior to transition
         if self.transition:
             df_pre = self.state[self.state.index <= self.transition]
         else:
             df_pre = self.state
-        
+
         # Get absolute size of rollling window if given as a proportion
         if 0 < rolling_window <= 1:
             rw_absolute = int(rolling_window * len(df_pre))
@@ -305,26 +298,21 @@ def compute_cv(self, rolling_window=0.25):
 
         # Get standard deviation values
         # If residuals column exists, compute over residuals.
-        if 'residuals' in df_pre.columns:
-            std_values = df_pre['residuals'].rolling(window=rw_absolute).std()
+        if "residuals" in df_pre.columns:
+            std_values = df_pre["residuals"].rolling(window=rw_absolute).std()
         # Else, compute over state variable
         else:
-            std_values = df_pre['state'].rolling(window=rw_absolute).std()
-
-        # Get mean values from state variable
-        mean_values = df_pre['state'].rolling(window=rw_absolute).mean()
-
-        cv_values = std_values/mean_values
-
-        self.ews['cv'] = cv_values
-
-
+            std_values = df_pre["state"].rolling(window=rw_absolute).std()
+
+        # Get mean values from state variable
+        mean_values = df_pre["state"].rolling(window=rw_absolute).mean()
+
+        cv_values = std_values / mean_values
+
+        self.ews["cv"] = cv_values
+
     def compute_auto(self, rolling_window=0.25, lag=1):
-        '''
+        """
         Compute autocorrelation over a rolling window. Add to dataframe.
 
         Parameters
@@ -335,19 +323,19 @@ def compute_auto(self, rolling_window=0.25, lag=1):
             data being analysed. Default is 0.25.
         lag : int
             Lag of autocorrelation
-        
+
         Returns
         -------
         None.
 
-        '''
+        """
 
         # Get time series data prior to transition
         if self.transition:
             df_pre = self.state[self.state.index <= self.transition]
         else:
             df_pre = self.state
-        
+
         # Get absolute size of rollling window if given as a proportion
         if 0 < rolling_window <= 1:
             rw_absolute = int(rolling_window * len(df_pre))
@@ -355,25 +343,28 @@ def compute_auto(self, rolling_window=0.25, lag=1):
         else:
             rw_absolute = rolling_window
 
         # If residuals column exists, compute over residuals.
-        if 'residuals' in df_pre.columns:
-            ac_values = df_pre['residuals'].rolling(window=rw_absolute).apply(
-                func=lambda x: pd.Series(x).autocorr(lag=lag), raw=True
+        if "residuals" in df_pre.columns:
+            ac_values = (
+                df_pre["residuals"]
+                .rolling(window=rw_absolute)
+                .apply(func=lambda x: pd.Series(x).autocorr(lag=lag), raw=True)
             )
         # Else, compute over state variable
         else:
-            ac_values = df_pre['state'].rolling(window=rw_absolute).apply(
-                func=lambda x: pd.Series(x).autocorr(lag=lag), raw=True
-            )
-
-        self.ews['ac{}'.format(lag)] = ac_values
+            ac_values = (
+                df_pre["state"]
+                .rolling(window=rw_absolute)
+                .apply(func=lambda x: pd.Series(x).autocorr(lag=lag), raw=True)
+            )
 
+        self.ews["ac{}".format(lag)] = ac_values
 
     def compute_skew(self, rolling_window=0.25):
-        '''
+        """
         Compute skew over a rolling window.
         If residuals have not been computed, computation will be performed
         over state variable.
-        
+
         Add to dataframe.
 
         Parameters
@@ -382,19 +373,19 @@ def compute_skew(self, rolling_window=0.25):
             Length of rolling window used to compute variance. Can be specified
             as an absolute value or as a proportion of the length of the
             data being analysed. Default is 0.25.
-        
+
         Returns
         -------
         None.
 
-        '''
-
+        """
+
         # Get time series data prior to transition
         if self.transition:
             df_pre = self.state[self.state.index <= self.transition]
         else:
             df_pre = self.state
-        
+
         # Get absolute size of rollling window if given as a proportion
         if 0 < rolling_window <= 1:
             rw_absolute = int(rolling_window * len(df_pre))
@@ -402,22 +393,20 @@ def compute_skew(self, rolling_window=0.25):
         else:
             rw_absolute = rolling_window
 
         # If residuals column exists, compute over residuals.
-        if 'residuals' in df_pre.columns:
-            skew_values = df_pre['residuals'].rolling(window=rw_absolute).skew()
+        if "residuals" in df_pre.columns:
+            skew_values = df_pre["residuals"].rolling(window=rw_absolute).skew()
         # Else, compute over state variable
         else:
-            skew_values = df_pre['state'].rolling(window=rw_absolute).skew()
-
-        self.ews['skew'] = skew_values
-
+            skew_values = df_pre["state"].rolling(window=rw_absolute).skew()
 
+        self.ews["skew"] = skew_values
 
     def compute_kurt(self, rolling_window=0.25):
-        '''
+        """
         Compute kurtosis over a rolling window.
         If residuals have not been computed, computation will be performed
         over state variable.
-        
+
         Add to dataframe.
 
         Parameters
@@ -426,19 +415,19 @@ def compute_kurt(self, rolling_window=0.25):
             Length of rolling window used to compute variance. Can be specified
             as an absolute value or as a proportion of the length of the
             data being analysed. Default is 0.25.
-        
+
        Returns
         -------
         None.
 
-        '''
+        """
 
         # Get time series data prior to transition
         if self.transition:
             df_pre = self.state[self.state.index <= self.transition]
         else:
             df_pre = self.state
-        
+
         # Get absolute size of rollling window if given as a proportion
         if 0 < rolling_window <= 1:
             rw_absolute = int(rolling_window * len(df_pre))
@@ -446,21 +435,18 @@ def compute_kurt(self, rolling_window=0.25):
         else:
             rw_absolute = rolling_window
 
         # If residuals column exists, compute over residuals.
-        if 'residuals' in df_pre.columns:
-            kurt_values = df_pre['residuals'].rolling(window=rw_absolute).kurt()
+        if "residuals" in df_pre.columns:
+            kurt_values = df_pre["residuals"].rolling(window=rw_absolute).kurt()
         # Else, compute over state variable
         else:
-            kurt_values = df_pre['state'].rolling(window=rw_absolute).kurt()
-
-        self.ews['kurtosis'] = kurt_values
-
+            kurt_values = df_pre["state"].rolling(window=rw_absolute).kurt()
+        self.ews["kurtosis"] = kurt_values
 
-
-    def compute_ktau(self, tmin='earliest', tmax='latest'):
-        '''
+    def compute_ktau(self, tmin="earliest", tmax="latest"):
+        """
         Compute kendall tau values of CSD-based EWS.
-        Output is placed in the attribute *ktau*, which is a Python
+        Output is placed in the attribute *ktau*, which is a Python
         dictionary contianing Kendall tau values for each CSD-based EWS.
 
         Parameters
@@ -473,35 +459,31 @@ def compute_ktau(self, tmin='earliest', tmax='latest'):
             taken as latest time point in EWS time series, which could be
             the end of the state time series, or a defined transtion point.
 
-        '''
-
-
+        """
+
         # Get tmin and tmax values if using extrema
-        if tmin == 'earliest':
+        if tmin == "earliest":
             tmin = self.ews.dropna().index[0]
-
-        if tmax == 'latest':
+
+        if tmax == "latest":
             tmax = self.ews.dropna().index[-1]
-
+
         # Get cropped data
-        df_ews = self.ews[(self.ews.index >= tmin) &\
-                          (self.ews.index <= tmax)].copy()
-
+        df_ews = self.ews[(self.ews.index >= tmin) & (self.ews.index <= tmax)].copy()
+
         # Include smax in Kendall tau computation if it exists
-        if 'smax' in self.ews_spec.columns:
-            df_ews = df_ews.join(self.ews_spec['smax'])
-
+        if "smax" in self.ews_spec.columns:
+            df_ews = df_ews.join(self.ews_spec["smax"])
+
         # Make a series with the time values
         time_values = pd.Series(data=df_ews.index, index=df_ews.index)
         ktau_out = df_ews.corrwith(time_values, method="kendall", axis=0)
 
         self.ktau = dict(ktau_out)
 
-
-    def apply_classifier(self, classifier, tmin, tmax,
-                         name='c1', verbose=1):
-        '''
+    def apply_classifier(self, classifier, tmin, tmax, name="c1", verbose=1):
+        """
         Apply a deep learning classifier to the residual time series from
-        tmin to tmax. 
+        tmin to tmax.
         If time series has not been detrended, apply to the raw data.
         Predictions from the classifier are saved into the attribute dl_preds.
 
         Parameters
@@ -518,55 +500,57 @@ def apply_classifier(self, classifier, tmin, tmax,
         verbose : int, optional
             Verbosity of update messages from TensorFlow.
             0 = silent, 1 = progress bar, 2 = single line. The default is 1.
-        
+
         Returns
         -------
         None.
 
-        '''
-
+        """
+
         # Length of time series required as input to classifier
         input_len = classifier.layers[0].input_shape[1]
-
+
         # Get time series segment. Use residuals if detrending performed.
         # Otherwise use state variable.
-        if 'residuals' in self.state.columns:
-            series = self.state[(self.state.index >= tmin) &\
-                                (self.state.index < tmax)]['residuals']
+        if "residuals" in self.state.columns:
+            series = self.state[(self.state.index >= tmin) & (self.state.index < tmax)][
+                "residuals"
+            ]
         else:
-            series = self.state[(self.state.index >= tmin) &\
-                                (self.state.index < tmax)]['state']
-
+            series = self.state[(self.state.index >= tmin) & (self.state.index < tmax)][
+                "state"
+            ]
+
         # Get values in series
         data = series.values
-
+
         # If time series segment is larger than input dimension of classifier
         if len(data) > input_len:
-            print('ERROR: Length of time series segment is too long for the classifier. You can modify the tmin and tmax parameters to select a smaller segment.')
+            print(
+                "ERROR: Length of time series segment is too long for the classifier. You can modify the tmin and tmax parameters to select a smaller segment."
+            )
             return
-
+
         # Normalise by mean of absolute value
-        data_norm = data/(abs(data).mean())
+        data_norm = data / (abs(data).mean())
 
         # Prepend with zeros to make appropriate length for classifier
         num_zeros = input_len - len(data_norm)
-        input_data = np.concatenate((np.zeros(num_zeros),data_norm)).reshape(1,-1, 1)
-
+        input_data = np.concatenate((np.zeros(num_zeros), data_norm)).reshape(1, -1, 1)
+
         # Get DL prediction
         dl_pred = classifier.predict(input_data, verbose=verbose)[0]
 
         # Put info into dataframe
-        dict_dl_pred = {i:val for (i,val) in zip(np.arange(len(dl_pred)), dl_pred)}
-        dict_dl_pred['time'] = series.index[-1]
-        dict_dl_pred['classifier'] = name
+        dict_dl_pred = {i: val for (i, val) in zip(np.arange(len(dl_pred)), dl_pred)}
+        dict_dl_pred["time"] = series.index[-1]
+        dict_dl_pred["classifier"] = name
         df_dl_pred = pd.DataFrame(dict_dl_pred, index=[len(self.dl_preds)])
 
         # Append to dataframe contiaining DL predictions
         self.dl_preds = pd.concat([self.dl_preds, df_dl_pred], ignore_index=True)
 
-
-    def apply_classifier_inc(self, classifier, inc=10, name='c1', verbose=1):
-        '''
+    def apply_classifier_inc(self, classifier, inc=10, name="c1", verbose=1):
+        """
         Apply a deep learning classifier to incrementally increasing time
         series lengths. First prediction is made on time series segment from
         data point at index 0 to data point at index inc. Second prediction
@@ -583,7 +567,7 @@ def apply_classifier_inc(self, classifier, inc=10, name='c1', verbose=1):
         name : str, optional
             Name assigned to the classifier. The default is 'c1'.
         verbose : int, optional
-            Verbosity of update messages from TensorFlow. 
+            Verbosity of update messages from TensorFlow.
             0 = silent, 1 = progress bar, 2 = single line.
             The default is 1.
 
@@ -591,39 +575,40 @@ def apply_classifier_inc(self, classifier, inc=10, name='c1', verbose=1):
         Returns
         -------
         None.
 
-        '''
-
-        dt = self.state.index[1]-self.state.index[0]
+        """
+
+        dt = self.state.index[1] - self.state.index[0]
         tmin = self.state.index[0]
         tend = self.state.index[-1] if not self.transition else self.transition
-        tend += dt # Make transition point inclusive
-
+        tend += dt  # Make transition point inclusive
+
         # Tmax values for each time series segment
-        tmax_vals = np.arange(tmin+inc, tend+dt, inc)
+        tmax_vals = np.arange(tmin + inc, tend + dt, inc)
         for tmax in tmax_vals:
-            self.apply_classifier(classifier, name=name, tmin=tmin, tmax=tmax,
-                                  verbose=verbose)
-
-
-
+            self.apply_classifier(
+                classifier, name=name, tmin=tmin, tmax=tmax, verbose=verbose
+            )
 
     def clear_dl_preds(self):
-        '''
+        """
         Clear the attribute *dl_preds*
 
         Returns
         -------
         None.
 
-        '''
-
-        self.dl_preds = pd.DataFrame()
-
+        """
+        self.dl_preds = pd.DataFrame()
 
-    def compute_spectrum(self, rolling_window=0.25, ham_length=40,
-                         ham_offset=0.5, pspec_roll_offset=20,
-                         w_cutoff=1):
-        '''
+    def compute_spectrum(
+        self,
+        rolling_window=0.25,
+        ham_length=40,
+        ham_offset=0.5,
+        pspec_roll_offset=20,
+        w_cutoff=1,
+    ):
+        """
         Compute the power spectrum over a rolling window.
         Stores the power spectra as a DataFrame in TimeSeries.pspec
 
         Parameters
         ----------
         rolling_window : float
@@ -634,10 +619,10 @@ def compute_spectrum(self, rolling_window=0.25, ham_length=40,
             as an absolute value or as a proportion of the length of the
             data being analysed. Default is 0.25.
         ham_length : int
-            Length of the Hamming window used to compute the power spectrum. 
+            Length of the Hamming window used to compute the power spectrum.
             The default is 40.
         ham_offset : float
-            Hamming offset as a proportion of the Hamming window size. 
+            Hamming offset as a proportion of the Hamming window size.
             The default is 0.5.
         pspec_roll_offset : int
             Rolling window offset used when computing power spectra. Power spectrum
@@ -651,8 +636,8 @@ def compute_spectrum(self, rolling_window=0.25, ham_length=40,
         Returns
         -------
         None.
 
-        '''
-
+        """
+
         # Get time series data prior to transition
         if self.transition:
             df_pre = self.state[self.state.index <= self.transition]
@@ -666,11 +651,11 @@ def compute_spectrum(self, rolling_window=0.25, ham_length=40,
             rw_absolute = rolling_window
 
         # If residuals column exists, compute over residuals.
-        if 'residuals' in df_pre.columns:
-            series = df_pre['residuals']
+        if "residuals" in df_pre.columns:
+            series = df_pre["residuals"]
         # Else, compute over state variable
         else:
-            series = df_pre['state']
+            series = df_pre["state"]
 
         # Number of components in the residual time-series
         num_comps = len(df_pre)
@@ -708,16 +693,14 @@ def compute_spectrum(self, rolling_window=0.25, ham_length=40,
                 columns={"Power spectrum": "empirical"}, inplace=True
             )
             # Include a column for the time-stamp
-            df_pspec_empirical['time'] = t_point * np.ones(len(pspec))
+            df_pspec_empirical["time"] = t_point * np.ones(len(pspec))
             list_pspec.append(df_pspec_empirical)
-        
+
         # Concatenate power spectra dataframes and store in attribute pspec
         self.pspec = pd.concat(list_pspec)
-
-
     def compute_smax(self):
-        '''
+        """
         Compute Smax (the maximum power in the power spectrum).
         This can only be applied after applying compute_spectrum().
         Stores Smax values in TimeSeries.ews_spec
@@ -726,21 +709,20 @@ def compute_smax(self):
 
         Returns
         -------
         None.
 
-        '''
-
+        """
+
         if self.pspec is None:
-            print('ERROR: The power spectrum must be computed before computing\
+            print(
+                "ERROR: The power spectrum must be computed before computing\
                 spectral EWS such as Smax. The power spectrum can be\
-                computed using compute_pspec()')
-
-        smax_values = self.pspec[['time','power']].groupby(['time']).max()
-        self.ews_spec['smax'] = smax_values
-
-
+                computed using compute_pspec()"
+            )
+
+        smax_values = self.pspec[["time", "power"]].groupby(["time"]).max()
+        self.ews_spec["smax"] = smax_values
 
     def compute_spec_type(self, sweep=False):
-        '''
+        """
         Fit the analytical forms of the Fold, Hopf and Null power spectrum
         to the empirical power spectrum. Get Akaike Information Criterion
         (AIC) weights to determine best fit.
 
         Store AIC weights in TimeSeries.ews_spec
         Store fitted power spectra in TimeSeries.pspec_fits
 
         Parameters
         ----------
         sweep : bool
            If 'True', sweep over a range of intialisation parameters
@@ -759,31 +741,32 @@ def compute_spec_type(self, sweep=False):
 
         Returns
         -------
         None.
 
-        '''
-
-
+        """
+
         if self.pspec is None:
-            print('ERROR: The power spectrum must be computed before computing\
-            the spectrum type. The power spectrum can be\
-            computed using compute_pspec()')
+            print(
+                "ERROR: The power spectrum must be computed before computing\
+                the spectrum type. The power spectrum can be\
+                computed using compute_pspec()"
+            )
+
         list_aic = []
         list_pspec_fits = []
-        
+
         # Loop through time values
-        for time in self.pspec['time'].unique():
-            pspec = self.pspec[self.pspec['time']==time]
-            pspec_series = pspec.set_index('frequency')['power']
-            
+        for time in self.pspec["time"].unique():
+            pspec = self.pspec[self.pspec["time"] == time]
+            pspec_series = pspec.set_index("frequency")["power"]
+
             ## Compute the AIC values
-            metrics = helpers.pspec_metrics(pspec_series, ews=['aic'], sweep=sweep)
+            metrics = helpers.pspec_metrics(pspec_series, ews=["aic"], sweep=sweep)
             dict_aic = {}
-            dict_aic['fold'] = metrics['AIC fold']
-            dict_aic['hopf'] = metrics['AIC hopf']
-            dict_aic['null'] = metrics['AIC null']
-            dict_aic['time'] = time
+            dict_aic["fold"] = metrics["AIC fold"]
+            dict_aic["hopf"] = metrics["AIC hopf"]
+            dict_aic["null"] = metrics["AIC null"]
+            dict_aic["time"] = time
             list_aic.append(dict_aic)
-            
+
             # Generate data to plot the fitted power spectra
 
             # Create fine-scale frequency values
@@ -806,8 +789,8 @@ def compute_spec_type(self, sweep=False):
 
             ## Put spectrum fits into a dataframe
             dict_fits = {
-                'time': time * np.ones(len(wVals)),
-                'frequency': wVals,
+                "time": time * np.ones(len(wVals)),
+                "frequency": wVals,
                 "fit fold": pspec_fold,
                 "fit hopf": pspec_hopf,
                 "fit null": pspec_null,
@@ -816,17 +799,14 @@ def compute_spec_type(self, sweep=False):
             list_pspec_fits.append(df_pspec_fits)
 
         # Concatenate
-        df_aic = pd.DataFrame(list_aic).set_index('time')
+        df_aic = pd.DataFrame(list_aic).set_index("time")
         df_pspec_fits = pd.concat(list_pspec_fits)
 
         self.ews_spec = self.ews_spec.join(df_aic)
         self.pspec_fits = df_pspec_fits
-
-
-
     def make_plotly(self, kendall_tau=True, ens_avg=False):
-        '''
+        """
         Make an interactive Plotly figure to view all EWS computed
 
         Parameters
@@ -836,15 +816,15 @@ def make_plotly(self, kendall_tau=True, ens_avg=False):
         ----------
         kendall_tau : bool, optional
             Set as true to show Kendall tau values (if they have been computed)
             Default is True.
         ens_avg : bool, optional
-            Plot the ensenble average of DL predictions. 
+            Plot the ensemble average of DL predictions.
            Default is False.
 
         Returns
         -------
         Plotly figure
 
-        '''
-
+        """
+
         # Trace colours
         colours = px.colors.qualitative.Plotly
 
         # Count number of panels for Plotly subplots
@@ -852,275 +832,292 @@ def make_plotly(self, kendall_tau=True, ens_avg=False):
         # Always a row for state varialbe
         row_count += 1
         # Row for autocorrelation
-        ac_labels = [s for s in self.ews.columns if s[:2]=='ac']
-        if len(ac_labels)>0:
-            row_count+=1
+        ac_labels = [s for s in self.ews.columns if s[:2] == "ac"]
+        if len(ac_labels) > 0:
+            row_count += 1
         # Row for each other EWS
-        row_count += len(self.ews.columns)-len(ac_labels)
+        row_count += len(self.ews.columns) - len(ac_labels)
         # Row for Smax if included
-        if 'smax' in self.ews_spec.columns:
-            row_count+=1
+        if "smax" in self.ews_spec.columns:
+            row_count += 1
         # Row for AIC weights if computed
-        if 'fold' in self.ews_spec.columns:
-            row_count+=1
+        if "fold" in self.ews_spec.columns:
+            row_count += 1
         # Row for DL predictions if computed
-        if len(self.dl_preds.columns)>0:
-            row_count+=1
-
+        if len(self.dl_preds.columns) > 0:
+            row_count += 1
+
         num_rows = row_count
-        row_count = 1 # reset row counter
-
+        row_count = 1  # reset row counter
+
         # Make Plotly subplots frame
-        fig = make_subplots(rows=num_rows, cols=1,
-                            shared_xaxes=True,
-                            x_title='Time',
-                            vertical_spacing=0.02
-                            )
-
+        fig = make_subplots(
+            rows=num_rows,
+            cols=1,
+            shared_xaxes=True,
+            x_title="Time",
+            vertical_spacing=0.02,
+        )
+
         # Plot state variable
         fig.add_trace(
-            go.Scatter(x=self.state.index.values,
-                       y=self.state['state'].values,
-                       name='state',
-                       ),
-            row=row_count, col=1
-            )
-        fig.update_yaxes(title='State', row=row_count)
+            go.Scatter(
+                x=self.state.index.values,
+                y=self.state["state"].values,
+                name="state",
+            ),
+            row=row_count,
+            col=1,
+        )
+        fig.update_yaxes(title="State", row=row_count)
 
         # Plot smoothing if computed
-        if 'smoothing' in self.state.columns:
+        if "smoothing" in self.state.columns:
             fig.add_trace(
-                go.Scatter(x=self.state.index.values,
-                           y=self.state['smoothing'].values,
-                           name='smoothing',
-                           ),
-                row=row_count, col=1
-                )
-
+                go.Scatter(
+                    x=self.state.index.values,
+                    y=self.state["smoothing"].values,
+                    name="smoothing",
+                ),
+                row=row_count,
+                col=1,
+            )
+
         # Plot variance if computed
-        if 'variance' in self.ews.columns:
+        if "variance" in self.ews.columns:
             row_count += 1
-
+
             # Add kendall tau to name
-            if kendall_tau and ('variance' in self.ktau.keys()):
-                ktau = self.ktau['variance']
-                name = 'variance (ktau={:.2f})'.format(ktau)
+            if kendall_tau and ("variance" in self.ktau.keys()):
+                ktau = self.ktau["variance"]
+                name = "variance (ktau={:.2f})".format(ktau)
             else:
-                name = 'variance'
-
+                name = "variance"
+
             fig.add_trace(
-                go.Scatter(x=self.ews.index.values,
-                           y=self.ews['variance'].values,
-                           name=name,
-                           ),
-                row=row_count, col=1
-                )
-            fig.update_yaxes(title='Variance', row=row_count)
-
+                go.Scatter(
+                    x=self.ews.index.values,
+                    y=self.ews["variance"].values,
+                    name=name,
+                ),
+                row=row_count,
+                col=1,
+            )
+            fig.update_yaxes(title="Variance", row=row_count)
+
         # Plot standard deviation if computed
-        if 'std' in self.ews.columns:
+        if "std" in self.ews.columns:
             row_count += 1
-
+
             # Add kendall tau to name
-            if kendall_tau and ('std' in self.ktau.keys()):
-                ktau = self.ktau['std']
-                name = 'std (ktau={:.2f})'.format(ktau)
+            if kendall_tau and ("std" in self.ktau.keys()):
+                ktau = self.ktau["std"]
+                name = "std (ktau={:.2f})".format(ktau)
             else:
-                name = 'std'
-
+                name = "std"
+
             fig.add_trace(
-                go.Scatter(x=self.ews.index.values,
-                           y=self.ews['std'].values,
-                           name=name,
-                           ),
-                row=row_count, col=1
-                )
-            fig.update_yaxes(title='St. Dev.', row=row_count)
-
+                go.Scatter(
+                    x=self.ews.index.values,
+                    y=self.ews["std"].values,
+                    name=name,
+                ),
+                row=row_count,
+                col=1,
+            )
+            fig.update_yaxes(title="St. Dev.", row=row_count)
+
         # Plot coefficient of variation if computed
-        if 'cv' in self.ews.columns:
+        if "cv" in self.ews.columns:
             row_count += 1
-
+
            # Add kendall tau to name
-            if kendall_tau and ('cv' in self.ktau.keys()):
-                ktau = self.ktau['cv']
-                name = 'cv (ktau={:.2f})'.format(ktau)
+            if kendall_tau and ("cv" in self.ktau.keys()):
+                ktau = self.ktau["cv"]
+                name = "cv (ktau={:.2f})".format(ktau)
             else:
-                name = 'cv'
-
+                name = "cv"
+
             fig.add_trace(
-                go.Scatter(x=self.ews.index.values,
-                           y=self.ews['cv'].values,
-                           name=name,
-                           ),
-                row=row_count, col=1
-                )
-            fig.update_yaxes(title='Coeff. of Var.', row=row_count)
-
-
-
+                go.Scatter(
+                    x=self.ews.index.values,
+                    y=self.ews["cv"].values,
+                    name=name,
+                ),
+                row=row_count,
+                col=1,
+            )
+            fig.update_yaxes(title="Coeff. of Var.", row=row_count)
+
         # Plot autocorrelation metrics if computed
-        if len(ac_labels)!=0:
-            row_count+=1
+        if len(ac_labels) != 0:
+            row_count += 1
             for ac_label in ac_labels:
-
+
                 # Add kendall tau to name
                 if kendall_tau and (ac_label in self.ktau.keys()):
                     ktau = self.ktau[ac_label]
-                    name = '{} (ktau={:.2f})'.format(ac_label, ktau)
+                    name = "{} (ktau={:.2f})".format(ac_label, ktau)
                 else:
-                    name = ac_label
-
+                    name = ac_label
+
                 fig.add_trace(
-                    go.Scatter(x=self.ews.index.values,
-                               y=self.ews[ac_label].values,
-                               name=name,
-                               ),
-                    row=row_count, col=1
-                    )
-            fig.update_yaxes(title='Autocorrelation', row=row_count)
-
-
+                    go.Scatter(
+                        x=self.ews.index.values,
+                        y=self.ews[ac_label].values,
+                        name=name,
+                    ),
+                    row=row_count,
+                    col=1,
+                )
+            fig.update_yaxes(title="Autocorrelation", row=row_count)
+
         # Plot skew if computed
-        if 'skew' in self.ews.columns:
+        if "skew" in self.ews.columns:
             row_count += 1
-
+
             # Add kendall tau to name
-            if kendall_tau and ('skew' in self.ktau.keys()):
-                ktau = self.ktau['skew']
-                name = 'skew (ktau={:.2f})'.format(ktau)
+            if kendall_tau and ("skew" in self.ktau.keys()):
+                ktau = self.ktau["skew"]
                name = "skew (ktau={:.2f})".format(ktau)
             else:
-                name = 'skew'
-
+                name = "skew"
+
             fig.add_trace(
-                go.Scatter(x=self.ews.index.values,
-                           y=self.ews['skew'].values,
-                           name=name,
-                           ),
-                row=row_count, col=1
-                )
-            fig.update_yaxes(title='Skew', row=row_count)
-
+                go.Scatter(
+                    x=self.ews.index.values,
+                    y=self.ews["skew"].values,
+                    name=name,
+                ),
+                row=row_count,
+                col=1,
+            )
+            fig.update_yaxes(title="Skew", row=row_count)
+
         # Plot kurtosis if computed
-        if 'kurtosis' in self.ews.columns:
+        if "kurtosis" in self.ews.columns:
             row_count += 1
-
+
             # Add kendall tau to name
-            if kendall_tau and ('kurtosis' in self.ktau.keys()):
-                ktau = self.ktau['kurtosis']
-                name = 'kurtosis (ktau={:.2f})'.format(ktau)
+            if kendall_tau and ("kurtosis" in self.ktau.keys()):
+                ktau = self.ktau["kurtosis"]
+                name = "kurtosis (ktau={:.2f})".format(ktau)
             else:
-                name = 'kurtosis'
-
+                name = "kurtosis"
+
             fig.add_trace(
-                go.Scatter(x=self.ews.index.values,
-                           y=self.ews['kurtosis'].values,
-                           name=name,
-                           ),
-                row=row_count, col=1
-                )
-            fig.update_yaxes(title='Kurtosis', row=row_count)
-
-
+                go.Scatter(
+                    x=self.ews.index.values,
+                    y=self.ews["kurtosis"].values,
+                    name=name,
+                ),
+                row=row_count,
+                col=1,
+            )
+            fig.update_yaxes(title="Kurtosis", row=row_count)
+
         # Plot Smax if computd
-        if 'smax' in self.ews_spec.columns:
+        if "smax" in self.ews_spec.columns:
             row_count += 1
-
+
             # Add kendall tau to name
-            if kendall_tau and ('smax' in self.ktau.keys()):
-                ktau = self.ktau['smax']
-                name = 'smax (ktau={:.2f})'.format(ktau)
+            if kendall_tau and ("smax" in self.ktau.keys()):
+                ktau = self.ktau["smax"]
+                name = "smax (ktau={:.2f})".format(ktau)
             else:
-                name = 'smax'
-
+                name = "smax"
+
             fig.add_trace(
-                go.Scatter(x=self.ews_spec.index.values,
-                           y=self.ews_spec['smax'].values,
-                           name=name,
-                           ),
-                row=row_count, col=1
-                )
-            fig.update_yaxes(title='Smax', row=row_count)
-
-
+                go.Scatter(
+                    x=self.ews_spec.index.values,
+                    y=self.ews_spec["smax"].values,
+                    name=name,
+                ),
+                row=row_count,
+                col=1,
+            )
+            fig.update_yaxes(title="Smax", row=row_count)
+
         # Plot AIC weights if computd
-        if 'fold' in self.ews_spec.columns:
+        if "fold" in self.ews_spec.columns:
             row_count += 1
-            aic_labels = ['fold', 'hopf', 'null']
+            aic_labels = ["fold", "hopf", "null"]
             for aic_label in aic_labels:
                 fig.add_trace(
-                    go.Scatter(x=self.ews_spec.index.values,
-                               y=self.ews_spec[aic_label].values,
-                               name=aic_label,
-                               ),
-                    row=row_count, col=1
-                    )
-
-            fig.update_yaxes(title='AIC weights', row=row_count)
-
-
-
+                    go.Scatter(
+                        x=self.ews_spec.index.values,
+                        y=self.ews_spec[aic_label].values,
+                        name=aic_label,
+                    ),
+                    row=row_count,
+                    col=1,
+                )
+
+            fig.update_yaxes(title="AIC weights", row=row_count)
+
         # Plot DL predictions if computed
-        if len(self.dl_preds)>0:
+        if len(self.dl_preds) > 0:
             row_count += 1
-            class_labels = [s for s in self.dl_preds.columns if s not in ['time', 'classifier']]
-            classifiers = self.dl_preds['classifier'].unique()
-
+            class_labels = [
+                s for s in self.dl_preds.columns if s not in ["time", "classifier"]
+            ]
+            classifiers = self.dl_preds["classifier"].unique()
+
             # If plotting predictions from every classifier
            if not ens_avg:
                 # Loop through class labels and classifier names
                 for idx, class_label in enumerate(class_labels):
                     for idx2, classifier in enumerate(classifiers):
-                        df_plot = self.dl_preds[self.dl_preds['classifier']==classifier]
+                        df_plot = self.dl_preds[
+                            self.dl_preds["classifier"] == classifier
+                        ]
                         fig.add_trace(
-                            go.Scatter(x=df_plot['time'].values,
-                                       y=df_plot[class_label].values,
-                                       name='DL class {}'.format(class_label),
-                                       legendgroup='DL class {}'.format(class_label),
-                                       showlegend=True if idx2==0 else False,
-                                       line=dict(color=colours[idx])
-                                       ),
-                            row=row_count, col=1
-                            )
-
+                            go.Scatter(
+                                x=df_plot["time"].values,
+                                y=df_plot[class_label].values,
+                                name="DL class {}".format(class_label),
+                                legendgroup="DL class {}".format(class_label),
+                                showlegend=True if idx2 == 0 else False,
+                                line=dict(color=colours[idx]),
+                            ),
+                            row=row_count,
+                            col=1,
+                        )
+
             # If plotting ensemble average over classifiers
             else:
-                df_plot = self.dl_preds.groupby(['time']).mean().reset_index()
+                df = self.dl_preds.drop(["classifier"], axis=1)  # JJV
+                df_plot = df.groupby(["time"]).mean().reset_index()  # JJV
+
                 # Loop through class labels
                 for idx, class_label in enumerate(class_labels):
                     fig.add_trace(
-                        go.Scatter(x=df_plot['time'].values,
-                                   y=df_plot[class_label].values,
-                                   name='DL class {}'.format(class_label),
-                                   line=dict(color=colours[idx])
-                                   ),
-                        row=row_count, col=1
-                        )
-
-            fig.update_yaxes(title='DL predictions', row=row_count)
-
-
-
-
-        # Set figure dimensions
-        fig.update_layout(height=200*num_rows,
-                          width=800)
-
-        # Separation between legend entries
-        fig.layout.legend.tracegroupgap = 0
-
-        return fig
+                        go.Scatter(
+                            x=df_plot["time"].values,
+                            y=df_plot[class_label].values,
+                            name="DL class {}".format(class_label),
+                            line=dict(color=colours[idx]),
+                        ),
+                        row=row_count,
+                        col=1,
+                    )
+            fig.update_yaxes(title="DL predictions", row=row_count)
+        # Set figure dimensions
+        fig.update_layout(height=200 * num_rows, width=800)
+        # Separation between legend entries
+        fig.layout.legend.tracegroupgap = 0
+        return fig
 
 
 # -----------------------------
 # Eigenvalue reconstruction
 # ------------------------------
 
+
 def eval_recon_rolling(
     df_in,
     roll_window=0.4,
@@ -1240,13 +1237,13 @@ def eval_recon_rolling(
         # Do eigenvalue reconstruction on residuals
         dic_eval_recon = helpers.eval_recon(df_window)
         # Add time component
-        dic_eval_recon['time'] = t_point
+        dic_eval_recon["time"] = t_point
        # Add them to list
         list_evaldata.append(dic_eval_recon)
 
     # Create dataframe from list of dicts of eval data
     df_evaldata = pd.DataFrame(list_evaldata)
-    df_evaldata.set_index('time', inplace=True)
+    df_evaldata.set_index("time", inplace=True)
 
     # Create output dataframe that merges all useful info
     df_out = pd.concat(
@@ -1305,7 +1302,7 @@ def block_bootstrap(series, n_samples, bs_type="Stationary", block_size=10):
 
         for data in bs.bootstrap(n_samples):
             df_temp = pd.DataFrame(
-                {"sample": count, 'time': series.index.values, "x": data[0][0]}
+                {"sample": count, "time": series.index.values, "x": data[0][0]}
             )
             list_samples.append(df_temp)
             count += 1
@@ -1318,14 +1315,14 @@ def block_bootstrap(series, n_samples, bs_type="Stationary", block_size=10):
 
         for data in bs.bootstrap(n_samples):
             df_temp = pd.DataFrame(
-                {"sample": count, 'time': series.index.values, "x": data[0][0]}
+                {"sample": count, "time": series.index.values, "x": data[0][0]}
            )
             list_samples.append(df_temp)
             count += 1
 
     # Concatenate list of samples
     df_samples = pd.concat(list_samples)
-    df_samples.set_index(["sample", 'time'], inplace=True)
+    df_samples.set_index(["sample", "time"], inplace=True)
 
     # Output DataFrame of samples
     return df_samples
@@ -1427,15 +1424,15 @@ def roll_bootstrap(
 
         # Compute bootstrap samples of residauls within rolling window
        df_samples_temp = block_bootstrap(window_series, n_samples, bs_type, block_size)
         df_samples_temp.reset_index(inplace=True)
-        df_samples_temp['wintime'] = df_samples_temp['time']
-
+        df_samples_temp["wintime"] = df_samples_temp["time"]
+
         # Add column with real time (end of window)
-        df_samples_temp['time'] = t_point
+        df_samples_temp["time"] = t_point
 
         # Reorganise index
-
+
         df_samples_temp.reset_index(inplace=True)
-        df_samples_temp.set_index(['time', "sample", 'wintime'], inplace=True)
+        df_samples_temp.set_index(["time", "sample", "wintime"], inplace=True)
 
         # Append the list of samples
         list_samples.append(df_samples_temp)
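
For context on the two lines marked `# JJV` in `make_plotly` above, here is a minimal, self-contained sketch (not part of the patch) of the failure mode this commit fixes. The column layout mirrors `TimeSeries.dl_preds` — a `time` column, numeric class-probability columns, and a non-numeric `classifier` label — but the values are invented for illustration:

```python
import pandas as pd

# Toy stand-in for TimeSeries.dl_preds: predictions from two classifiers
# ("c1", "c2") at two time points; the probability values are made up.
dl_preds = pd.DataFrame(
    {
        "time": [100, 100, 200, 200],
        0: [0.9, 0.8, 0.4, 0.3],  # class-0 probability
        1: [0.1, 0.2, 0.6, 0.7],  # class-1 probability
        "classifier": ["c1", "c2", "c1", "c2"],  # non-numeric column
    }
)

# Old behaviour: averaging over a frame that still contains the string
# column 'classifier'. On recent pandas releases this raises a TypeError
# rather than silently dropping the non-numeric column as older versions did.
# dl_preds.groupby(["time"]).mean()  # TypeError: could not convert string to float

# Fixed behaviour, as in make_plotly above: drop the label column first,
# then take the ensemble average over classifiers at each time point.
df = dl_preds.drop(["classifier"], axis=1)
ens_avg = df.groupby(["time"]).mean().reset_index()
print(ens_avg)
```

Passing `numeric_only=True` to `.mean()` would arguably achieve the same result; dropping the column explicitly, as the patch does, keeps the intent visible and behaves consistently across pandas versions.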