Feat/reminder interval #614

Open
wants to merge 10 commits into base: master
1 change: 1 addition & 0 deletions README.md
@@ -227,6 +227,7 @@ If you want to test it locally, see [Docker](#docker).
| `endpoints[].alerts[].success-threshold` | Number of successes in a row before an ongoing incident is marked as resolved. | `2` |
| `endpoints[].alerts[].send-on-resolved` | Whether to send a notification once a triggered alert is marked as resolved. | `false` |
| `endpoints[].alerts[].description` | Description of the alert. Will be included in the alert sent. | `""` |
| `endpoints[].alerts[].reminder-interval` | Interval at which reminder notifications are re-sent while an alert remains triggered. Reminders are disabled if left empty. See the example below. | `""` |
| `endpoints[].client` | [Client configuration](#client-configuration). | `{}` |
| `endpoints[].ui` | UI configuration at the endpoint level. | `{}` |
| `endpoints[].ui.hide-hostname` | Whether to hide the hostname in the result. | `false` |
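To illustrate how the new option slots into an endpoint definition, here is a minimal sketch of an alert configuration using `reminder-interval`. The endpoint name, URL, alert type, and the `10m` value are illustrative and not part of this PR; the duration string format is assumed to match the other duration fields documented in this README (e.g. `30s`, `5m`).

```yaml
endpoints:
  - name: example                 # illustrative endpoint name
    url: "https://example.org"    # illustrative URL
    interval: 1m
    conditions:
      - "[STATUS] == 200"
    alerts:
      - type: slack
        failure-threshold: 3
        success-threshold: 2
        send-on-resolved: true
        description: "healthcheck failed"
        # Assumption: while the alert stays triggered, a reminder is re-sent every 10 minutes
        reminder-interval: 10m
```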
4 changes: 4 additions & 0 deletions alerting/alert/alert.go
@@ -3,6 +3,7 @@ package alert
import (
"errors"
"strings"
"time"
)

var (
@@ -45,6 +46,9 @@ type Alert struct {
// ongoing/triggered incidents
ResolveKey string `yaml:"-"`

// ReminderInterval is the interval at which reminder notifications are re-sent while the alert
// remains triggered. If it is zero (the default), no reminders are sent.
ReminderInterval time.Duration `yaml:"reminder-interval,omitempty"`

// Triggered is used to determine whether an alert has been triggered. When an alert is resolved, this value
// should be set back to false. It is used to prevent the same alert from going out twice.
//
3 changes: 3 additions & 0 deletions core/endpoint.go
@@ -130,6 +130,9 @@ type Endpoint struct {

// SSH is the configuration of SSH monitoring.
SSH *SSH `yaml:"ssh,omitempty"`

// LastReminderSent is the time at which the last alert or reminder was sent for this endpoint.
// It is used to determine when the next reminder is due.
LastReminderSent time.Time `yaml:"-"`
}

type SSH struct {
26 changes: 20 additions & 6 deletions watchdog/alerting.go
@@ -4,6 +4,7 @@ import (
"errors"
"log"
"os"
"time"

"github.com/TwiN/gatus/v5/alerting"
"github.com/TwiN/gatus/v5/core"
@@ -25,20 +26,29 @@ func handleAlertsToTrigger(endpoint *core.Endpoint, result *core.Result, alertin
endpoint.NumberOfSuccessesInARow = 0
endpoint.NumberOfFailuresInARow++
for _, endpointAlert := range endpoint.Alerts {
// If the alert hasn't been triggered, move to the next one
// Skip alerts that are disabled or whose failure threshold hasn't been reached yet
if !endpointAlert.IsEnabled() || endpointAlert.FailureThreshold > endpoint.NumberOfFailuresInARow {
continue
}
if endpointAlert.Triggered {
// Determine if an initial alert should be sent
sendInitialAlert := !endpointAlert.Triggered
// Determine if a reminder should be sent
sendReminder := endpointAlert.Triggered && endpointAlert.ReminderInterval > 0 && time.Since(endpoint.LastReminderSent) >= endpointAlert.ReminderInterval
// If neither initial alert nor reminder needs to be sent, skip to the next alert
if !sendInitialAlert && !sendReminder {
if debug {
log.Printf("[watchdog][handleAlertsToTrigger] Alert for endpoint=%s with description='%s' has already been TRIGGERED, skipping", endpoint.Name, endpointAlert.GetDescription())
log.Printf("[watchdog][handleAlertsToTrigger] Alert for endpoint=%s with description='%s' is not due for triggering or reminding, skipping", endpoint.Name, endpointAlert.GetDescription())
}
continue
}
alertProvider := alertingConfig.GetAlertingProviderByAlertType(endpointAlert.Type)
if alertProvider != nil {
log.Printf("[watchdog][handleAlertsToTrigger] Sending %s alert because alert for endpoint=%s with description='%s' has been TRIGGERED", endpointAlert.Type, endpoint.Name, endpointAlert.GetDescription())
var err error
alertType := "reminder"
if sendInitialAlert {
alertType = "initial"
}
log.Printf("[watchdog][handleAlertsToTrigger] Sending %s %s alert because alert for endpoint=%s with description='%s' has been TRIGGERED", alertType, endpointAlert.Type, endpoint.Name, endpointAlert.GetDescription())
if os.Getenv("MOCK_ALERT_PROVIDER") == "true" {
if os.Getenv("MOCK_ALERT_PROVIDER_ERROR") == "true" {
err = errors.New("error")
@@ -49,10 +59,14 @@
if err != nil {
log.Printf("[watchdog][handleAlertsToTrigger] Failed to send an alert for endpoint=%s: %s", endpoint.Name, err.Error())
} else {
endpointAlert.Triggered = true
// Mark initial alert as triggered and update last reminder time
if sendInitialAlert {
endpointAlert.Triggered = true
}
endpoint.LastReminderSent = time.Now()
}
} else {
log.Printf("[watchdog][handleAlertsToResolve] Not sending alert of type=%s despite being TRIGGERED, because the provider wasn't configured properly", endpointAlert.Type)
log.Printf("[watchdog][handleAlertsToResolve] Not sending alert of type=%s despite being due, because the provider wasn't configured properly", endpointAlert.Type)
}
}
}
42 changes: 42 additions & 0 deletions watchdog/alerting_test.go
@@ -3,6 +3,7 @@ package watchdog
import (
"os"
"testing"
"time"

"github.com/TwiN/gatus/v5/alerting"
"github.com/TwiN/gatus/v5/alerting/alert"
@@ -475,6 +476,47 @@ func TestHandleAlertingWithProviderThatOnlyReturnsErrorOnResolve(t *testing.T) {
verify(t, endpoint, 0, 2, false, "")
}

func TestHandleAlertingWithReminderInterval(t *testing.T) {
_ = os.Setenv("MOCK_ALERT_PROVIDER", "true")
defer os.Clearenv()

cfg := &config.Config{
Debug: true,
Alerting: &alerting.Config{
Custom: &custom.AlertProvider{
URL: "https://twin.sh/health",
Method: "GET",
},
},
}
enabled := true
endpoint := &core.Endpoint{
URL: "https://example.com",
Alerts: []*alert.Alert{
{
Type: alert.TypeCustom,
Enabled: &enabled,
FailureThreshold: 2,
SuccessThreshold: 3,
SendOnResolved: &enabled,
Triggered: false,
ReminderInterval: 1 * time.Second,
},
},
}

verify(t, endpoint, 0, 0, false, "The alert shouldn't start triggered")
HandleAlerting(endpoint, &core.Result{Success: false}, cfg.Alerting, cfg.Debug)
verify(t, endpoint, 1, 0, false, "The alert shouldn't have triggered")
HandleAlerting(endpoint, &core.Result{Success: false}, cfg.Alerting, cfg.Debug)
verify(t, endpoint, 2, 0, true, "The alert should've triggered")
HandleAlerting(endpoint, &core.Result{Success: false}, cfg.Alerting, cfg.Debug)
verify(t, endpoint, 3, 0, true, "The alert should still be triggered")
HandleAlerting(endpoint, &core.Result{Success: false}, cfg.Alerting, cfg.Debug)
verify(t, endpoint, 4, 0, true, "The alert should still be triggered")
HandleAlerting(endpoint, &core.Result{Success: true}, cfg.Alerting, cfg.Debug)
}

func verify(t *testing.T, endpoint *core.Endpoint, expectedNumberOfFailuresInARow, expectedNumberOfSuccessInARow int, expectedTriggered bool, expectedTriggeredReason string) {
if endpoint.NumberOfFailuresInARow != expectedNumberOfFailuresInARow {
t.Errorf("endpoint.NumberOfFailuresInARow should've been %d, got %d", expectedNumberOfFailuresInARow, endpoint.NumberOfFailuresInARow)