Skip to content

Commit

Permalink
Add heartbeat (#847)
Browse files Browse the repository at this point in the history
  • Loading branch information
lou-lan authored Oct 30, 2023
1 parent a5a5945 commit e090e9c
Show file tree
Hide file tree
Showing 16 changed files with 384 additions and 60 deletions.
9 changes: 9 additions & 0 deletions charts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,15 @@ helm install egressgateway egressgateway/egressgateway --namespace kube-system
| `feature.maxNumberEndpointPerSlice` | max number of endpoints per slice | `100` |
| `feature.announcedInterfacesToExclude` | The list of network interface excluded for announcing Egress IP. | `["^cali.*","br-*"]` |

### feature.gatewayFailover Enable gateway failover.

| Name | Description | Value |
| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
| `feature.gatewayFailover.enable` | Enable gateway failover, default `false`. | `false` |
| `feature.gatewayFailover.tunnelMonitorPeriod` | The egress controller checks the tunnel's last update status at an interval set in seconds, default `5`. | `5` |
| `feature.gatewayFailover.tunnelUpdatePeriod` | The egress agent updates the tunnel status at an interval set in seconds, default `5`. | `5` |
| `feature.gatewayFailover.eipEvictionTimeout` | If the last updated time of the egress tunnel exceeds this time, move the Egress IP of the node to an available node, the unit is seconds, default is `15`. | `15` |

### Egressgateway agent parameters

| Name | Description | Value |
Expand Down
5 changes: 4 additions & 1 deletion charts/crds/egressgateway.spidernet.io_egresstunnels.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ spec:
type: object
status:
properties:
lastHeartbeatTime:
format: date-time
type: string
mark:
type: string
phase:
Expand All @@ -68,7 +71,7 @@ spec:
- Init
- Failed
- Ready
- ""
- HeartbeatTimeout
type: string
tunnel:
properties:
Expand Down
10 changes: 10 additions & 0 deletions charts/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,16 @@ feature:
announcedInterfacesToExclude:
- "^cali.*"
- "br-*"
## @section feature.gatewayFailover Enable gateway failover.
gatewayFailover:
## @param feature.gatewayFailover.enable Enable gateway failover, default `false`.
enable: false
## @param feature.gatewayFailover.tunnelMonitorPeriod The egress controller checks the tunnel's last update status at an interval set in seconds, default `5`.
tunnelMonitorPeriod: 5
## @param feature.gatewayFailover.tunnelUpdatePeriod The egress agent updates the tunnel status at an interval set in seconds, default `5`.
tunnelUpdatePeriod: 5
## @param feature.gatewayFailover.eipEvictionTimeout If the last updated time of the egress tunnel exceeds this time, move the Egress IP of the node to an available node, the unit is seconds, default is `15`.
eipEvictionTimeout: 15

## @section Egressgateway agent parameters
##
Expand Down
18 changes: 18 additions & 0 deletions cmd/controller/cmd/clean.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

egressv1 "github.com/spidernet-io/egressgateway/pkg/k8s/apis/v1beta1"
"github.com/spidernet-io/egressgateway/pkg/schema"
)

Expand Down Expand Up @@ -72,5 +73,22 @@ func clean(validate, mutating string) error {
return err
}
}

list := new(egressv1.EgressTunnelList)
err = cli.List(ctx, list)
if err == nil {
for _, item := range list.Items {
item.Finalizers = make([]string, 0)
err := cli.Update(ctx, &item)
if err != nil {
return err
}
err = cli.Delete(ctx, &item)
if err != nil {
return err
}
}
}

return nil
}
55 changes: 55 additions & 0 deletions docs/usage/EgressGatewayFailover.en.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# EgressGateway Failover

## Controller Failover

To make the EgressGateway control plane tolerate failures, set the number of Controller replicas with the `controller.replicas` parameter during installation. If one of the Controller replicas fails, the system automatically elects another replica as the primary controller to ensure service continuity.

## Datapath Failover

For datapath failover, use `nodeSelector` when creating an `EgressGateway` to select a set of nodes as Egress Nodes. The Egress IP is bound to one of these nodes. When a node fails, or the Egress Agent on a node fails, the Egress IP automatically moves to another available node to ensure service continuity and reliability.

```yaml
apiVersion: egressgateway.spidernet.io/v1beta1
kind: EgressGateway
metadata:
name: egw1
spec:
clusterDefault: true
ippools:
ipv4:
- 10.6.1.55
- 10.6.1.56
ipv4DefaultEIP: 10.6.1.56
ipv6:
- fd00::55
- fd00::56
ipv6DefaultEIP: fd00::55
nodeSelector:
selector:
matchLabels:
egress: "true"
status:
nodeList:
- eips:
- ipv4: 10.6.1.56
ipv6: fd00::55
policies:
- name: policy1
namespace: default
name: workstation2
status: Ready
- name: workstation3
status: Ready
```
The timeout for health checks and Egress IP failover can be tuned via Helm values configuration.
* `feature.gatewayFailover.tunnelMonitorPeriod` The egress controller checks the tunnel's last update status at an interval set in seconds, default `5`.
* `feature.gatewayFailover.tunnelUpdatePeriod` The egress agent updates the tunnel status at an interval set in seconds, default `5`.
* `feature.gatewayFailover.eipEvictionTimeout` If the last updated time of the egress tunnel exceeds this time, move the Egress IP of the node to an available node, the unit is seconds, default is `15`.

Datapath Failover troubleshooting steps:

1. First, check the installation configuration file `values.yaml` of the EgressGateway application to ensure failover related configurations are set reasonably, in particular ensuring `eipEvictionTimeout` is greater than the sum of `tunnelMonitorPeriod` and `tunnelUpdatePeriod`.
2. Execute `kubectl get egt -w` to check the status of `EgressTunnel`. Check if the selected Node is in `HeartbeatTimeout` state, and if there are other `EgressTunnel` in `Ready` state.
3. If you want to check if there has been an IP switch caused by HeartbeatTimeout, you can retrieve the logs related to `update tunnel status to HeartbeatTimeout` in the controller container.
55 changes: 55 additions & 0 deletions docs/usage/EgressGatewayFailover.zh.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# EgressGateway Failover

## Controller Failover

EgressGateway 控制面故障转移时,可以通过在安装时指定 `controller.replicas` 参数来控制 Controller 的副本数量。在多个 Controller 副本中的一个发生故障时,系统会自动选择另一个副本作为主要控制器,以确保服务的持续提供。

## Datapath Failover

在处理数据面故障转移时,创建 EgressGateway 可以通过使用 `nodeSelector` 来选择一组节点作为 Egress Node。Egress IP 将会绑定到其中的一个节点上。当某个节点发生故障或者节点上的 Egress Agent 故障时,Egress IP 将会自动转移到另一个可用节点上,从而保证服务的连续性和可靠性。

```yaml
apiVersion: egressgateway.spidernet.io/v1beta1
kind: EgressGateway
metadata:
name: egw1
spec:
clusterDefault: true
ippools:
ipv4:
- 10.6.1.55
- 10.6.1.56
ipv4DefaultEIP: 10.6.1.56
ipv6:
- fd00::55
- fd00::56
ipv6DefaultEIP: fd00::55
nodeSelector:
selector:
matchLabels:
egress: "true"
status:
nodeList:
- eips:
- ipv4: 10.6.1.56
ipv6: fd00::55
policies:
- name: policy1
namespace: default
name: workstation2
status: Ready
- name: workstation3
status: Ready
```
通过 Helm 的 values 配置,可以调整状态检测和 Egress IP 转移的时间。
* `feature.gatewayFailover.tunnelMonitorPeriod`:Egress Controller 以秒为单位设置的间隔检查 EgressTunnel 的最后更新状态,默认为 `5`。
* `feature.gatewayFailover.tunnelUpdatePeriod`:Egress Agent 以秒为单位设置的间隔更新 EgressTunnel 状态,默认为 `5`。
* `feature.gatewayFailover.eipEvictionTimeout`:如果 EgressTunnel 的最后更新时间超过此时间,则将节点的 Egress IP 移动到另一个可用节点,单位为秒,默认为 `15`。

Datapath Failover 问题排查步骤:

1. 首先,查看 EgressGateway 应用的安装配置文件 `values.yaml`,确认与 Datapath Failover 相关的配置是否设置合理,特别是确保 `eipEvictionTimeout` 的值大于 `tunnelMonitorPeriod` 加上 `tunnelUpdatePeriod` 的总和;
2. 执行 `kubectl get egt -w` 命令,检查 `EgressTunnel` 的状态。检查选中的 Node 是否处于 `HeartbeatTimeout` 状态,并且是否存在其他处于 `Ready` 状态的 `EgressTunnel`;
3. 如果想查询是否出现过 HeartbeatTimeout 导致的 IP 切换,可以在 controller 容器检索 `update tunnel status to HeartbeatTimeout` 相关的日志。
10 changes: 6 additions & 4 deletions pkg/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,13 @@ type Agent struct {
func New(cfg *config.Config) (types.Service, error) {
syncPeriod := time.Second * 15
log := logger.NewLogger(cfg.EnvConfig.Logger)
t := time.Duration(0)
mgrOpts := manager.Options{
Scheme: schema.GetScheme(),
Logger: log,
HealthProbeBindAddress: cfg.HealthProbeBindAddress,
SyncPeriod: &syncPeriod,
Scheme: schema.GetScheme(),
Logger: log,
HealthProbeBindAddress: cfg.HealthProbeBindAddress,
SyncPeriod: &syncPeriod,
GracefulShutdownTimeout: &t,
}

if cfg.MetricsBindAddress != "" {
Expand Down
88 changes: 73 additions & 15 deletions pkg/agent/vxlan.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package agent

import (
"context"
"errors"
"fmt"
"net"
"strconv"
Expand All @@ -14,7 +15,7 @@ import (

"github.com/go-logr/logr"
"github.com/vishvananda/netlink"
"k8s.io/apimachinery/pkg/api/errors"
k8sErr "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand All @@ -31,6 +32,8 @@ import (
"github.com/spidernet-io/egressgateway/pkg/utils"
)

// ErrHeartbeatTime is the sentinel error returned by syncLastHeartbeatTime
// when writing the tunnel heartbeat fails with a context deadline error.
var ErrHeartbeatTime = errors.New("heartbeat time")

type vxlanReconciler struct {
client client.Client
log logr.Logger
Expand All @@ -44,6 +47,8 @@ type vxlanReconciler struct {

ruleRoute *route.RuleRoute
ruleRouteCache *utils.SyncMap[string, []net.IP]

updateTimer *time.Timer
}

type VTEP struct {
Expand Down Expand Up @@ -380,7 +385,7 @@ func (r *vxlanReconciler) reconcileEgressTunnel(ctx context.Context, req reconci
deleted := false
err := r.client.Get(ctx, req.NamespacedName, node)
if err != nil {
if !errors.IsNotFound(err) {
if !k8sErr.IsNotFound(err) {
return reconcile.Result{}, err
}
deleted = true
Expand Down Expand Up @@ -472,7 +477,7 @@ func (r *vxlanReconciler) getEgressTunnelByEgressGateway(ctx context.Context, na
egw := &egressv1.EgressGateway{}
err := r.client.Get(ctx, types.NamespacedName{Name: name}, egw)
if err != nil {
if errors.IsNotFound(err) {
if k8sErr.IsNotFound(err) {
return res, nil
}
return nil, err
Expand Down Expand Up @@ -535,7 +540,7 @@ func (r *vxlanReconciler) updateEgressTunnelStatus(node *egressv1.EgressTunnel,
ctx := context.Background()
err = r.client.Get(ctx, types.NamespacedName{Name: r.cfg.NodeName}, node)
if err != nil {
if errors.IsNotFound(err) {
if k8sErr.IsNotFound(err) {
return nil
}
return err
Expand Down Expand Up @@ -579,16 +584,7 @@ func (r *vxlanReconciler) updateEgressTunnelStatus(node *egressv1.EgressTunnel,
}

if needUpdate {
r.log.Info("update node status",
"phase", node.Status.Phase,
"tunnelIPv4", node.Status.Tunnel.IPv4,
"tunnelIPv6", node.Status.Tunnel.IPv6,
"parentName", node.Status.Tunnel.Parent.Name,
"parentIPv4", node.Status.Tunnel.Parent.IPv4,
"parentIPv6", node.Status.Tunnel.Parent.IPv6,
)
ctx := context.Background()
err = r.client.Status().Update(ctx, node)
err := r.updateTunnelStatus(node)
if err != nil {
return err
}
Expand All @@ -597,6 +593,39 @@ func (r *vxlanReconciler) updateEgressTunnelStatus(node *egressv1.EgressTunnel,
return nil
}

// syncLastHeartbeatTime runs the agent heartbeat loop. Each time updateTimer
// fires it fetches this node's EgressTunnel (named after cfg.NodeName) and
// calls updateTunnelStatus, which stamps Status.LastHeartbeatTime and re-arms
// the timer for the next period on success.
//
// It returns nil once ctx is cancelled and ErrHeartbeatTime when the status
// update runs into its deadline. Every other failure is retried after one
// second.
func (r *vxlanReconciler) syncLastHeartbeatTime(ctx context.Context) error {
	r.log.Info("start sync heartbeat")
	for {
		select {
		case <-ctx.Done():
			r.log.Info("heartbeat context done")
			return nil
		case <-r.updateTimer.C:
			tunnel := new(egressv1.EgressTunnel)
			err := r.client.Get(context.Background(), types.NamespacedName{Name: r.cfg.NodeName}, tunnel)
			if err != nil {
				// A missing tunnel is not worth an error log, but the timer
				// still has to be re-armed: it has just fired, and without a
				// Reset this loop would never wake up again on its own.
				if !k8sErr.IsNotFound(err) {
					r.log.Error(err, "update tunnel status")
				}
				r.updateTimer.Reset(time.Second)
				continue
			}

			r.log.V(1).Info("update tunnel last heartbeat time")
			if err := r.updateTunnelStatus(tunnel); err != nil {
				// Prefer the typed check; keep the string match as a fallback
				// for errors that carry the text without wrapping the
				// sentinel.
				if errors.Is(err, context.DeadlineExceeded) ||
					strings.Contains(err.Error(), "context deadline exceeded") {
					return ErrHeartbeatTime
				}
				r.log.Error(err, "update tunnel status")
				r.updateTimer.Reset(time.Second)
			}
			// On success updateTunnelStatus has already re-armed the timer.
		}
	}
}

func (r *vxlanReconciler) parseVTEP(status egressv1.EgressTunnelStatus) *vxlan.Peer {
var ipv4 *net.IP
var ipv6 *net.IP
Expand Down Expand Up @@ -624,7 +653,6 @@ func (r *vxlanReconciler) parseVTEP(status egressv1.EgressTunnelStatus) *vxlan.P
ipv6 = &ip
}
}

mac, err := net.ParseMAC(status.Tunnel.MAC)
if err != nil {
ready = false
Expand Down Expand Up @@ -735,6 +763,27 @@ func (r *vxlanReconciler) keepVXLAN() {
}
}

// updateTunnelStatus stamps node.Status.LastHeartbeatTime with the current
// time and writes the status subresource through the client. The write is
// bounded by GatewayFailover.EipEvictionTimeout seconds. After a successful
// write the heartbeat timer is re-armed to fire in
// GatewayFailover.TunnelUpdatePeriod seconds; on failure the error is returned
// and the timer is left as it was.
func (r *vxlanReconciler) updateTunnelStatus(node *egressv1.EgressTunnel) error {
	writeTimeout := time.Duration(r.cfg.FileConfig.GatewayFailover.EipEvictionTimeout) * time.Second
	writeCtx, release := context.WithTimeout(context.Background(), writeTimeout)
	defer release()

	st := &node.Status
	st.LastHeartbeatTime = metav1.Now()
	r.log.Info("update tunnel status",
		"phase", st.Phase,
		"tunnelIPv4", st.Tunnel.IPv4,
		"tunnelIPv6", st.Tunnel.IPv6,
		"parentName", st.Tunnel.Parent.Name,
		"parentIPv4", st.Tunnel.Parent.IPv4,
		"parentIPv6", st.Tunnel.Parent.IPv6,
	)

	if err := r.client.Status().Update(writeCtx, node); err != nil {
		return err
	}

	// Schedule the next heartbeat only once this one has been persisted.
	nextBeat := time.Duration(r.cfg.FileConfig.GatewayFailover.TunnelUpdatePeriod) * time.Second
	r.updateTimer.Reset(nextBeat)
	return nil
}

func (r *vxlanReconciler) ensureRoute() error {
neighList, err := r.vxlan.ListNeigh()
if err != nil {
Expand Down Expand Up @@ -811,6 +860,10 @@ func (r *vxlanReconciler) keepReplayRoute() {
}
}

// Start runs the heartbeat loop by delegating to syncLastHeartbeatTime and
// returns that call's result: nil when ctx is cancelled, ErrHeartbeatTime when
// a heartbeat write hits its deadline. This method is what lets the reconciler
// be passed to mgr.Add in newEgressTunnelController.
func (r *vxlanReconciler) Start(ctx context.Context) error {
return r.syncLastHeartbeatTime(ctx)
}

func parseMarkToInt(mark string) (int, error) {
tmp := strings.ReplaceAll(mark, "0x", "")
i64, err := strconv.ParseInt(tmp, 16, 32)
Expand All @@ -832,6 +885,7 @@ func newEgressTunnelController(mgr manager.Manager, cfg *config.Config, log logr
peerMap: utils.NewSyncMap[string, vxlan.Peer](),
ruleRoute: ruleRoute,
ruleRouteCache: utils.NewSyncMap[string, []net.IP](),
updateTimer: time.NewTimer(time.Second * time.Duration(cfg.FileConfig.GatewayFailover.TunnelUpdatePeriod)),
}

netLink := vxlan.NetLink{
Expand All @@ -852,6 +906,10 @@ func newEgressTunnelController(mgr manager.Manager, cfg *config.Config, log logr
if err != nil {
return err
}
err = mgr.Add(r)
if err != nil {
return err
}

if err := c.Watch(source.Kind(mgr.GetCache(), &egressv1.EgressTunnel{}),
handler.EnqueueRequestsFromMapFunc(utils.KindToMapFlat("EgressTunnel"))); err != nil {
Expand Down
Loading

0 comments on commit e090e9c

Please sign in to comment.