From 833e3de24a5068e1ed57aa044d12b8df68f529f3 Mon Sep 17 00:00:00 2001 From: Nick Muerdter Date: Mon, 18 Jun 2018 21:05:46 -0600 Subject: [PATCH] Improve keepalive handling. - Tweak how the upstreams are set up to prevent temporary connection failures from removing the servers from rotation. - Allow connection retries to upstreams in the event of connection failures. - Enable so_keepalive on listening sockets (I don't necessarily think this will help with the upstream keepalive issues, but it is probably a good idea, and could help with keepalive behavior to any front-facing load balancers). This cropped up after introducing an AWS NAT Gateway into our stack, which closes inactive keepalive connections after 5 minutes: https://docs.aws.amazon.com/AmazonVPC/latest/UserGuide/vpc-nat-gateway.html#nat-gateway-troubleshooting-timeout See https://github.com/18F/api.data.gov/issues/446 --- src/api-umbrella/proxy/load_backends.lua | 18 +++++++++++++++++- templates/etc/nginx/router.conf.mustache | 21 +++++++++++---------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/src/api-umbrella/proxy/load_backends.lua b/src/api-umbrella/proxy/load_backends.lua index 171b61bc3..720c6cc16 100644 --- a/src/api-umbrella/proxy/load_backends.lua +++ b/src/api-umbrella/proxy/load_backends.lua @@ -45,7 +45,23 @@ local function generate_upstream_config(api) nginx_ip = ip end - table.insert(servers, "server " .. nginx_ip .. ":" .. server["port"] .. ";") + -- Insert 5 copies of the server, and set max_fails=0. In combination + -- with the global "proxy_next_upstream error" setting, this allows + -- for the API backend requests to retry up to 5 times if a + -- connection was never actually established. + -- + -- This is a bit of a hack, but this helps deal with upstream + -- keepalive connections that might get closed (either by the API + -- backend or some other firewall or NAT in between). 
+ -- + -- max_fails=0 is important so that single servers don't get + -- completely removed from rotation (for fail_timeout) if a single + -- request fails. By repeating the same server IP multiple times, + -- this also gives proxy_next_upstream a chance to failover and retry + -- the same server. + for i = 1, 5 do + table.insert(servers, "server " .. nginx_ip .. ":" .. server["port"] .. " max_fails=0;") + end end end end diff --git a/templates/etc/nginx/router.conf.mustache b/templates/etc/nginx/router.conf.mustache index 39f24d6f7..a3cf4593f 100644 --- a/templates/etc/nginx/router.conf.mustache +++ b/templates/etc/nginx/router.conf.mustache @@ -30,7 +30,8 @@ http { 'ca=$connections_active cr=$connections_reading cw=$connections_writing ' 'ct=$connections_waiting cq=$connection_requests bs=$bytes_sent ' 'rl=$request_length rt=$request_time uct=$upstream_connect_time ' - 'uht=$upstream_header_time urt=$upstream_response_time'; + 'uht=$upstream_header_time urt=$upstream_response_time ' + 'ua="$upstream_addr" us="$upstream_status"'; access_log {{log_dir}}/nginx/{{nginx.access_log_filename}} combined_extended {{nginx.access_log_options}}; client_body_temp_path {{tmp_dir}}/nginx-client_body_temp; @@ -211,14 +212,14 @@ http { dyups_trylock on; dyups_read_msg_timeout 300ms; server { - listen {{nginx.dyups.host}}:{{nginx.dyups.port}}; + listen {{nginx.dyups.host}}:{{nginx.dyups.port}} so_keepalive=on; location / { dyups_interface; } } server { - listen {{api_server.host}}:{{api_server.port}}; + listen {{api_server.host}}:{{api_server.port}} so_keepalive=on; set $x_api_umbrella_request_id $http_x_api_umbrella_request_id; location /api-umbrella/v1/health { @@ -232,12 +233,12 @@ http { {{#hosts}} server { - listen {{http_port}}{{#default}} default_server{{/default}}; - listen [::]:{{http_port}}{{#default}} default_server{{/default}}; + listen {{http_port}}{{#default}} default_server so_keepalive=on{{/default}}; + listen [::]:{{http_port}}{{#default}} default_server 
so_keepalive=on{{/default}}; server_name {{_nginx_server_name}}; - listen {{https_port}} ssl{{#default}} default_server{{/default}}; - listen [::]:{{https_port}} ssl{{#default}} default_server{{/default}}; + listen {{https_port}} ssl{{#default}} default_server so_keepalive=on{{/default}}; + listen [::]:{{https_port}} ssl{{#default}} default_server so_keepalive=on{{/default}}; {{#ssl_cert}} ssl_certificate {{ssl_cert}}; ssl_certificate_key {{ssl_cert_key}}; @@ -277,7 +278,7 @@ http { {{/hosts}} server { - listen {{static_site.host}}:{{static_site.port}}; + listen {{static_site.host}}:{{static_site.port}} so_keepalive=on; server_name _; root {{static_site.build_dir}}; @@ -296,7 +297,7 @@ http { } server { - listen {{router.api_backends.host}}:{{router.api_backends.port}}; + listen {{router.api_backends.host}}:{{router.api_backends.port}} so_keepalive=on; server_name _; set $x_api_umbrella_request_id $http_x_api_umbrella_request_id; @@ -339,7 +340,7 @@ http { } server { - listen {{web.host}}:{{web.port}}; + listen {{web.host}}:{{web.port}} so_keepalive=on; server_name _; set $x_api_umbrella_request_id $http_x_api_umbrella_request_id;