Skip to content

Commit

Permalink
Merge pull request #1862 from ceph/retry-sentinel-connreset
Browse files: browse the repository at this point in the history
  • Loading branch information
zmc authored Jun 30, 2023
2 parents e25bf39 + 9aa39da commit 8dce2fc
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 16 deletions.
1 change: 0 additions & 1 deletion teuthology/contextutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,6 @@ def __call__(self):
if self.timeout > 0:
self.sleep_current = min(self.timeout - self.total_seconds, self.sleep_current)
self.total_seconds += self.sleep_current
print(self.total_seconds, self.sleep_current)
self.sleeper(self.sleep_current)
return True

Expand Down
25 changes: 11 additions & 14 deletions teuthology/orchestra/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import teuthology.lock.query
import teuthology.lock.util
from teuthology.contextutil import safe_while
from teuthology.orchestra import run
from teuthology.orchestra import connection
from teuthology.orchestra import console
Expand All @@ -13,7 +14,6 @@
from teuthology.exceptions import CommandFailedError
from teuthology.misc import host_shortname
import errno
import time
import re
import logging
from io import BytesIO
Expand Down Expand Up @@ -386,7 +386,7 @@ def connect(self, timeout=None, create_key=None, context='connect'):
self.ssh = connection.connect(**args)
return self.ssh

def reconnect(self, timeout=None, socket_timeout=None, sleep_time=30):
def reconnect(self, timeout=30, socket_timeout=None):
"""
Attempts to re-establish connection. Returns True for success; False
for failure.
Expand All @@ -395,18 +395,15 @@ def reconnect(self, timeout=None, socket_timeout=None, sleep_time=30):
self.ssh.close()
if not timeout:
return self._reconnect(timeout=socket_timeout)
start_time = time.time()
elapsed_time = lambda: time.time() - start_time
while elapsed_time() < timeout:
success = self._reconnect(timeout=socket_timeout)
if success:
log.info(f"Successfully reconnected to host '{self.name}'")
break
# Don't let time_remaining be < 0
time_remaining = max(0, timeout - elapsed_time())
sleep_val = min(time_remaining, sleep_time)
time.sleep(sleep_val)
return success
action = "reconnect to {self.shortname}"
with safe_while(action=action, timeout=timeout, increment=3, _raise=False) as proceed:
success = False
while proceed():
success = self._reconnect(timeout=socket_timeout)
if success:
log.info(f"Successfully reconnected to host '{self.name}'")
return success
return success

def _reconnect(self, timeout=None):
log.info(f"Trying to reconnect to host '{self.name}'")
Expand Down
9 changes: 8 additions & 1 deletion teuthology/provision/fog.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,14 @@ def _wait_for_ready(self):
sentinel_file = config.fog.get('sentinel_file', None)
if sentinel_file:
cmd = "while [ ! -e '%s' ]; do sleep 5; done" % sentinel_file
self.remote.run(args=cmd, timeout=600)
action = f"wait for sentinel on {self.shortname}"
with safe_while(action=action, timeout=1800, increment=3) as proceed:
while proceed():
try:
self.remote.run(args=cmd, timeout=600)
break
except ConnectionResetError as e:
log.error(f"{e} on {self.shortname}")
self.log.info("Node is ready")

def _fix_hostname(self):
Expand Down

0 comments on commit 8dce2fc

Please sign in to comment.