Skip to content

Commit

Permalink
Azure: During primary NIC detection, check interface status continuously before rebinding again (#990)
Browse files Browse the repository at this point in the history

Add a 10-second polling loop in wait_for_link_up after performing
an unbind and re-bind of the primary NIC in the hv_netvsc driver.

Also reduce cloud-init logging levels to debug for these operations.
  • Loading branch information
aswinrajamannar authored Aug 20, 2021
1 parent 7d3f5d7 commit 3ec8ddd
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 23 deletions.
38 changes: 20 additions & 18 deletions cloudinit/sources/DataSourceAzure.py
Original file line number Diff line number Diff line change
Expand Up @@ -892,12 +892,12 @@ def wait_for_link_up(self, ifname):
logger_func=LOG.info)
return

LOG.info("Attempting to bring %s up", ifname)
LOG.debug("Attempting to bring %s up", ifname)

attempts = 0
LOG.info("Unbinding and binding the interface %s", ifname)
while True:

LOG.info("Unbinding and binding the interface %s", ifname)
devicename = net.read_sys_net(ifname,
'device/device_id').strip('{}')
util.write_file('/sys/bus/vmbus/drivers/hv_netvsc/unbind',
Expand All @@ -912,26 +912,28 @@ def wait_for_link_up(self, ifname):
report_diagnostic_event(msg, logger_func=LOG.info)
return

sleep_duration = 1
msg = ("Link is not up after %d attempts with %d seconds sleep "
"between attempts." % (attempts, sleep_duration))

if attempts % 10 == 0:
msg = ("Link is not up after %d attempts to rebind" % attempts)
report_diagnostic_event(msg, logger_func=LOG.info)
else:
LOG.info(msg)

sleep(sleep_duration)

# Since we just did a unbind and bind, check again after sleep
# but before doing unbind and bind again to avoid races where the
# link might take a slight delay after bind to be up.
if self.distro.networking.is_up(ifname):
msg = ("Link is up after checking after sleeping for %d secs"
" after %d attempts" %
(sleep_duration, attempts))
report_diagnostic_event(msg, logger_func=LOG.info)
return
# It could take some time after rebind for the interface to be up.
# So poll for the status for some time before attempting to rebind
# again.
sleep_duration = 0.5
max_status_polls = 20
LOG.debug("Polling %d seconds for primary NIC link up after "
"rebind.", sleep_duration * max_status_polls)

for i in range(0, max_status_polls):
if self.distro.networking.is_up(ifname):
msg = ("After %d attempts to rebind, link is up after "
"polling the link status %d times" % (attempts, i))
report_diagnostic_event(msg, logger_func=LOG.info)
LOG.debug(msg)
return
else:
sleep(sleep_duration)

@azure_ds_telemetry_reporter
def _create_report_ready_marker(self):
Expand Down
20 changes: 15 additions & 5 deletions tests/unittests/test_datasource/test_azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -2912,19 +2912,29 @@ def test_wait_for_link_up_returns_if_already_up(
@mock.patch('cloudinit.net.read_sys_net')
@mock.patch('cloudinit.distros.networking.LinuxNetworking.try_set_link_up')
def test_wait_for_link_up_checks_link_after_sleep(
self, m_is_link_up, m_read_sys_net, m_writefile, m_is_up):
self, m_try_set_link_up, m_read_sys_net, m_writefile, m_is_up):
"""Waiting for link to be up should return immediately if the link is
already up."""

distro_cls = distros.fetch('ubuntu')
distro = distro_cls('ubuntu', {}, self.paths)
dsa = dsaz.DataSourceAzure({}, distro=distro, paths=self.paths)
m_is_link_up.return_value = False
m_is_up.return_value = True
m_try_set_link_up.return_value = False

callcount = 0

def is_up_mock(key):
nonlocal callcount
if callcount == 0:
callcount += 1
return False
return True

m_is_up.side_effect = is_up_mock

dsa.wait_for_link_up("eth0")
self.assertEqual(2, m_is_link_up.call_count)
self.assertEqual(1, m_is_up.call_count)
self.assertEqual(2, m_try_set_link_up.call_count)
self.assertEqual(2, m_is_up.call_count)

@mock.patch(MOCKPATH + 'util.write_file')
@mock.patch('cloudinit.net.read_sys_net')
Expand Down

0 comments on commit 3ec8ddd

Please sign in to comment.