Commit fe06992b authored by Leon Romanovsky, committed by Saeed Mahameed
Browse files

net/mlx5: Check returned value from health recover sequence



MLX5_INTERFACE_STATE_UP is far from being a reliable check for a
successful recovery, because it can change at any time and the health
logic doesn't hold any locks to protect against that.

Locks are not needed here because health recovery is good to have but
is not required to succeed, so rely on the value returned from
mlx5_recover_device() as the marker for success/failure.

Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
parent 7ad67a20
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -335,12 +335,12 @@ static int mlx5_health_try_recover(struct mlx5_core_dev *dev)
		return -EIO;
	}
	mlx5_core_err(dev, "starting health recovery flow\n");
	mlx5_recover_device(dev);
	if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state) ||
	    mlx5_health_check_fatal_sensors(dev)) {
	if (mlx5_recover_device(dev) || mlx5_health_check_fatal_sensors(dev)) {
		mlx5_core_err(dev, "health recovery failed\n");
		return -EIO;
	}

	mlx5_core_info(dev, "health revovery succeded\n");
	return 0;
}

+5 −2
Original line number Diff line number Diff line
@@ -1721,11 +1721,14 @@ void mlx5_disable_device(struct mlx5_core_dev *dev)
	mlx5_unload_one(dev);
}

void mlx5_recover_device(struct mlx5_core_dev *dev)
int mlx5_recover_device(struct mlx5_core_dev *dev)
{
	int ret = -EIO;

	mlx5_pci_disable_device(dev);
	if (mlx5_pci_slot_reset(dev->pdev) == PCI_ERS_RESULT_RECOVERED)
		mlx5_pci_resume(dev->pdev);
		ret = mlx5_load_one(dev);
	return ret;
}

static struct pci_driver mlx5_core_driver = {
+1 −1
Original line number Diff line number Diff line
@@ -134,7 +134,7 @@ void mlx5_error_sw_reset(struct mlx5_core_dev *dev);
u32 mlx5_health_check_fatal_sensors(struct mlx5_core_dev *dev);
int mlx5_health_wait_pci_up(struct mlx5_core_dev *dev);
void mlx5_disable_device(struct mlx5_core_dev *dev);
void mlx5_recover_device(struct mlx5_core_dev *dev);
int mlx5_recover_device(struct mlx5_core_dev *dev);
int mlx5_sriov_init(struct mlx5_core_dev *dev);
void mlx5_sriov_cleanup(struct mlx5_core_dev *dev);
int mlx5_sriov_attach(struct mlx5_core_dev *dev);