Commit 78ad00c9 authored by Tao Zhou's avatar Tao Zhou Committed by Alex Deucher
Browse files

drm/amdgpu: Hook EEPROM table to RAS



support eeprom records load and save for ras,
move EEPROM records storing to bad page reserving

v2: remove redundant check for con->eh_data

Signed-off-by: default avatarTao Zhou <tao.zhou1@amd.com>
Signed-off-by: default avatarAndrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: default avatarGuchun Chen <guchun.chen@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 9dc23a63
Loading
Loading
Loading
Loading
+81 −28
Original line number Diff line number Diff line
@@ -1366,6 +1366,69 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
	return ret;
}

/*
 * write error record array to eeprom, the function should be
 * protected by recovery_lock
 */
static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_ras_eeprom_control *control =
					&adev->psp.ras.ras->eeprom_control;
	int save_count;

	if (!con || !con->eh_data)
		return 0;

	data = con->eh_data;
	save_count = data->count - control->num_recs;
	/* only new entries are saved */
	if (save_count > 0)
		if (amdgpu_ras_eeprom_process_recods(&con->eeprom_control,
							&data->bps[control->num_recs],
							true,
							save_count)) {
			DRM_ERROR("Failed to save EEPROM table data!");
			return -EIO;
		}

	return 0;
}

/*
 * read error record array in eeprom and reserve enough space for
 * storing new bad pages
 */
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras_eeprom_control *control =
					&adev->psp.ras.ras->eeprom_control;
	struct eeprom_table_record *bps = NULL;
	int ret = 0;

	/* no bad page record, skip eeprom access */
	if (!control->num_recs)
		return ret;

	bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
	if (!bps)
		return -ENOMEM;

	if (amdgpu_ras_eeprom_process_recods(control, bps, false,
		control->num_recs)) {
		DRM_ERROR("Failed to load EEPROM table records!");
		ret = -EIO;
		goto out;
	}

	ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);

out:
	kfree(bps);
	return ret;
}

/* called in gpu recovery/init */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{
@@ -1373,7 +1436,7 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
	struct ras_err_handler_data *data;
	uint64_t bp;
	struct amdgpu_bo *bo;
	int i;
	int i, ret = 0;

	if (!con || !con->eh_data)
		return 0;
@@ -1393,9 +1456,12 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
		data->bps_bo[i] = bo;
		data->last_reserved = i + 1;
	}

	/* continue to save bad pages to eeprom even reesrve_vram fails */
	ret = amdgpu_ras_save_bad_pages(adev);
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
	return ret;
}

/* called when driver unload */
@@ -1427,33 +1493,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
	return 0;
}

static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * write the array to eeprom when SMU disabled.
	 */
	return 0;
}

/*
 * read error record array in eeprom and reserve enough space for
 * storing new bad pages
 */
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	struct eeprom_table_record *bps = NULL;
	int ret;

	ret = amdgpu_ras_add_bad_pages(adev, bps,
				adev->umc.max_ras_err_cnt_per_query);

	return ret;
}

static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data = &con->eh_data;
	int ret;

	*data = kmalloc(sizeof(**data),
			GFP_KERNEL|__GFP_ZERO);
@@ -1465,8 +1509,18 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
	atomic_set(&con->in_recovery, 0);
	con->adev = adev;

	amdgpu_ras_load_bad_pages(adev);
	amdgpu_ras_reserve_bad_pages(adev);
	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
	if (ret)
		return ret;

	if (adev->psp.ras.ras->eeprom_control.num_recs) {
		ret = amdgpu_ras_load_bad_pages(adev);
		if (ret)
			return ret;
		ret = amdgpu_ras_reserve_bad_pages(adev);
		if (ret)
			return ret;
	}

	return 0;
}
@@ -1477,7 +1531,6 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
	struct ras_err_handler_data *data = con->eh_data;

	cancel_work_sync(&con->recovery_work);
	amdgpu_ras_save_bad_pages(adev);
	amdgpu_ras_release_bad_pages(adev);

	mutex_lock(&con->recovery_lock);