Commit 761d86d3 authored by Dennis Li, committed by Alex Deucher
Browse files

drm/amdgpu: harvest edc status when connected to host via xGMI



When connected to a host via xGMI, system fatal errors may trigger a
warm reset, so the driver has no chance to query edc status before the
reset. Therefore, in this case, the driver should harvest the previous
error logging registers during boot instead of only resetting them.

v2:
1. An IP's ras_manager object is created when its ras feature is enabled,
so query edc status after amdgpu_ras_late_init is called

2. enable the watchdog timer after finishing gfx edc init

Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Reviewed-by: Hawking Zhang <hawking.zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 63dbb0db
Loading
Loading
Loading
Loading
+8 −1
Original line number Diff line number Diff line
@@ -601,6 +601,7 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev)
	struct ras_ih_if ih_info = {
		.cb = amdgpu_gfx_process_ras_data_cb,
	};
	struct ras_query_if info = { 0 };

	if (!adev->gfx.ras_if) {
		adev->gfx.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
@@ -612,13 +613,19 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev)
		strcpy(adev->gfx.ras_if->name, "gfx");
	}
	fs_info.head = ih_info.head = *adev->gfx.ras_if;

	r = amdgpu_ras_late_init(adev, adev->gfx.ras_if,
				 &fs_info, &ih_info);
	if (r)
		goto free;

	if (amdgpu_ras_is_supported(adev, adev->gfx.ras_if->block)) {
		if (adev->gmc.xgmi.connected_to_cpu) {
			info.head = *adev->gfx.ras_if;
			amdgpu_ras_query_error_status(adev, &info);
		} else {
			amdgpu_ras_reset_error_status(adev, AMDGPU_RAS_BLOCK__GFX);
		}

		r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
		if (r)
			goto late_fini;
+1 −1
Original line number Diff line number Diff line
@@ -225,9 +225,9 @@ struct amdgpu_gfx_funcs {
	void (*reset_ras_error_count) (struct amdgpu_device *adev);
	void (*init_spm_golden)(struct amdgpu_device *adev);
	void (*query_ras_error_status) (struct amdgpu_device *adev);
	void (*reset_ras_error_status) (struct amdgpu_device *adev);
	void (*update_perfmon_mgcg)(struct amdgpu_device *adev, bool enable);
	void (*enable_watchdog_timer)(struct amdgpu_device *adev);
	void (*query_sq_timeout_status)(struct amdgpu_device *adev);
};

struct sq_work {
+41 −9
Original line number Diff line number Diff line
@@ -109,7 +109,7 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
	ssize_t s;
	char val[128];

	if (amdgpu_ras_error_query(obj->adev, &info))
	if (amdgpu_ras_query_error_status(obj->adev, &info))
		return -EINVAL;

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
@@ -434,7 +434,7 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		return snprintf(buf, PAGE_SIZE,
				"Query currently inaccessible\n");

	if (amdgpu_ras_error_query(obj->adev, &info))
	if (amdgpu_ras_query_error_status(obj->adev, &info))
		return -EINVAL;

	return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
@@ -757,7 +757,7 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
/* feature ctl end */

/* query/inject/cure begin */
int amdgpu_ras_error_query(struct amdgpu_device *adev,
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
	struct ras_query_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
@@ -787,10 +787,16 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.funcs->query_ras_error_count)
			adev->gfx.funcs->query_ras_error_count(adev, &err_data);

		if (adev->gfx.funcs->query_ras_error_status)
			adev->gfx.funcs->query_ras_error_status(adev);
		break;
	case AMDGPU_RAS_BLOCK__MMHUB:
		if (adev->mmhub.funcs->query_ras_error_count)
			adev->mmhub.funcs->query_ras_error_count(adev, &err_data);

		if (adev->mmhub.funcs->query_ras_error_status)
			adev->mmhub.funcs->query_ras_error_status(adev);
		break;
	case AMDGPU_RAS_BLOCK__PCIE_BIF:
		if (adev->nbio.funcs->query_ras_error_count)
@@ -826,6 +832,35 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
	return 0;
}

/**
 * amdgpu_ras_reset_error_status - invoke a block's RAS reset callbacks
 * @adev: amdgpu device
 * @block: RAS block whose error state should be reset
 *
 * Only the GFX, MMHUB and SDMA blocks are dispatched here; for any other
 * block the call succeeds without doing anything. Every callback is
 * optional and is skipped when the IP leaves it unset.
 *
 * Return: -EINVAL if amdgpu_ras_is_supported() rejects @block, 0 otherwise.
 */
int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
		enum amdgpu_ras_block block)
{
	if (!amdgpu_ras_is_supported(adev, block))
		return -EINVAL;

	if (block == AMDGPU_RAS_BLOCK__GFX) {
		/* Of the blocks handled here, only GFX resets count and status. */
		if (adev->gfx.funcs->reset_ras_error_count)
			adev->gfx.funcs->reset_ras_error_count(adev);

		if (adev->gfx.funcs->reset_ras_error_status)
			adev->gfx.funcs->reset_ras_error_status(adev);
	} else if (block == AMDGPU_RAS_BLOCK__MMHUB) {
		if (adev->mmhub.funcs->reset_ras_error_count)
			adev->mmhub.funcs->reset_ras_error_count(adev);
	} else if (block == AMDGPU_RAS_BLOCK__SDMA) {
		if (adev->sdma.funcs->reset_ras_error_count)
			adev->sdma.funcs->reset_ras_error_count(adev);
	}

	return 0;
}

/* Trigger XGMI/WAFL error */
static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
				 struct ta_ras_trigger_error_input *block_info)
@@ -921,7 +956,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
			.head = obj->head,
		};

		if (amdgpu_ras_error_query(adev, &info))
		if (amdgpu_ras_query_error_status(adev, &info))
			return 0;

		data.ce_count += info.ce_count;
@@ -1451,7 +1486,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
		if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
			continue;

		amdgpu_ras_error_query(adev, &info);
		amdgpu_ras_query_error_status(adev, &info);
	}
}

@@ -1467,9 +1502,6 @@ static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.funcs->query_ras_error_status)
			adev->gfx.funcs->query_ras_error_status(adev);

		if (adev->gfx.funcs->query_sq_timeout_status)
			adev->gfx.funcs->query_sq_timeout_status(adev);
		break;
	case AMDGPU_RAS_BLOCK__MMHUB:
		if (adev->mmhub.funcs->query_ras_error_status)
+4 −1
Original line number Diff line number Diff line
@@ -588,9 +588,12 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,

void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev);

int amdgpu_ras_error_query(struct amdgpu_device *adev,
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
		struct ras_query_if *info);

int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
		enum amdgpu_ras_block block);

int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info);

+4 −8
Original line number Diff line number Diff line
@@ -2124,8 +2124,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_2_gfx_funcs = {
	.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
	.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
	.query_ras_error_status = &gfx_v9_4_2_query_ras_error_status,
	.reset_ras_error_status = &gfx_v9_4_2_reset_ras_error_status,
	.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
	.query_sq_timeout_status = &gfx_v9_4_2_query_sq_timeout_status,
};

static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
@@ -3970,9 +3970,6 @@ static int gfx_v9_0_hw_init(void *handle)
	if (adev->asic_type == CHIP_ALDEBARAN)
		gfx_v9_4_2_set_power_brake_sequence(adev);

	if (adev->gfx.funcs->enable_watchdog_timer)
		adev->gfx.funcs->enable_watchdog_timer(adev);

	return r;
}

@@ -4736,14 +4733,13 @@ static int gfx_v9_0_ecc_late_init(void *handle)
	if (r)
		return r;

	if (adev->gfx.funcs &&
	    adev->gfx.funcs->reset_ras_error_count)
		adev->gfx.funcs->reset_ras_error_count(adev);

	r = amdgpu_gfx_ras_late_init(adev);
	if (r)
		return r;

	if (adev->gfx.funcs->enable_watchdog_timer)
		adev->gfx.funcs->enable_watchdog_timer(adev);

	return 0;
}

Loading