Commit 640ae42e authored by John Clements's avatar John Clements Committed by Alex Deucher
Browse files

drm/amdgpu: Updated RAS infrastructure



Update RAS infrastructure to support RAS query for MCA subblocks

Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: default avatarJohn Clements <john.clements@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 6effad8a
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -31,7 +31,7 @@ void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
					      uint64_t mc_status_addr,
					      unsigned long *error_count)
{
	uint64_t mc_status = RREG64_PCIE(mc_status_addr * 4);
	uint64_t mc_status = RREG64_PCIE(mc_status_addr);

	if (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
@@ -42,7 +42,7 @@ void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
						uint64_t mc_status_addr,
						unsigned long *error_count)
{
	uint64_t mc_status = RREG64_PCIE(mc_status_addr * 4);
	uint64_t mc_status = RREG64_PCIE(mc_status_addr);

	if ((REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
@@ -56,7 +56,7 @@ void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
void amdgpu_mca_reset_error_count(struct amdgpu_device *adev,
				  uint64_t mc_status_addr)
{
	WREG64_PCIE(mc_status_addr * 4, 0x0ULL);
	WREG64_PCIE(mc_status_addr, 0x0ULL);
}

void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
@@ -87,8 +87,8 @@ int amdgpu_mca_ras_late_init(struct amdgpu_device *adev,
		if (!mca_dev->ras_if)
			return -ENOMEM;
		mca_dev->ras_if->block = mca_dev->ras_funcs->ras_block;
		mca_dev->ras_if->sub_block_index = mca_dev->ras_funcs->ras_sub_block;
		mca_dev->ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		mca_dev->ras_if->sub_block_index = 0;
	}
	ih_info.head = fs_info.head = *mca_dev->ras_if;
	r = amdgpu_ras_late_init(adev, mca_dev->ras_if,
+1 −0
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@ struct amdgpu_mca_ras_funcs {
	void (*query_ras_error_address)(struct amdgpu_device *adev,
					void *ras_error_status);
	uint32_t ras_block;
	uint32_t ras_sub_block;
	const char* sysfs_name;
};

+109 −37
Original line number Diff line number Diff line
@@ -61,9 +61,30 @@ const char *ras_block_string[] = {
	"mp0",
	"mp1",
	"fuse",
	"mpio",
	"mca",
};

const char *ras_mca_block_string[] = {
	"mca_mp0",
	"mca_mp1",
	"mca_mpio",
	"mca_iohc",
};

const char *get_ras_block_str(struct ras_common_if *ras_block)
{
	if (!ras_block)
		return "NULL";

	if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT)
		return "OUT OF RANGE";

	if (ras_block->block == AMDGPU_RAS_BLOCK__MCA)
		return ras_mca_block_string[ras_block->sub_block_index];

	return ras_block_string[ras_block->block];
}

#define ras_err_str(i) (ras_error_string[ffs(i)])

#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
@@ -188,7 +209,7 @@ static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)

	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
		*block_id = i;
		if (strcmp(name, ras_block_str(i)) == 0)
		if (strcmp(name, ras_block_string[i]) == 0)
			return 0;
	}
	return -EINVAL;
@@ -510,7 +531,6 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
	if (amdgpu_ras_query_error_status(obj->adev, &info))
		return -EINVAL;


	if (obj->adev->asic_type == CHIP_ALDEBARAN) {
		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
			DRM_WARN("Failed to reset error counter and error status");
@@ -530,7 +550,7 @@ static inline void put_obj(struct ras_manager *obj)
	if (obj && (--obj->use == 0))
		list_del(&obj->node);
	if (obj && (obj->use < 0))
		DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", ras_block_str(obj->head.block));
		DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head));
}

/* make one obj and return it. */
@@ -546,7 +566,14 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
		return NULL;

	if (head->block == AMDGPU_RAS_BLOCK__MCA) {
		if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
			return NULL;

		obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
	} else
		obj = &con->objs[head->block];

	/* already exist. return obj? */
	if (alive_obj(obj))
		return NULL;
@@ -574,21 +601,23 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
			return NULL;

		if (head->block == AMDGPU_RAS_BLOCK__MCA) {
			if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
				return NULL;

			obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
		} else
			obj = &con->objs[head->block];

		if (alive_obj(obj)) {
			WARN_ON(head->block != obj->head.block);
		if (alive_obj(obj))
			return obj;
		}
	} else {
		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
			obj = &con->objs[i];
			if (alive_obj(obj)) {
				WARN_ON(i != obj->head.block);
			if (alive_obj(obj))
				return obj;
		}
	}
	}

	return NULL;
}
@@ -627,8 +656,6 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
	 */
	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (enable) {
		if (!obj) {
@@ -679,18 +706,13 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,

	/* Do not enable if it is not allowed. */
	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
	/* Are we alerady in that state we are going to set? */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) {
		ret = 0;
		goto out;
	}

	if (!amdgpu_ras_intr_triggered()) {
		ret = psp_ras_enable_features(&adev->psp, info, enable);
		if (ret) {
			dev_err(adev->dev, "ras %s %s failed %d\n",
				enable ? "enable":"disable",
				ras_block_str(head->block),
				get_ras_block_str(head),
				ret);
			goto out;
		}
@@ -732,7 +754,7 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
				if (!ret)
					dev_info(adev->dev,
						"RAS INFO: %s setup object\n",
						ras_block_str(head->block));
						get_ras_block_str(head));
			}
		} else {
			/* setup the object then issue a ras TA disable cmd.*/
@@ -782,17 +804,39 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	const enum amdgpu_ras_error_type default_ras_type =
		AMDGPU_RAS_ERROR__NONE;
	const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE;

	for (i = 0; i < ras_block_count; i++) {
	for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = default_ras_type,
			.sub_block_index = 0,
		};

		if (i == AMDGPU_RAS_BLOCK__MCA)
			continue;

		if (bypass) {
			/*
			 * bypass psp. vbios enable ras for us.
			 * so just create the obj
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
		struct ras_common_if head = {
			.block = AMDGPU_RAS_BLOCK__MCA,
			.type = default_ras_type,
			.sub_block_index = i,
		};

		if (bypass) {
			/*
			 * bypass psp. vbios enable ras for us.
@@ -810,6 +854,32 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
}
/* feature ctl end */


void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
				       struct ras_common_if *ras_block,
				       struct ras_err_data  *err_data)
{
	switch (ras_block->sub_block_index) {
	case AMDGPU_RAS_MCA_BLOCK__MP0:
		if (adev->mca.mp0.ras_funcs &&
		    adev->mca.mp0.ras_funcs->query_ras_error_count)
			adev->mca.mp0.ras_funcs->query_ras_error_count(adev, &err_data);
		break;
	case AMDGPU_RAS_MCA_BLOCK__MP1:
		if (adev->mca.mp1.ras_funcs &&
		    adev->mca.mp1.ras_funcs->query_ras_error_count)
			adev->mca.mp1.ras_funcs->query_ras_error_count(adev, &err_data);
		break;
	case AMDGPU_RAS_MCA_BLOCK__MPIO:
		if (adev->mca.mpio.ras_funcs &&
		    adev->mca.mpio.ras_funcs->query_ras_error_count)
			adev->mca.mpio.ras_funcs->query_ras_error_count(adev, &err_data);
		break;
	default:
		break;
	}
}

/* query/inject/cure begin */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
				  struct ras_query_if *info)
@@ -873,6 +943,9 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
		    adev->hdp.ras_funcs->query_ras_error_count)
			adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__MCA:
		amdgpu_ras_mca_query_error_status(adev, &info->head, &err_data);
		break;
	default:
		break;
	}
@@ -894,13 +967,13 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
					adev->smuio.funcs->get_socket_id(adev),
					adev->smuio.funcs->get_die_id(adev),
					obj->err_data.ce_count,
					ras_block_str(info->head.block));
					get_ras_block_str(&info->head));
		} else {
			dev_info(adev->dev, "%ld correctable hardware errors "
					"detected in %s block, no user "
					"action is needed.\n",
					obj->err_data.ce_count,
					ras_block_str(info->head.block));
					get_ras_block_str(&info->head));
		}
	}
	if (err_data.ue_count) {
@@ -913,12 +986,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
					adev->smuio.funcs->get_socket_id(adev),
					adev->smuio.funcs->get_die_id(adev),
					obj->err_data.ue_count,
					ras_block_str(info->head.block));
					get_ras_block_str(&info->head));
		} else {
			dev_info(adev->dev, "%ld uncorrectable hardware errors "
					"detected in %s block\n",
					obj->err_data.ue_count,
					ras_block_str(info->head.block));
					get_ras_block_str(&info->head));
		}
	}

@@ -1028,9 +1101,7 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
	case AMDGPU_RAS_BLOCK__SDMA:
	case AMDGPU_RAS_BLOCK__MMHUB:
	case AMDGPU_RAS_BLOCK__PCIE_BIF:
	case AMDGPU_RAS_BLOCK__MP0:
	case AMDGPU_RAS_BLOCK__MP1:
	case AMDGPU_RAS_BLOCK__MPIO:
	case AMDGPU_RAS_BLOCK__MCA:
		ret = psp_ras_trigger_error(&adev->psp, &block_info);
		break;
	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
@@ -1038,13 +1109,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		break;
	default:
		dev_info(adev->dev, "%s error injection is not supported yet\n",
			 ras_block_str(info->head.block));
			 get_ras_block_str(&info->head));
		ret = -EINVAL;
	}

	if (ret)
		dev_err(adev->dev, "ras inject %s failed %d\n",
			ras_block_str(info->head.block), ret);
			get_ras_block_str(&info->head), ret);

	return ret;
}
@@ -1387,7 +1458,7 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
		if (amdgpu_ras_is_supported(adev, obj->head.block) &&
			(obj->attr_inuse == 1)) {
			sprintf(fs_info.debugfs_name, "%s_err_inject",
					ras_block_str(obj->head.block));
					get_ras_block_str(&obj->head));
			fs_info.head = obj->head;
			amdgpu_ras_debugfs_create(adev, &fs_info, dir);
		}
@@ -2185,7 +2256,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
		return 0;

	con = kmalloc(sizeof(struct amdgpu_ras) +
			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT +
			sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT,
			GFP_KERNEL|__GFP_ZERO);
	if (!con)
		return -ENOMEM;
+14 −5
Original line number Diff line number Diff line
@@ -49,15 +49,22 @@ enum amdgpu_ras_block {
	AMDGPU_RAS_BLOCK__MP0,
	AMDGPU_RAS_BLOCK__MP1,
	AMDGPU_RAS_BLOCK__FUSE,
	AMDGPU_RAS_BLOCK__MPIO,
	AMDGPU_RAS_BLOCK__MCA,

	AMDGPU_RAS_BLOCK__LAST
};

extern const char *ras_block_string[];
enum amdgpu_ras_mca_block {
	AMDGPU_RAS_MCA_BLOCK__MP0 = 0,
	AMDGPU_RAS_MCA_BLOCK__MP1,
	AMDGPU_RAS_MCA_BLOCK__MPIO,
	AMDGPU_RAS_MCA_BLOCK__IOHC,

	AMDGPU_RAS_MCA_BLOCK__LAST
};

#define ras_block_str(i) (ras_block_string[i])
#define AMDGPU_RAS_BLOCK_COUNT	AMDGPU_RAS_BLOCK__LAST
#define AMDGPU_RAS_MCA_BLOCK_COUNT	AMDGPU_RAS_MCA_BLOCK__LAST
#define AMDGPU_RAS_BLOCK_MASK	((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1)

enum amdgpu_ras_gfx_subblock {
@@ -544,8 +551,8 @@ amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
		return TA_RAS_BLOCK__MP1;
	case AMDGPU_RAS_BLOCK__FUSE:
		return TA_RAS_BLOCK__FUSE;
	case AMDGPU_RAS_BLOCK__MPIO:
		return TA_RAS_BLOCK__MPIO;
	case AMDGPU_RAS_BLOCK__MCA:
		return TA_RAS_BLOCK__MCA;
	default:
		WARN_ONCE(1, "RAS ERROR: unexpected block id %d\n", block);
		return TA_RAS_BLOCK__UMC;
@@ -640,4 +647,6 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev);

int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev);

const char *get_ras_block_str(struct ras_common_if *ras_block);

#endif
+6 −3
Original line number Diff line number Diff line
@@ -52,7 +52,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mp0_ras_funcs = {
	.ras_fini = mca_v3_0_mp0_ras_fini,
	.query_ras_error_count = mca_v3_0_mp0_query_ras_error_count,
	.query_ras_error_address = NULL,
	.ras_block = AMDGPU_RAS_BLOCK__MP0,
	.ras_block = AMDGPU_RAS_BLOCK__MCA,
	.ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MP0,
	.sysfs_name = "mp0_err_count",
};

@@ -79,7 +80,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mp1_ras_funcs = {
	.ras_fini = mca_v3_0_mp1_ras_fini,
	.query_ras_error_count = mca_v3_0_mp1_query_ras_error_count,
	.query_ras_error_address = NULL,
	.ras_block = AMDGPU_RAS_BLOCK__MP1,
	.ras_block = AMDGPU_RAS_BLOCK__MCA,
	.ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MP1,
	.sysfs_name = "mp1_err_count",
};

@@ -106,7 +108,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mpio_ras_funcs = {
	.ras_fini = mca_v3_0_mpio_ras_fini,
	.query_ras_error_count = mca_v3_0_mpio_query_ras_error_count,
	.query_ras_error_address = NULL,
	.ras_block = AMDGPU_RAS_BLOCK__MPIO,
	.ras_block = AMDGPU_RAS_BLOCK__MCA,
	.ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MPIO,
	.sysfs_name = "mpio_err_count",
};

Loading