Commit 87d2b92f authored by Tao Zhou, committed by Alex Deucher
Browse files

drm/amdgpu: save umc error records



save umc error records to ras bad page array

v2: add bad pages before gpu reset
v3: add NULL check for adev->umc.funcs

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 78ad00c9
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -347,7 +347,7 @@ struct ras_err_data {
	unsigned long ue_count;
	unsigned long ce_count;
	unsigned long err_addr_cnt;
	uint64_t *err_addr;
	struct eeprom_table_record *err_addr;
};

struct ras_err_handler_data {
+31 −9
Original line number Diff line number Diff line
@@ -247,21 +247,43 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
		struct ras_err_data *err_data,
		struct amdgpu_iv_entry *entry)
{
	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
		return AMDGPU_RAS_SUCCESS;

	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
		if (adev->umc.funcs->query_ras_error_count)
	if (adev->umc.funcs &&
	    adev->umc.funcs->query_ras_error_count)
	    adev->umc.funcs->query_ras_error_count(adev, err_data);

	if (adev->umc.funcs &&
	    adev->umc.funcs->query_ras_error_address &&
	    adev->umc.max_ras_err_cnt_per_query) {
		err_data->err_addr =
			kcalloc(adev->umc.max_ras_err_cnt_per_query,
				sizeof(struct eeprom_table_record), GFP_KERNEL);
		/* still call query_ras_error_address to clear error status
		 * even NOMEM error is encountered
		 */
		if(!err_data->err_addr)
			DRM_WARN("Failed to alloc memory for umc error address record!\n");

		/* umc query_ras_error_address is also responsible for clearing
		 * error status
		 */
		if (adev->umc.funcs->query_ras_error_address)
		adev->umc.funcs->query_ras_error_address(adev, err_data);
	}

	/* only uncorrectable error needs gpu reset */
		if (err_data->ue_count)
	if (err_data->ue_count) {
		if (err_data->err_addr_cnt &&
		    amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
						err_data->err_addr_cnt))
			DRM_WARN("Failed to add ras bad page!\n");

		amdgpu_ras_reset_gpu(adev, 0);
	}

	kfree(err_data->err_addr);
	return AMDGPU_RAS_SUCCESS;
}

+32 −7
Original line number Diff line number Diff line
@@ -75,6 +75,17 @@ static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev)
			RSMU_UMC_INDEX_MODE_EN, 0);
}

/*
 * Read mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU and return the value of
 * its RSMU_UMC_INDEX_INSTANCE field.
 */
static uint32_t umc_v6_1_get_umc_inst(struct amdgpu_device *adev)
{
	/* raw 32-bit contents of the RSMU UMC index register */
	const uint32_t index_reg = RREG32_SOC15(RSMU, 0,
				mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU);

	/* only the instance field is of interest to the caller */
	return REG_GET_FIELD(index_reg,
			     RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
			     RSMU_UMC_INDEX_INSTANCE);
}

static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset,
						   unsigned long *error_count)
@@ -165,7 +176,8 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
					 uint32_t umc_reg_offset, uint32_t channel_index)
{
	uint32_t lsb, mc_umc_status_addr;
	uint64_t mc_umc_status, err_addr;
	uint64_t mc_umc_status, err_addr, retired_page;
	struct eeprom_table_record *err_rec;

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
@@ -177,6 +189,7 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
		return;
	}

	err_rec = &err_data->err_addr[err_data->err_addr_cnt];
	mc_umc_status = RREG64_UMC(mc_umc_status_addr + umc_reg_offset);

	/* calculate error address if ue/ce error is detected */
@@ -191,13 +204,25 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
		err_addr &= ~((0x1ULL << lsb) - 1);

		/* translate umc channel address to soc pa, 3 parts are included */
		err_data->err_addr[err_data->err_addr_cnt] =
						ADDR_OF_8KB_BLOCK(err_addr) |
		retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* we only save ue error information currently, ce is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
				== 1) {
			err_rec->address = err_addr;
			/* page frame address is saved */
			err_rec->retired_page = retired_page >> PAGE_SHIFT;
			err_rec->ts = (uint64_t)ktime_get_real_seconds();
			err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
			err_rec->cu = 0;
			err_rec->mem_channel = channel_index;
			err_rec->mcumc_id = umc_v6_1_get_umc_inst(adev);

			err_data->err_addr_cnt++;
		}
	}

	/* clear umc status */
	WREG64_UMC(mc_umc_status_addr + umc_reg_offset, 0x0ULL);