Commit 4cba5473 authored by Mauro Carvalho Chehab's avatar Mauro Carvalho Chehab
Browse files

media: venus: Rework error fail recover logic



The Venus code has a sort of watchdog that attempts to recover
from IP errors, implemented as a delayed work job, which
calls venus_sys_error_handler().

Right now, it has several issues:

1. It assumes that PM runtime resume never fails

2. It internally runs two while() loops that also assume that
   PM runtime will never fail to go idle:

	while (pm_runtime_active(core->dev_dec) || pm_runtime_active(core->dev_enc))
		msleep(10);

...

	while (core->pmdomains[0] && pm_runtime_active(core->pmdomains[0]))
		usleep_range(1000, 1500);

3. It uses an OR to merge all return codes and then report to the user

4. If the hardware never recovers, it keeps running on every 10ms,
   flooding the syslog with 2 messages (so, up to 200 messages
   per second).

Rework the code, in order to prevent that, by:

1. check the return code from PM runtime resume;
2. don't let the while() loops run forever;
3. store the failed event;
4. use warn ratelimited when it fails to recover.

Fixes: af2c3834 ("[media] media: venus: adding core part and helper functions")
Reviewed-by: default avatarJonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: default avatarMauro Carvalho Chehab <mchehab+huawei@kernel.org>
parent e90812c4
Loading
Loading
Loading
Loading
+47 −13
Original line number Diff line number Diff line
@@ -78,22 +78,32 @@ static const struct hfi_core_ops venus_core_ops = {
	.event_notify = venus_event_notify,
};

#define RPM_WAIT_FOR_IDLE_MAX_ATTEMPTS 10

static void venus_sys_error_handler(struct work_struct *work)
{
	struct venus_core *core =
			container_of(work, struct venus_core, work.work);
	int ret = 0;

	pm_runtime_get_sync(core->dev);
	int ret, i, max_attempts = RPM_WAIT_FOR_IDLE_MAX_ATTEMPTS;
	const char *err_msg = "";
	bool failed = false;

	ret = pm_runtime_get_sync(core->dev);
	if (ret < 0) {
		err_msg = "resume runtime PM";
		max_attempts = 0;
		failed = true;
	}

	hfi_core_deinit(core, true);

	dev_warn(core->dev, "system error has occurred, starting recovery!\n");

	mutex_lock(&core->lock);

	while (pm_runtime_active(core->dev_dec) || pm_runtime_active(core->dev_enc))
	for (i = 0; i < max_attempts; i++) {
		if (!pm_runtime_active(core->dev_dec) && !pm_runtime_active(core->dev_enc))
			break;
		msleep(10);
	}

	venus_shutdown(core);

@@ -101,31 +111,55 @@ static void venus_sys_error_handler(struct work_struct *work)

	pm_runtime_put_sync(core->dev);

	while (core->pmdomains[0] && pm_runtime_active(core->pmdomains[0]))
	for (i = 0; i < max_attempts; i++) {
		if (!core->pmdomains[0] || !pm_runtime_active(core->pmdomains[0]))
			break;
		usleep_range(1000, 1500);
	}

	hfi_reinit(core);

	pm_runtime_get_sync(core->dev);
	ret = pm_runtime_get_sync(core->dev);
	if (ret < 0) {
		err_msg = "resume runtime PM";
		failed = true;
	}

	ret |= venus_boot(core);
	ret |= hfi_core_resume(core, true);
	ret = venus_boot(core);
	if (ret && !failed) {
		err_msg = "boot Venus";
		failed = true;
	}

	ret = hfi_core_resume(core, true);
	if (ret && !failed) {
		err_msg = "resume HFI";
		failed = true;
	}

	enable_irq(core->irq);

	mutex_unlock(&core->lock);

	ret |= hfi_core_init(core);
	ret = hfi_core_init(core);
	if (ret && !failed) {
		err_msg = "init HFI";
		failed = true;
	}

	pm_runtime_put_sync(core->dev);

	if (ret) {
	if (failed) {
		disable_irq_nosync(core->irq);
		dev_warn(core->dev, "recovery failed (%d)\n", ret);
		dev_warn_ratelimited(core->dev,
				     "System error has occurred, recovery failed to %s\n",
				     err_msg);
		schedule_delayed_work(&core->work, msecs_to_jiffies(10));
		return;
	}

	dev_warn(core->dev, "system error has occurred (recovered)\n");

	mutex_lock(&core->lock);
	core->sys_error = false;
	mutex_unlock(&core->lock);