Commit 1679c7ee authored by Ofir Bitton's avatar Ofir Bitton Committed by Oded Gabbay
Browse files

habanalabs: expand clock throttling information uAPI



In addition to the clock throttling reason, user should be able
to obtain also the start time and the duration of the throttling
event.

Signed-off-by: default avatarOfir Bitton <obitton@habana.ai>
Reviewed-by: default avatarOded Gabbay <ogabbay@kernel.org>
Signed-off-by: default avatarOded Gabbay <ogabbay@kernel.org>
parent 48f31169
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -455,6 +455,7 @@ static int device_early_init(struct hl_device *hdev)
	INIT_LIST_HEAD(&hdev->fpriv_list);
	mutex_init(&hdev->fpriv_list_lock);
	atomic_set(&hdev->in_reset, 0);
	mutex_init(&hdev->clk_throttling.lock);

	return 0;

@@ -495,6 +496,8 @@ static void device_early_fini(struct hl_device *hdev)

	mutex_destroy(&hdev->fpriv_list_lock);

	mutex_destroy(&hdev->clk_throttling.lock);

	hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);

	kfree(hdev->hl_chip_info);
+29 −2
Original line number Diff line number Diff line
@@ -2378,6 +2378,32 @@ struct multi_cs_data {
	u8		update_ts;
};

/**
 * struct hl_clk_throttle_timestamp - current/last clock throttling timestamp
 * @start: timestamp taken when 'start' event is received in driver
 * @end: timestamp taken when 'end' event is received in driver
 */
struct hl_clk_throttle_timestamp {
	ktime_t		start;
	ktime_t		end;
};

/**
 * struct hl_clk_throttle - keeps current/last clock throttling timestamps
 * @timestamp: timestamp taken by driver and firmware, index 0 refers to POWER
 *             index 1 refers to THERMAL
 * @lock: protects this structure as it can be accessed from both event queue
 *        context and info_ioctl context
 * @current_reason: bitmask represents the current clk throttling reasons
 * @aggregated_reason: bitmask represents aggregated clk throttling reasons since driver load
 */
struct hl_clk_throttle {
	struct hl_clk_throttle_timestamp timestamp[HL_CLK_THROTTLE_TYPE_MAX];
	struct mutex	lock;
	u32		current_reason;
	u32		aggregated_reason;
};

/**
 * struct hl_device - habanalabs device structure.
 * @pdev: pointer to PCI device, can be NULL in case of simulator device.
@@ -2445,6 +2471,7 @@ struct multi_cs_data {
 * @pci_mem_region: array of memory regions in the PCI
 * @state_dump_specs: constants and dictionaries needed to dump system state.
 * @multi_cs_completion: array of multi-CS completion.
 * @clk_throttling: holds information about current/previous clock throttling events
 * @dram_used_mem: current DRAM memory consumption.
 * @timeout_jiffies: device CS timeout value.
 * @max_power: the max power of the device, as configured by the sysadmin. This
@@ -2474,7 +2501,6 @@ struct multi_cs_data {
 * @high_pll: high PLL profile frequency.
 * @soft_reset_cnt: number of soft reset since the driver was loaded.
 * @hard_reset_cnt: number of hard reset since the driver was loaded.
 * @clk_throttling_reason: bitmask represents the current clk throttling reasons
 * @id: device minor.
 * @id_control: minor of the control device
 * @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
@@ -2604,6 +2630,8 @@ struct hl_device {

	struct multi_cs_completion	multi_cs_completion[
							MULTI_CS_MAX_USER_CTX];
	struct hl_clk_throttle		clk_throttling;

	u32				*stream_master_qid_arr;
	atomic64_t			dram_used_mem;
	u64				timeout_jiffies;
@@ -2622,7 +2650,6 @@ struct hl_device {
	u32				high_pll;
	u32				soft_reset_cnt;
	u32				hard_reset_cnt;
	u32				clk_throttling_reason;
	u16				id;
	u16				id_control;
	u16				cpu_pci_msb_addr;
+25 −2
Original line number Diff line number Diff line
@@ -313,15 +313,38 @@ static int pci_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)

static int clk_throttle_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
	struct hl_device *hdev = hpriv->hdev;
	struct hl_info_clk_throttle clk_throttle = {0};
	ktime_t end_time, zero_time = ktime_set(0, 0);
	u32 max_size = args->return_size;
	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
	int i;

	if ((!max_size) || (!out))
		return -EINVAL;

	clk_throttle.clk_throttling_reason = hdev->clk_throttling_reason;
	mutex_lock(&hdev->clk_throttling.lock);

	clk_throttle.clk_throttling_reason = hdev->clk_throttling.current_reason;

	for (i = 0 ; i < HL_CLK_THROTTLE_TYPE_MAX ; i++) {
		if (!(hdev->clk_throttling.aggregated_reason & BIT(i)))
			continue;

		clk_throttle.clk_throttling_timestamp_us[i] =
			ktime_to_us(hdev->clk_throttling.timestamp[i].start);

		if (ktime_compare(hdev->clk_throttling.timestamp[i].end, zero_time))
			end_time = ktime_get();
		else
			end_time = hdev->clk_throttling.timestamp[i].end;

		clk_throttle.clk_throttling_duration_ns[i] =
			ktime_to_ns(ktime_sub(end_time,
				hdev->clk_throttling.timestamp[i].start));

	}
	mutex_unlock(&hdev->clk_throttling.lock);

	return copy_to_user(out, &clk_throttle,
		min((size_t) max_size, sizeof(clk_throttle))) ? -EFAULT : 0;
+18 −4
Original line number Diff line number Diff line
@@ -7925,27 +7925,39 @@ static int tpc_krn_event_to_tpc_id(u16 tpc_dec_event_type)
static void gaudi_print_clk_change_info(struct hl_device *hdev,
					u16 event_type)
{
	ktime_t zero_time = ktime_set(0, 0);

	mutex_lock(&hdev->clk_throttling.lock);

	switch (event_type) {
	case GAUDI_EVENT_FIX_POWER_ENV_S:
		hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
		hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_POWER;
		hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_POWER;
		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get();
		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time;
		dev_info_ratelimited(hdev->dev,
			"Clock throttling due to power consumption\n");
		break;

	case GAUDI_EVENT_FIX_POWER_ENV_E:
		hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
		hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_POWER;
		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get();
		dev_info_ratelimited(hdev->dev,
			"Power envelop is safe, back to optimal clock\n");
		break;

	case GAUDI_EVENT_FIX_THERMAL_ENV_S:
		hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
		hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_THERMAL;
		hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL;
		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get();
		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time;
		dev_info_ratelimited(hdev->dev,
			"Clock throttling due to overheating\n");
		break;

	case GAUDI_EVENT_FIX_THERMAL_ENV_E:
		hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
		hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL;
		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get();
		dev_info_ratelimited(hdev->dev,
			"Thermal envelop is safe, back to optimal clock\n");
		break;
@@ -7955,6 +7967,8 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev,
			event_type);
		break;
	}

	mutex_unlock(&hdev->clk_throttling.lock);
}

static void gaudi_handle_eqe(struct hl_device *hdev,
+21 −4
Original line number Diff line number Diff line
@@ -4768,24 +4768,39 @@ static int goya_unmask_irq(struct hl_device *hdev, u16 event_type)

static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type)
{
	ktime_t zero_time = ktime_set(0, 0);

	mutex_lock(&hdev->clk_throttling.lock);

	switch (event_type) {
	case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_S:
		hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
		hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_POWER;
		hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_POWER;
		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get();
		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time;
		dev_info_ratelimited(hdev->dev,
			"Clock throttling due to power consumption\n");
		break;

	case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_E:
		hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
		hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_POWER;
		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get();
		dev_info_ratelimited(hdev->dev,
			"Power envelop is safe, back to optimal clock\n");
		break;

	case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_S:
		hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
		hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_THERMAL;
		hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL;
		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get();
		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time;
		dev_info_ratelimited(hdev->dev,
			"Clock throttling due to overheating\n");
		break;

	case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E:
		hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
		hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL;
		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get();
		dev_info_ratelimited(hdev->dev,
			"Thermal envelop is safe, back to optimal clock\n");
		break;
@@ -4795,6 +4810,8 @@ static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type)
			event_type);
		break;
	}

	mutex_unlock(&hdev->clk_throttling.lock);
}

void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
Loading