From 71b1bbdbf06848095a4f73b9e3a5a4eab6b4985f Mon Sep 17 00:00:00 2001
From: Oleksandr Shchur <shchuro@amazon.com>
Date: Fri, 15 May 2026 13:30:56 +0000
Subject: [PATCH 1/9] Refactor metrics to take pre-built numpy arrays instead
 of Dataset objects

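Build y_true [N, D, H], y_pred [N, D, H], q_pred [N, D, H, Q], and
y_past [total_T, D] + indptr [N+1] once per window in compute_metrics,
then pass them to every metric.compute() call. This avoids repeated
column access and array construction per metric per dimension. Each
metric now averages its per-dimension scores internally, replacing the
per-column loop that compute_metrics used to run.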
---
 src/fev/metrics.py   | 375 ++++++++++++++++++++++---------------------
 src/fev/task.py      |  45 ++++--
 test/test_metrics.py |  19 ++-
 3 files changed, 243 insertions(+), 196 deletions(-)

diff --git a/src/fev/metrics.py b/src/fev/metrics.py
index 965c316..ae771f6 100644
--- a/src/fev/metrics.py
+++ b/src/fev/metrics.py
@@ -1,10 +1,7 @@
 from typing import Any, Callable, Type
 
-import datasets
 import numpy as np
 
-from fev.constants import PREDICTIONS
-
 MetricConfig = str | dict[str, Any]
 
 
@@ -23,21 +20,37 @@ def _safemean(arr: np.ndarray) -> float:
         """Compute mean of an array, ignoring NaN, Inf, and -Inf values."""
         return float(np.mean(arr[np.isfinite(arr)]))
 
-    @staticmethod
-    def _get_y_test(test_data: datasets.Dataset, target_column: str) -> np.ndarray:
-        """ "Return array of shape [len(test_data), horizon] with ground truth values in float64 dtype."""
-        return np.array(test_data[target_column], dtype=np.float64)
-
     def compute(
         self,
         *,
-        test_data: datasets.Dataset,
-        predictions: datasets.Dataset,
-        past_data: datasets.Dataset,
+        y_true: np.ndarray,
+        y_pred: np.ndarray,
+        y_past: np.ndarray,
+        y_past_indptr: np.ndarray,
+        q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
-        target_column: str = "target",
     ) -> float:
+        """Compute the metric score.
+
+        Parameters
+        ----------
+        y_true : np.ndarray [N, D, H]
+            Ground truth values (N items, D target dims, H horizon steps).
+        y_pred : np.ndarray [N, D, H]
+            Point forecast predictions.
+        y_past : np.ndarray [total_T, D]
+            Concatenated past observations for all items. Use y_past_indptr to
+            slice per item: item i has y_past[indptr[i]:indptr[i+1], :].
+        y_past_indptr : np.ndarray [N+1]
+            CSR-style index pointer into y_past.
+        q_pred : np.ndarray [N, D, H, Q]
+            Quantile predictions. Empty (Q=0) if no quantiles were requested.
+        seasonality : int
+            Seasonal period used for scaled error metrics.
+        quantile_levels : list[float]
+            Quantile levels corresponding to q_pred's last axis.
+        """
         raise NotImplementedError
 
 
@@ -65,16 +78,16 @@ class MAE(Metric):
     def compute(
         self,
         *,
-        test_data: datasets.Dataset,
-        predictions: datasets.Dataset,
-        past_data: datasets.Dataset,
+        y_true: np.ndarray,
+        y_pred: np.ndarray,
+        y_past: np.ndarray,
+        y_past_indptr: np.ndarray,
+        q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
-        target_column: str = "target",
-    ):
-        y_test = self._get_y_test(test_data, target_column=target_column)
-        y_pred = np.array(predictions[PREDICTIONS])
-        return np.nanmean(np.abs(y_test - y_pred))
+    ) -> float:
+        per_dim = np.nanmean(np.abs(y_true - y_pred), axis=(0, 2))  # [D]
+        return float(np.mean(per_dim))
 
 
 class WAPE(Metric):
@@ -86,17 +99,18 @@ def __init__(self, epsilon: float = 0.0) -> None:
     def compute(
         self,
         *,
-        test_data: datasets.Dataset,
-        predictions: datasets.Dataset,
-        past_data: datasets.Dataset,
+        y_true: np.ndarray,
+        y_pred: np.ndarray,
+        y_past: np.ndarray,
+        y_past_indptr: np.ndarray,
+        q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
-        target_column: str = "target",
-    ):
-        y_test = self._get_y_test(test_data, target_column=target_column)
-        y_pred = np.array(predictions[PREDICTIONS])
-
-        return np.nanmean(np.abs(y_test - y_pred)) / max(self.epsilon, np.nanmean(np.abs(y_test)))
+    ) -> float:
+        abs_err_per_dim = np.nanmean(np.abs(y_true - y_pred), axis=(0, 2))  # [D]
+        abs_true_per_dim = np.nanmean(np.abs(y_true), axis=(0, 2))  # [D]
+        per_dim = abs_err_per_dim / np.maximum(abs_true_per_dim, self.epsilon)
+        return float(np.mean(per_dim))
 
 
 class MASE(Metric):
@@ -113,21 +127,20 @@ def __init__(self, epsilon: float = 0.0) -> None:
     def compute(
         self,
         *,
-        test_data: datasets.Dataset,
-        predictions: datasets.Dataset,
-        past_data: datasets.Dataset,
+        y_true: np.ndarray,
+        y_pred: np.ndarray,
+        y_past: np.ndarray,
+        y_past_indptr: np.ndarray,
+        q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
-        target_column: str = "target",
-    ):
-        y_test = self._get_y_test(test_data, target_column=target_column)
-        y_pred = np.array(predictions[PREDICTIONS])
-
-        seasonal_error = _abs_seasonal_error_per_item(
-            past_data=past_data, seasonality=seasonality, target_column=target_column
-        )
+    ) -> float:
+        seasonal_error = _abs_seasonal_error(y_past=y_past, indptr=y_past_indptr, seasonality=seasonality)  # [N, D]
         seasonal_error = np.clip(seasonal_error, self.epsilon, None)
-        return self._safemean(np.abs(y_test - y_pred) / seasonal_error[:, None])
+        # Per-dim MASE: safemean over [N, H] for each dim d, then average across D
+        scaled = np.abs(y_true - y_pred) / seasonal_error[:, :, None]  # [N, D, H]
+        per_dim = np.array([self._safemean(scaled[:, d, :]) for d in range(y_true.shape[1])])
+        return float(np.mean(per_dim))
 
 
 class RMSE(Metric):
@@ -136,16 +149,16 @@ class RMSE(Metric):
     def compute(
         self,
         *,
-        test_data: datasets.Dataset,
-        predictions: datasets.Dataset,
-        past_data: datasets.Dataset,
+        y_true: np.ndarray,
+        y_pred: np.ndarray,
+        y_past: np.ndarray,
+        y_past_indptr: np.ndarray,
+        q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
-        target_column: str = "target",
-    ):
-        y_test = self._get_y_test(test_data, target_column=target_column)
-        y_pred = np.array(predictions[PREDICTIONS])
-        return np.sqrt(np.nanmean((y_test - y_pred) ** 2))
+    ) -> float:
+        per_dim = np.sqrt(np.nanmean((y_true - y_pred) ** 2, axis=(0, 2)))  # [D]
+        return float(np.mean(per_dim))
 
 
 class RMSSE(Metric):
@@ -162,20 +175,19 @@ def __init__(self, epsilon: float = 0.0) -> None:
     def compute(
         self,
         *,
-        test_data: datasets.Dataset,
-        predictions: datasets.Dataset,
-        past_data: datasets.Dataset,
+        y_true: np.ndarray,
+        y_pred: np.ndarray,
+        y_past: np.ndarray,
+        y_past_indptr: np.ndarray,
+        q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
-        target_column: str = "target",
-    ):
-        y_test = self._get_y_test(test_data, target_column=target_column)
-        y_pred = np.array(predictions[PREDICTIONS])
-        seasonal_error = _squared_seasonal_error_per_item(
-            past_data, seasonality=seasonality, target_column=target_column
-        )
+    ) -> float:
+        seasonal_error = _squared_seasonal_error(y_past=y_past, indptr=y_past_indptr, seasonality=seasonality)  # [N, D]
         seasonal_error = np.clip(seasonal_error, self.epsilon, None)
-        return np.sqrt(self._safemean((y_test - y_pred) ** 2 / seasonal_error[:, None]))
+        scaled = (y_true - y_pred) ** 2 / seasonal_error[:, :, None]  # [N, D, H]
+        per_dim = np.array([np.sqrt(self._safemean(scaled[:, d, :])) for d in range(y_true.shape[1])])
+        return float(np.mean(per_dim))
 
 
 class MSE(Metric):
@@ -184,16 +196,16 @@ class MSE(Metric):
     def compute(
         self,
         *,
-        test_data: datasets.Dataset,
-        predictions: datasets.Dataset,
-        past_data: datasets.Dataset,
+        y_true: np.ndarray,
+        y_pred: np.ndarray,
+        y_past: np.ndarray,
+        y_past_indptr: np.ndarray,
+        q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
-        target_column: str = "target",
-    ):
-        y_test = self._get_y_test(test_data, target_column=target_column)
-        y_pred = np.array(predictions[PREDICTIONS])
-        return np.nanmean((y_test - y_pred) ** 2)
+    ) -> float:
+        per_dim = np.nanmean((y_true - y_pred) ** 2, axis=(0, 2))  # [D]
+        return float(np.mean(per_dim))
 
 
 class RMSLE(Metric):
@@ -202,16 +214,16 @@ class RMSLE(Metric):
     def compute(
         self,
         *,
-        test_data: datasets.Dataset,
-        predictions: datasets.Dataset,
-        past_data: datasets.Dataset,
+        y_true: np.ndarray,
+        y_pred: np.ndarray,
+        y_past: np.ndarray,
+        y_past_indptr: np.ndarray,
+        q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
-        target_column: str = "target",
-    ):
-        y_test = self._get_y_test(test_data, target_column=target_column)
-        y_pred = np.array(predictions[PREDICTIONS])
-        return np.sqrt(np.nanmean((np.log1p(y_test) - np.log1p(y_pred)) ** 2))
+    ) -> float:
+        per_dim = np.sqrt(np.nanmean((np.log1p(y_true) - np.log1p(y_pred)) ** 2, axis=(0, 2)))  # [D]
+        return float(np.mean(per_dim))
 
 
 class MAPE(Metric):
@@ -220,17 +232,17 @@ class MAPE(Metric):
     def compute(
         self,
         *,
-        test_data: datasets.Dataset,
-        predictions: datasets.Dataset,
-        past_data: datasets.Dataset,
+        y_true: np.ndarray,
+        y_pred: np.ndarray,
+        y_past: np.ndarray,
+        y_past_indptr: np.ndarray,
+        q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
-        target_column: str = "target",
-    ):
-        y_test = self._get_y_test(test_data, target_column=target_column)
-        y_pred = np.array(predictions[PREDICTIONS])
-        ratio = np.abs(y_test - y_pred) / np.abs(y_test)
-        return self._safemean(ratio)
+    ) -> float:
+        ratio = np.abs(y_true - y_pred) / np.abs(y_true)  # [N, D, H]
+        per_dim = np.array([self._safemean(ratio[:, d, :]) for d in range(y_true.shape[1])])
+        return float(np.mean(per_dim))
 
 
 class SMAPE(Metric):
@@ -239,16 +251,17 @@ class SMAPE(Metric):
     def compute(
         self,
         *,
-        test_data: datasets.Dataset,
-        predictions: datasets.Dataset,
-        past_data: datasets.Dataset,
+        y_true: np.ndarray,
+        y_pred: np.ndarray,
+        y_past: np.ndarray,
+        y_past_indptr: np.ndarray,
+        q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
-        target_column: str = "target",
-    ):
-        y_test = self._get_y_test(test_data, target_column=target_column)
-        y_pred = np.array(predictions[PREDICTIONS])
-        return self._safemean(2 * np.abs(y_test - y_pred) / (np.abs(y_test) + np.abs(y_pred)))
+    ) -> float:
+        val = 2 * np.abs(y_true - y_pred) / (np.abs(y_true) + np.abs(y_pred))  # [N, D, H]
+        per_dim = np.array([self._safemean(val[:, d, :]) for d in range(y_true.shape[1])])
+        return float(np.mean(per_dim))
 
 
 class MQL(Metric):
@@ -259,22 +272,19 @@ class MQL(Metric):
     def compute(
         self,
         *,
-        test_data: datasets.Dataset,
-        predictions: datasets.Dataset,
-        past_data: datasets.Dataset,
+        y_true: np.ndarray,
+        y_pred: np.ndarray,
+        y_past: np.ndarray,
+        y_past_indptr: np.ndarray,
+        q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
-        target_column: str = "target",
-    ):
-        if quantile_levels is None or len(quantile_levels) == 0:
-            raise ValueError(f"{self.__class__.__name__} cannot be computed if quantile_levels is None")
-        ql = _quantile_loss(
-            test_data=test_data,
-            predictions=predictions,
-            quantile_levels=quantile_levels,
-            target_column=target_column,
-        )
-        return np.nanmean(ql)
+    ) -> float:
+        if len(quantile_levels) == 0:
+            raise ValueError(f"{self.__class__.__name__} cannot be computed without quantile_levels")
+        ql = _quantile_loss(y_true=y_true, q_pred=q_pred, quantile_levels=quantile_levels)  # [N, D, H, Q]
+        per_dim = np.nanmean(ql, axis=(0, 2, 3))  # [D]
+        return float(np.mean(per_dim))
 
 
 class SQL(Metric):
@@ -293,25 +303,21 @@ def __init__(self, epsilon: float = 0.0) -> None:
     def compute(
         self,
         *,
-        test_data: datasets.Dataset,
-        predictions: datasets.Dataset,
-        past_data: datasets.Dataset,
+        y_true: np.ndarray,
+        y_pred: np.ndarray,
+        y_past: np.ndarray,
+        y_past_indptr: np.ndarray,
+        q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
-        target_column: str = "target",
-    ):
-        ql = _quantile_loss(
-            test_data=test_data,
-            predictions=predictions,
-            quantile_levels=quantile_levels,
-            target_column=target_column,
-        )
-        ql_per_time_step = np.nanmean(ql, axis=2)  # [num_items, horizon]
-        seasonal_error = _abs_seasonal_error_per_item(
-            past_data=past_data, seasonality=seasonality, target_column=target_column
-        )
+    ) -> float:
+        ql = _quantile_loss(y_true=y_true, q_pred=q_pred, quantile_levels=quantile_levels)  # [N, D, H, Q]
+        ql_avg_q = np.nanmean(ql, axis=3)  # [N, D, H] — average over quantiles
+        seasonal_error = _abs_seasonal_error(y_past=y_past, indptr=y_past_indptr, seasonality=seasonality)  # [N, D]
         seasonal_error = np.clip(seasonal_error, self.epsilon, None)
-        return self._safemean(ql_per_time_step / seasonal_error[:, None])
+        scaled = ql_avg_q / seasonal_error[:, :, None]  # [N, D, H]
+        per_dim = np.array([self._safemean(scaled[:, d, :]) for d in range(y_true.shape[1])])
+        return float(np.mean(per_dim))
 
 
 class WQL(Metric):
@@ -325,94 +331,105 @@ def __init__(self, epsilon: float = 0.0) -> None:
     def compute(
         self,
         *,
-        test_data: datasets.Dataset,
-        predictions: datasets.Dataset,
-        past_data: datasets.Dataset,
+        y_true: np.ndarray,
+        y_pred: np.ndarray,
+        y_past: np.ndarray,
+        y_past_indptr: np.ndarray,
+        q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
-        target_column: str = "target",
-    ):
-        ql = _quantile_loss(
-            test_data=test_data,
-            predictions=predictions,
-            quantile_levels=quantile_levels,
-            target_column=target_column,
-        )
-        return np.nanmean(ql) / max(self.epsilon, np.nanmean(np.abs(np.array(test_data[target_column]))))
+    ) -> float:
+        ql = _quantile_loss(y_true=y_true, q_pred=q_pred, quantile_levels=quantile_levels)  # [N, D, H, Q]
+        ql_per_dim = np.nanmean(ql, axis=(0, 2, 3))  # [D]
+        abs_true_per_dim = np.nanmean(np.abs(y_true), axis=(0, 2))  # [D]
+        per_dim = ql_per_dim / np.maximum(abs_true_per_dim, self.epsilon)
+        return float(np.mean(per_dim))
 
 
 def _quantile_loss(
     *,
-    test_data: datasets.Dataset,
-    predictions: datasets.Dataset,
+    y_true: np.ndarray,
+    q_pred: np.ndarray,
     quantile_levels: list[float],
-    target_column: str,
-):
-    """Compute quantile loss for each observation"""
-    pred_per_quantile = []
-    for q in quantile_levels:
-        pred_per_quantile.append(np.array(predictions[str(q)]))
-    q_pred = np.stack(pred_per_quantile, axis=-1)  # [num_series, horizon, len(quantile_levels)]
-    y_test = Metric._get_y_test(test_data, target_column=target_column)[..., None]  # [num_series, horizon, 1]
-    assert y_test.shape[:-1] == q_pred.shape[:-1]
-    return 2 * np.abs((y_test - q_pred) * ((y_test <= q_pred) - np.array(quantile_levels)))
-
-
-def _seasonal_error_per_item(
-    arrays: list[np.ndarray],
+) -> np.ndarray:
+    """Compute quantile loss.
+
+    Returns
+    -------
+    np.ndarray [N, D, H, Q]
+    """
+    y_true_expanded = y_true[..., None]  # [N, D, H, 1]
+    q_arr = np.array(quantile_levels)  # [Q]
+    return 2 * np.abs((y_true_expanded - q_pred) * ((y_true_expanded <= q_pred) - q_arr))
+
+
+def _seasonal_error(
+    *,
+    y_past: np.ndarray,
+    indptr: np.ndarray,
     seasonality: int,
     aggregate_fn: Callable,
 ) -> np.ndarray:
-    """Compute seasonal error for each time series using vectorized operations.
-
-    Uses bincount with weights to efficiently compute per-series aggregations.
+    """Compute seasonal error for each (item, dim) pair.
+
+    Parameters
+    ----------
+    y_past : np.ndarray [total_T, D]
+        Concatenated past observations.
+    indptr : np.ndarray [N+1]
+        CSR-style index pointer. Item i has y_past[indptr[i]:indptr[i+1], :].
+    seasonality : int
+        Seasonal period.
+    aggregate_fn : Callable
+        Applied element-wise to seasonal diffs (e.g. np.abs or np.square).
+
+    Returns
+    -------
+    np.ndarray [N, D]
     """
-    num_series = len(arrays)
+    num_series = len(indptr) - 1
+    num_dims = y_past.shape[1]
+
     if num_series == 0:
-        return np.array([], dtype="float64")
+        return np.array([], dtype="float64").reshape(0, 0)
 
-    lengths = np.array([a.size for a in arrays], dtype=np.int64)
+    lengths = np.diff(indptr)
     num_diffs_per_series = np.maximum(lengths - seasonality, 0)
 
     if num_diffs_per_series.sum() == 0:
-        return np.full(num_series, np.nan, dtype="float64")
-
-    flat = np.concatenate(arrays).astype("float64")
-    series_starts = np.concatenate([[0], np.cumsum(lengths[:-1])])
+        return np.full((num_series, num_dims), np.nan, dtype="float64")
 
-    # Build indices for all (t, t-seasonality) pairs across all series
     total_diffs = int(num_diffs_per_series.sum())
     series_ids = np.repeat(np.arange(num_series, dtype=np.int64), num_diffs_per_series)
     diff_offsets = np.arange(total_diffs) - np.repeat(
         np.cumsum(num_diffs_per_series) - num_diffs_per_series, num_diffs_per_series
     )
 
-    idx_current = series_starts[series_ids] + seasonality + diff_offsets
+    idx_current = indptr[series_ids] + seasonality + diff_offsets
     idx_lagged = idx_current - seasonality
 
-    diffs = flat[idx_current] - flat[idx_lagged]
-    errors = aggregate_fn(diffs)
+    diffs = y_past[idx_current] - y_past[idx_lagged]  # [total_diffs, D]
+    errors = aggregate_fn(diffs)  # [total_diffs, D]
 
-    # Compute per-series nanmean via bincount
-    valid = ~np.isnan(errors)
-    sums = np.bincount(series_ids, weights=np.where(valid, errors, 0.0), minlength=num_series)
-    counts = np.bincount(series_ids, weights=valid.astype("float64"), minlength=num_series)
+    valid = ~np.isnan(errors)  # [total_diffs, D]
+    result = np.full((num_series, num_dims), np.nan, dtype="float64")
+    for d in range(num_dims):
+        sums = np.bincount(series_ids, weights=np.where(valid[:, d], errors[:, d], 0.0), minlength=num_series)
+        counts = np.bincount(series_ids, weights=valid[:, d].astype("float64"), minlength=num_series)
+        mask = counts > 0
+        result[mask, d] = sums[mask] / counts[mask]
 
-    result = np.full(num_series, np.nan, dtype="float64")
-    np.divide(sums, counts, out=result, where=counts > 0)
     return result
 
 
-def _abs_seasonal_error_per_item(past_data: datasets.Dataset, seasonality: int, target_column: str) -> np.ndarray:
-    """Compute mean absolute seasonal error for each time series in past_data."""
-    arrays = past_data.with_format("numpy")[target_column]
-    return _seasonal_error_per_item(arrays, seasonality, aggregate_fn=np.abs)
+def _abs_seasonal_error(*, y_past: np.ndarray, indptr: np.ndarray, seasonality: int) -> np.ndarray:
+    """Compute mean absolute seasonal error. Returns [N, D]."""
+    return _seasonal_error(y_past=y_past, indptr=indptr, seasonality=seasonality, aggregate_fn=np.abs)
 
 
-def _squared_seasonal_error_per_item(past_data: datasets.Dataset, seasonality: int, target_column: str) -> np.ndarray:
-    """Compute mean squared seasonal error for each time series in past_data."""
-    arrays = past_data.with_format("numpy")[target_column]
-    return _seasonal_error_per_item(arrays, seasonality, aggregate_fn=np.square)
+def _squared_seasonal_error(*, y_past: np.ndarray, indptr: np.ndarray, seasonality: int) -> np.ndarray:
+    """Compute mean squared seasonal error. Returns [N, D]."""
+    return _seasonal_error(y_past=y_past, indptr=indptr, seasonality=seasonality, aggregate_fn=np.square)
 
 
 AVAILABLE_METRICS: dict[str, Type[Metric]] = {
diff --git a/src/fev/task.py b/src/fev/task.py
index b1cd54c..ae0bc6c 100644
--- a/src/fev/task.py
+++ b/src/fev/task.py
@@ -146,23 +146,42 @@ def compute_metrics(
                     f"match the length of test data ({len(test_data)})"
                 )
 
+        # y_true [N, D, H], y_pred [N, D, H]
+        y_true = np.stack([test_data[col] for col in self.target_columns], axis=1, dtype=np.float64)
+        y_pred = np.stack([predictions[col][PREDICTIONS] for col in self.target_columns], axis=1, dtype=np.float64)
+
+        # q_pred [N, D, H, Q]
+        if quantile_levels:
+            q_pred = np.stack(
+                [
+                    np.stack([predictions[col][str(q)] for q in quantile_levels], axis=-1)
+                    for col in self.target_columns
+                ],
+                axis=1,
+                dtype=np.float64,
+            )
+        else:
+            q_pred = np.empty((*y_true.shape, 0), dtype=np.float64)
+
+        # y_past [total_T, D] + indptr [N+1] — CSR-style ragged layout
+        past_per_dim = [np.concatenate(past_data[col]) for col in self.target_columns]
+        y_past_flat = np.column_stack(past_per_dim).astype(np.float64)
+        indptr = np.zeros(len(past_data) + 1, dtype=np.int64)
+        np.cumsum([len(ts) for ts in past_data[self.target_columns[0]]], out=indptr[1:])
+
         test_scores: dict[str, float] = {}
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore", category=RuntimeWarning)
             for metric in metrics:
-                scores = []
-                for col in self.target_columns:
-                    scores.append(
-                        metric.compute(
-                            test_data=test_data,
-                            predictions=predictions[col],
-                            past_data=past_data,
-                            seasonality=seasonality,
-                            quantile_levels=quantile_levels,
-                            target_column=col,
-                        )
-                    )
-                test_scores[metric.name] = float(np.mean(scores))
+                test_scores[metric.name] = metric.compute(
+                    y_true=y_true,
+                    y_pred=y_pred,
+                    y_past=y_past_flat,
+                    y_past_indptr=indptr,
+                    q_pred=q_pred,
+                    seasonality=seasonality,
+                    quantile_levels=quantile_levels,
+                )
         return test_scores
 
 
diff --git a/test/test_metrics.py b/test/test_metrics.py
index acaea6f..58714db 100644
--- a/test/test_metrics.py
+++ b/test/test_metrics.py
@@ -5,7 +5,7 @@
 from autogluon.timeseries import TimeSeriesPredictor
 
 import fev
-from fev.metrics import AVAILABLE_METRICS, _seasonal_error_per_item
+from fev.metrics import AVAILABLE_METRICS, _seasonal_error
 
 
 # Include datasets with NaN values (nn5) and all-zero history values (covid deaths)
@@ -75,6 +75,15 @@ def _reference_seasonal_error_per_item(arrays, seasonality, aggregate_fn):
     return np.array(result, dtype="float64")
 
 
+def _arrays_to_indptr(arrays):
+    """Helper to convert list of 1D arrays to flat [total_T, 1] + indptr."""
+    lengths = np.array([len(a) for a in arrays], dtype=np.int64)
+    indptr = np.zeros(len(arrays) + 1, dtype=np.int64)
+    np.cumsum(lengths, out=indptr[1:])
+    flat = np.concatenate(arrays).astype(np.float64).reshape(-1, 1) if arrays else np.empty((0, 1), dtype=np.float64)
+    return flat, indptr
+
+
 @pytest.mark.parametrize("aggregate_fn", [np.abs, np.square])
 def test_seasonal_error_per_item(aggregate_fn):
     """Test vectorized impl against reference with mixed edge cases."""
@@ -88,7 +97,8 @@ def test_seasonal_error_per_item(aggregate_fn):
     ]
     seasonality = 2
 
-    result = _seasonal_error_per_item(arrays, seasonality, aggregate_fn)
+    flat, indptr = _arrays_to_indptr(arrays)
+    result = _seasonal_error(y_past=flat, indptr=indptr, seasonality=seasonality, aggregate_fn=aggregate_fn)[:, 0]
     expected = _reference_seasonal_error_per_item(arrays, seasonality, aggregate_fn)
 
     np.testing.assert_allclose(result, expected)
@@ -96,6 +106,7 @@ def test_seasonal_error_per_item(aggregate_fn):
 
 def test_seasonal_error_per_item_empty():
     """Test with empty input."""
-    result = _seasonal_error_per_item([], 2, np.abs)
-    assert len(result) == 0
+    flat, indptr = _arrays_to_indptr([])
+    result = _seasonal_error(y_past=flat, indptr=indptr, seasonality=2, aggregate_fn=np.abs)
+    assert result.size == 0
     assert result.dtype == np.float64

From 6b0bd016a3fad2360e2f2f5b517108bfb8d1887d Mon Sep 17 00:00:00 2001
From: Oleksandr Shchur <shchuro@amazon.com>
Date: Fri, 15 May 2026 13:55:56 +0000
Subject: [PATCH 2/9] Use pyarrow for array construction and lightweight
 validation

- Replace HF Dataset column access with pc.list_flatten + to_numpy when
  building the y_true, y_pred, q_pred, and y_past arrays (~100x faster)
- Replace .cast(schema) + the numpy finiteness check with pyarrow-native
  validation of column names, list lengths, and finiteness (pc.is_finite)
- Total evaluation_summary speedup: 18.6s -> 0.48s (39x) on a 35-dim
  multivariate task with 10 windows
---
 src/fev/task.py | 90 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 59 insertions(+), 31 deletions(-)

diff --git a/src/fev/task.py b/src/fev/task.py
index ae0bc6c..1c7810d 100644
--- a/src/fev/task.py
+++ b/src/fev/task.py
@@ -9,6 +9,7 @@
 import datasets
 import numpy as np
 import pandas as pd
+import pyarrow.compute as pc
 import pydantic
 from pydantic_core import ArgsKwargs
 
@@ -136,8 +137,6 @@ def compute_metrics(
         This is a convenience method that exists for debugging and additional evaluation.
         """
         past_data, _, test_data = self._get_past_future_test_data()
-        past_data.set_format("numpy")
-        test_data.set_format("numpy")
 
         for target_column, predictions_for_column in predictions.items():
             if len(predictions_for_column) != len(test_data):
@@ -146,28 +145,45 @@ def compute_metrics(
                     f"match the length of test data ({len(test_data)})"
                 )
 
-        # y_true [N, D, H], y_pred [N, D, H]
-        y_true = np.stack([test_data[col] for col in self.target_columns], axis=1, dtype=np.float64)
-        y_pred = np.stack([predictions[col][PREDICTIONS] for col in self.target_columns], axis=1, dtype=np.float64)
+        N = len(test_data)
+        D = len(self.target_columns)
+        H = self.horizon
+        Q = len(quantile_levels)
+
+        # y_true [N, D, H] — via pyarrow for fast column access
+        test_table = test_data.data.table
+        y_true = np.stack(
+            [pc.list_flatten(test_table.column(col)).to_numpy(zero_copy_only=False) for col in self.target_columns],
+            axis=1,
+            dtype=np.float64,
+        ).reshape(N, H, D).transpose(0, 2, 1)
+
+        # y_pred [N, D, H]
+        pred_arrs = []
+        for col in self.target_columns:
+            pred_arrs.append(pc.list_flatten(predictions[col].data.table.column(PREDICTIONS)).to_numpy(zero_copy_only=False))
+        y_pred = np.stack(pred_arrs, axis=1, dtype=np.float64).reshape(N, H, D).transpose(0, 2, 1)
 
         # q_pred [N, D, H, Q]
-        if quantile_levels:
-            q_pred = np.stack(
-                [
-                    np.stack([predictions[col][str(q)] for q in quantile_levels], axis=-1)
-                    for col in self.target_columns
-                ],
-                axis=1,
-                dtype=np.float64,
-            )
+        if Q > 0:
+            q_arrs = []
+            for col in self.target_columns:
+                col_table = predictions[col].data.table
+                for q in quantile_levels:
+                    q_arrs.append(pc.list_flatten(col_table.column(str(q))).to_numpy(zero_copy_only=False))
+            q_pred = np.stack(q_arrs, axis=1, dtype=np.float64).reshape(N, H, D, Q).transpose(0, 2, 1, 3)
         else:
-            q_pred = np.empty((*y_true.shape, 0), dtype=np.float64)
+            q_pred = np.empty((N, D, H, 0), dtype=np.float64)
 
         # y_past [total_T, D] + indptr [N+1] — CSR-style ragged layout
-        past_per_dim = [np.concatenate(past_data[col]) for col in self.target_columns]
-        y_past_flat = np.column_stack(past_per_dim).astype(np.float64)
-        indptr = np.zeros(len(past_data) + 1, dtype=np.int64)
-        np.cumsum([len(ts) for ts in past_data[self.target_columns[0]]], out=indptr[1:])
+        past_table = past_data.data.table
+        y_past_flat = np.stack(
+            [pc.list_flatten(past_table.column(col)).to_numpy(zero_copy_only=False) for col in self.target_columns],
+            axis=1,
+            dtype=np.float64,
+        )
+        indptr = np.zeros(N + 1, dtype=np.int64)
+        np.cumsum(pc.list_value_length(past_table.column(self.target_columns[0])).to_numpy(), out=indptr[1:])
 
         test_scores: dict[str, float] = {}
         with warnings.catch_warnings():
@@ -765,21 +781,33 @@ def _to_dataset(preds: datasets.Dataset | list[dict]) -> datasets.Dataset:
             )
         if missing_columns := set(self.target_columns) - set(predictions.keys()):
             raise ValueError(f"Missing predictions for columns {missing_columns} (got {sorted(predictions.keys())})")
-        predictions = predictions.cast(self.predictions_schema).with_format("numpy")
-        for target_column, predictions_for_column in predictions.items():
-            self._assert_all_columns_finite(predictions_for_column)
-        return predictions
 
-    @staticmethod
-    def _assert_all_columns_finite(predictions: datasets.Dataset) -> None:
-        for col in predictions.column_names:
-            nan_row_idx, _ = np.where(~np.isfinite(np.array(predictions[col])))
-            if len(nan_row_idx) > 0:
+        expected_columns = set(self.predictions_schema.keys())
+        for target_col, pred_ds in predictions.items():
+            table = pred_ds.data.table
+            if missing := expected_columns - set(table.column_names):
                 raise ValueError(
-                    "Predictions contain NaN or Inf values. "
-                    f"First invalid value encountered in column {col} for item {nan_row_idx[0]}:\n"
-                    f"{predictions[int(nan_row_idx[0])]}"
+                    f"Predictions for '{target_col}' are missing columns {sorted(missing)}. "
+                    f"Expected: {sorted(expected_columns)}"
                 )
+            lengths = pc.list_value_length(table.column(PREDICTIONS)).to_numpy()
+            if not np.all(lengths == self.horizon):
+                bad_idx = int(np.argmax(lengths != self.horizon))
+                raise ValueError(
+                    f"Predictions for '{target_col}' have wrong length at item {bad_idx}: "
+                    f"got {lengths[bad_idx]}, expected {self.horizon}"
+                )
+            for col in expected_columns:
+                flat = pc.list_flatten(table.column(col))
+                if not pc.all(pc.is_finite(flat)).as_py():
+                    flat_np = flat.to_numpy(zero_copy_only=False)
+                    bad_flat_idx = int(np.argmax(~np.isfinite(flat_np)))
+                    bad_item = bad_flat_idx // self.horizon
+                    raise ValueError(
+                        f"Predictions contain NaN or Inf values. "
+                        f"First invalid value in column '{col}' for target '{target_col}' at item {bad_item}."
+                    )
+        return predictions
 
     def evaluation_summary(
         self,

From 548a96daf3e12fd026e7bd34ecafad2ca367a485 Mon Sep 17 00:00:00 2001
From: Oleksandr Shchur <shchuro@amazon.com>
Date: Fri, 15 May 2026 14:07:13 +0000
Subject: [PATCH 3/9] Use pyarrow for validation and fix combine_univariate to
 materialize

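- Replace .cast() + numpy finite check with pyarrow-native validation:
  check column names, list lengths via pc.list_value_length, and
  pc.is_finite on flattened arrays
- Fix combine_univariate_predictions_to_multivariate to use table.take()
  instead of Dataset.select() to avoid lazy _indices views
- Switch the metric array layout from [N, D, H] to [N, H, D] (q_pred from
  [N, D, H, Q] to [N, H, D, Q]) so compute_metrics reshapes directly
  without a transpose; per-dim reductions now run over axes (0, 1)
- evaluation_summary runtime: 18.6s -> 0.5s on 35-dim task with 10 windows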
---
 src/fev/metrics.py | 75 +++++++++++++++++++++++-----------------------
 src/fev/task.py    | 21 ++++++-------
 2 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/src/fev/metrics.py b/src/fev/metrics.py
index ae771f6..b07dc46 100644
--- a/src/fev/metrics.py
+++ b/src/fev/metrics.py
@@ -31,25 +31,25 @@ def compute(
         seasonality: int,
         quantile_levels: list[float],
     ) -> float:
-        """Compute the metric score.
+        """Compute the metric score. Computed per target dim, then averaged across dims.
 
         Parameters
         ----------
-        y_true : np.ndarray [N, D, H]
-            Ground truth values (N items, D target dims, H horizon steps).
-        y_pred : np.ndarray [N, D, H]
-            Point forecast predictions.
+        y_true : np.ndarray [N, H, D]
+            Ground truth. N=number of time series, H=forecast horizon, D=target dimensions.
+        y_pred : np.ndarray [N, H, D]
+            Point forecast predictions, same shape as y_true.
         y_past : np.ndarray [total_T, D]
-            Concatenated past observations for all items. Use y_past_indptr to
-            slice per item: item i has y_past[indptr[i]:indptr[i+1], :].
+            Concatenated historical observations for all items (ragged time axis).
+            Item i spans rows y_past[y_past_indptr[i] : y_past_indptr[i+1]].
         y_past_indptr : np.ndarray [N+1]
-            CSR-style index pointer into y_past.
-        q_pred : np.ndarray [N, D, H, Q]
-            Quantile predictions. Empty (Q=0) if no quantiles were requested.
+            Row boundaries into y_past (CSR-style). Length N+1, starts at 0.
+        q_pred : np.ndarray [N, H, D, Q]
+            Quantile predictions. Q=len(quantile_levels), or Q=0 if none requested.
         seasonality : int
-            Seasonal period used for scaled error metrics.
+            Seasonal period for scaled error metrics (MASE, RMSSE, SQL).
         quantile_levels : list[float]
-            Quantile levels corresponding to q_pred's last axis.
+            Quantile levels in (0, 1) corresponding to q_pred's last axis.
         """
         raise NotImplementedError
 
@@ -86,7 +86,7 @@ def compute(
         seasonality: int,
         quantile_levels: list[float],
     ) -> float:
-        per_dim = np.nanmean(np.abs(y_true - y_pred), axis=(0, 2))  # [D]
+        per_dim = np.nanmean(np.abs(y_true - y_pred), axis=(0, 1))  # [D]
         return float(np.mean(per_dim))
 
 
@@ -107,8 +107,8 @@ def compute(
         seasonality: int,
         quantile_levels: list[float],
     ) -> float:
-        abs_err_per_dim = np.nanmean(np.abs(y_true - y_pred), axis=(0, 2))  # [D]
-        abs_true_per_dim = np.nanmean(np.abs(y_true), axis=(0, 2))  # [D]
+        abs_err_per_dim = np.nanmean(np.abs(y_true - y_pred), axis=(0, 1))  # [D]
+        abs_true_per_dim = np.nanmean(np.abs(y_true), axis=(0, 1))  # [D]
         per_dim = abs_err_per_dim / np.maximum(abs_true_per_dim, self.epsilon)
         return float(np.mean(per_dim))
 
@@ -137,9 +137,8 @@ def compute(
     ) -> float:
         seasonal_error = _abs_seasonal_error(y_past=y_past, indptr=y_past_indptr, seasonality=seasonality)  # [N, D]
         seasonal_error = np.clip(seasonal_error, self.epsilon, None)
-        # Per-dim MASE: safemean over [N, H] for each dim d, then average across D
-        scaled = np.abs(y_true - y_pred) / seasonal_error[:, :, None]  # [N, D, H]
-        per_dim = np.array([self._safemean(scaled[:, d, :]) for d in range(y_true.shape[1])])
+        scaled = np.abs(y_true - y_pred) / seasonal_error[:, None, :]  # [N, H, D]
+        per_dim = np.array([self._safemean(scaled[:, :, d]) for d in range(y_true.shape[2])])
         return float(np.mean(per_dim))
 
 
@@ -157,7 +156,7 @@ def compute(
         seasonality: int,
         quantile_levels: list[float],
     ) -> float:
-        per_dim = np.sqrt(np.nanmean((y_true - y_pred) ** 2, axis=(0, 2)))  # [D]
+        per_dim = np.sqrt(np.nanmean((y_true - y_pred) ** 2, axis=(0, 1)))  # [D]
         return float(np.mean(per_dim))
 
 
@@ -185,8 +184,8 @@ def compute(
     ) -> float:
         seasonal_error = _squared_seasonal_error(y_past=y_past, indptr=y_past_indptr, seasonality=seasonality)  # [N, D]
         seasonal_error = np.clip(seasonal_error, self.epsilon, None)
-        scaled = (y_true - y_pred) ** 2 / seasonal_error[:, :, None]  # [N, D, H]
-        per_dim = np.array([np.sqrt(self._safemean(scaled[:, d, :])) for d in range(y_true.shape[1])])
+        scaled = (y_true - y_pred) ** 2 / seasonal_error[:, None, :]  # [N, H, D]
+        per_dim = np.array([np.sqrt(self._safemean(scaled[:, :, d])) for d in range(y_true.shape[2])])
         return float(np.mean(per_dim))
 
 
@@ -204,7 +203,7 @@ def compute(
         seasonality: int,
         quantile_levels: list[float],
     ) -> float:
-        per_dim = np.nanmean((y_true - y_pred) ** 2, axis=(0, 2))  # [D]
+        per_dim = np.nanmean((y_true - y_pred) ** 2, axis=(0, 1))  # [D]
         return float(np.mean(per_dim))
 
 
@@ -222,7 +221,7 @@ def compute(
         seasonality: int,
         quantile_levels: list[float],
     ) -> float:
-        per_dim = np.sqrt(np.nanmean((np.log1p(y_true) - np.log1p(y_pred)) ** 2, axis=(0, 2)))  # [D]
+        per_dim = np.sqrt(np.nanmean((np.log1p(y_true) - np.log1p(y_pred)) ** 2, axis=(0, 1)))  # [D]
         return float(np.mean(per_dim))
 
 
@@ -240,8 +239,8 @@ def compute(
         seasonality: int,
         quantile_levels: list[float],
     ) -> float:
-        ratio = np.abs(y_true - y_pred) / np.abs(y_true)  # [N, D, H]
-        per_dim = np.array([self._safemean(ratio[:, d, :]) for d in range(y_true.shape[1])])
+        ratio = np.abs(y_true - y_pred) / np.abs(y_true)  # [N, H, D]
+        per_dim = np.array([self._safemean(ratio[:, :, d]) for d in range(y_true.shape[2])])
         return float(np.mean(per_dim))
 
 
@@ -259,8 +258,8 @@ def compute(
         seasonality: int,
         quantile_levels: list[float],
     ) -> float:
-        val = 2 * np.abs(y_true - y_pred) / (np.abs(y_true) + np.abs(y_pred))  # [N, D, H]
-        per_dim = np.array([self._safemean(val[:, d, :]) for d in range(y_true.shape[1])])
+        val = 2 * np.abs(y_true - y_pred) / (np.abs(y_true) + np.abs(y_pred))  # [N, H, D]
+        per_dim = np.array([self._safemean(val[:, :, d]) for d in range(y_true.shape[2])])
         return float(np.mean(per_dim))
 
 
@@ -282,8 +281,8 @@ def compute(
     ) -> float:
         if len(quantile_levels) == 0:
             raise ValueError(f"{self.__class__.__name__} cannot be computed without quantile_levels")
-        ql = _quantile_loss(y_true=y_true, q_pred=q_pred, quantile_levels=quantile_levels)  # [N, D, H, Q]
-        per_dim = np.nanmean(ql, axis=(0, 2, 3))  # [D]
+        ql = _quantile_loss(y_true=y_true, q_pred=q_pred, quantile_levels=quantile_levels)  # [N, H, D, Q]
+        per_dim = np.nanmean(ql, axis=(0, 1, 3))  # [D]
         return float(np.mean(per_dim))
 
 
@@ -311,12 +310,12 @@ def compute(
         seasonality: int,
         quantile_levels: list[float],
     ) -> float:
-        ql = _quantile_loss(y_true=y_true, q_pred=q_pred, quantile_levels=quantile_levels)  # [N, D, H, Q]
-        ql_avg_q = np.nanmean(ql, axis=3)  # [N, D, H] — average over quantiles
+        ql = _quantile_loss(y_true=y_true, q_pred=q_pred, quantile_levels=quantile_levels)  # [N, H, D, Q]
+        ql_avg_q = np.nanmean(ql, axis=3)  # [N, H, D]
         seasonal_error = _abs_seasonal_error(y_past=y_past, indptr=y_past_indptr, seasonality=seasonality)  # [N, D]
         seasonal_error = np.clip(seasonal_error, self.epsilon, None)
-        scaled = ql_avg_q / seasonal_error[:, :, None]  # [N, D, H]
-        per_dim = np.array([self._safemean(scaled[:, d, :]) for d in range(y_true.shape[1])])
+        scaled = ql_avg_q / seasonal_error[:, None, :]  # [N, H, D]
+        per_dim = np.array([self._safemean(scaled[:, :, d]) for d in range(y_true.shape[2])])
         return float(np.mean(per_dim))
 
 
@@ -339,9 +338,9 @@ def compute(
         seasonality: int,
         quantile_levels: list[float],
     ) -> float:
-        ql = _quantile_loss(y_true=y_true, q_pred=q_pred, quantile_levels=quantile_levels)  # [N, D, H, Q]
-        ql_per_dim = np.nanmean(ql, axis=(0, 2, 3))  # [D]
-        abs_true_per_dim = np.nanmean(np.abs(y_true), axis=(0, 2))  # [D]
+        ql = _quantile_loss(y_true=y_true, q_pred=q_pred, quantile_levels=quantile_levels)  # [N, H, D, Q]
+        ql_per_dim = np.nanmean(ql, axis=(0, 1, 3))  # [D]
+        abs_true_per_dim = np.nanmean(np.abs(y_true), axis=(0, 1))  # [D]
         per_dim = ql_per_dim / np.maximum(abs_true_per_dim, self.epsilon)
         return float(np.mean(per_dim))
 
@@ -356,9 +355,9 @@ def _quantile_loss(
 
     Returns
     -------
-    np.ndarray [N, D, H, Q]
+    np.ndarray [N, H, D, Q]
     """
-    y_true_expanded = y_true[..., None]  # [N, D, H, 1]
+    y_true_expanded = y_true[..., None]  # [N, H, D, 1]
     q_arr = np.array(quantile_levels)  # [Q]
     return 2 * np.abs((y_true_expanded - q_pred) * ((y_true_expanded <= q_pred) - q_arr))
 
diff --git a/src/fev/task.py b/src/fev/task.py
index 1c7810d..763cf15 100644
--- a/src/fev/task.py
+++ b/src/fev/task.py
@@ -150,30 +150,31 @@ def compute_metrics(
         H = self.horizon
         Q = len(quantile_levels)
 
-        # y_true [N, D, H] — via pyarrow for fast column access
+        # y_true [N, H, D] — via pyarrow for fast column access
         test_table = test_data.data.table
         y_true = np.stack(
             [pc.list_flatten(test_table.column(col)).to_numpy(zero_copy_only=False) for col in self.target_columns],
             axis=1,
             dtype=np.float64,
-        ).reshape(N, H, D).transpose(0, 2, 1)
+        ).reshape(N, H, D)
 
-        # y_pred [N, D, H]
+        # y_pred [N, H, D]
         pred_arrs = []
+        pred_tables = {}
         for col in self.target_columns:
-            pred_arrs.append(pc.list_flatten(predictions[col].data.table.column(PREDICTIONS)).to_numpy(zero_copy_only=False))
-        y_pred = np.stack(pred_arrs, axis=1, dtype=np.float64).reshape(N, H, D).transpose(0, 2, 1)
+            pred_tables[col] = predictions[col].data.table
+            pred_arrs.append(pc.list_flatten(pred_tables[col].column(PREDICTIONS)).to_numpy(zero_copy_only=False))
+        y_pred = np.stack(pred_arrs, axis=1, dtype=np.float64).reshape(N, H, D)
 
-        # q_pred [N, D, H, Q]
+        # q_pred [N, H, D, Q]
         if Q > 0:
             q_arrs = []
             for col in self.target_columns:
-                col_table = predictions[col].data.table
                 for q in quantile_levels:
-                    q_arrs.append(pc.list_flatten(col_table.column(str(q))).to_numpy(zero_copy_only=False))
-            q_pred = np.stack(q_arrs, axis=1, dtype=np.float64).reshape(N, H, D, Q).transpose(0, 2, 1, 3)
+                    q_arrs.append(pc.list_flatten(pred_tables[col].column(str(q))).to_numpy(zero_copy_only=False))
+            q_pred = np.stack(q_arrs, axis=1, dtype=np.float64).reshape(N, H, D, Q)
         else:
-            q_pred = np.empty((N, D, H, 0), dtype=np.float64)
+            q_pred = np.empty((N, H, D, 0), dtype=np.float64)
 
         # y_past [total_T, D] + indptr [N+1] — CSR-style ragged layout
         past_table = past_data.data.table

From 59bf2e3cfea29389d2c884cf6d671be7a9ced057 Mon Sep 17 00:00:00 2001
From: Oleksandr Shchur <shchuro@amazon.com>
Date: Fri, 15 May 2026 19:27:33 +0000
Subject: [PATCH 4/9] Make _safemean accept axis arg, remove per-dim loops

_safemean(arr, axis=(0, 1)) replaces the manual
[self._safemean(arr[:, :, d]) for d in range(D)] pattern.
---
 src/fev/metrics.py | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/src/fev/metrics.py b/src/fev/metrics.py
index b07dc46..c2d868e 100644
--- a/src/fev/metrics.py
+++ b/src/fev/metrics.py
@@ -16,9 +16,12 @@ def name(self) -> str:
         return self.__class__.__name__
 
     @staticmethod
-    def _safemean(arr: np.ndarray) -> float:
-        """Compute mean of an array, ignoring NaN, Inf, and -Inf values."""
-        return float(np.mean(arr[np.isfinite(arr)]))
+    def _safemean(arr: np.ndarray, axis=None) -> float | np.ndarray:
+        """Compute mean ignoring NaN, Inf, and -Inf values."""
+        mask = ~np.isfinite(arr)
+        if mask.any():
+            arr = np.where(mask, np.nan, arr)
+        return np.nanmean(arr, axis=axis)
 
     def compute(
         self,
@@ -138,8 +141,7 @@ def compute(
         seasonal_error = _abs_seasonal_error(y_past=y_past, indptr=y_past_indptr, seasonality=seasonality)  # [N, D]
         seasonal_error = np.clip(seasonal_error, self.epsilon, None)
         scaled = np.abs(y_true - y_pred) / seasonal_error[:, None, :]  # [N, H, D]
-        per_dim = np.array([self._safemean(scaled[:, :, d]) for d in range(y_true.shape[2])])
-        return float(np.mean(per_dim))
+        return float(np.mean(self._safemean(scaled, axis=(0, 1))))
 
 
 class RMSE(Metric):
@@ -185,8 +187,7 @@ def compute(
         seasonal_error = _squared_seasonal_error(y_past=y_past, indptr=y_past_indptr, seasonality=seasonality)  # [N, D]
         seasonal_error = np.clip(seasonal_error, self.epsilon, None)
         scaled = (y_true - y_pred) ** 2 / seasonal_error[:, None, :]  # [N, H, D]
-        per_dim = np.array([np.sqrt(self._safemean(scaled[:, :, d])) for d in range(y_true.shape[2])])
-        return float(np.mean(per_dim))
+        return float(np.mean(np.sqrt(self._safemean(scaled, axis=(0, 1)))))
 
 
 class MSE(Metric):
@@ -240,8 +241,7 @@ def compute(
         quantile_levels: list[float],
     ) -> float:
         ratio = np.abs(y_true - y_pred) / np.abs(y_true)  # [N, H, D]
-        per_dim = np.array([self._safemean(ratio[:, :, d]) for d in range(y_true.shape[2])])
-        return float(np.mean(per_dim))
+        return float(np.mean(self._safemean(ratio, axis=(0, 1))))
 
 
 class SMAPE(Metric):
@@ -259,8 +259,7 @@ def compute(
         quantile_levels: list[float],
     ) -> float:
         val = 2 * np.abs(y_true - y_pred) / (np.abs(y_true) + np.abs(y_pred))  # [N, H, D]
-        per_dim = np.array([self._safemean(val[:, :, d]) for d in range(y_true.shape[2])])
-        return float(np.mean(per_dim))
+        return float(np.mean(self._safemean(val, axis=(0, 1))))
 
 
 class MQL(Metric):
@@ -315,8 +314,7 @@ def compute(
         seasonal_error = _abs_seasonal_error(y_past=y_past, indptr=y_past_indptr, seasonality=seasonality)  # [N, D]
         seasonal_error = np.clip(seasonal_error, self.epsilon, None)
         scaled = ql_avg_q / seasonal_error[:, None, :]  # [N, H, D]
-        per_dim = np.array([self._safemean(scaled[:, :, d]) for d in range(y_true.shape[2])])
-        return float(np.mean(per_dim))
+        return float(np.mean(self._safemean(scaled, axis=(0, 1))))
 
 
 class WQL(Metric):

From 546cb5ee5f888ad43db9e0d21419a58948396353 Mon Sep 17 00:00:00 2001
From: Oleksandr Shchur <shchuro@amazon.com>
Date: Fri, 15 May 2026 19:33:14 +0000
Subject: [PATCH 5/9] Replace y_past_indptr with y_past_lengths, rename to
 _seasonal_error_per_item
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Passing lengths [N] is simpler than CSR-style indptr [N+1] — offsets
are only needed inside _seasonal_error_per_item and are cheap to
reconstruct. Rename back to _per_item to clarify return shape [N, D].
---
 src/fev/metrics.py   | 61 ++++++++++++++++++++++----------------------
 src/fev/task.py      |  7 +++--
 test/test_metrics.py | 18 ++++++-------
 3 files changed, 42 insertions(+), 44 deletions(-)

diff --git a/src/fev/metrics.py b/src/fev/metrics.py
index c2d868e..d3718c5 100644
--- a/src/fev/metrics.py
+++ b/src/fev/metrics.py
@@ -29,7 +29,7 @@ def compute(
         y_true: np.ndarray,
         y_pred: np.ndarray,
         y_past: np.ndarray,
-        y_past_indptr: np.ndarray,
+        y_past_lengths: np.ndarray,
         q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
@@ -44,9 +44,8 @@ def compute(
             Point forecast predictions, same shape as y_true.
         y_past : np.ndarray [total_T, D]
             Concatenated historical observations for all items (ragged time axis).
-            Item i spans rows y_past[y_past_indptr[i] : y_past_indptr[i+1]].
-        y_past_indptr : np.ndarray [N+1]
-            Row boundaries into y_past (CSR-style). Length N+1, starts at 0.
+        y_past_lengths : np.ndarray [N]
+            Number of past observations per item. sum(y_past_lengths) == total_T.
         q_pred : np.ndarray [N, H, D, Q]
             Quantile predictions. Q=len(quantile_levels), or Q=0 if none requested.
         seasonality : int
@@ -84,7 +83,7 @@ def compute(
         y_true: np.ndarray,
         y_pred: np.ndarray,
         y_past: np.ndarray,
-        y_past_indptr: np.ndarray,
+        y_past_lengths: np.ndarray,
         q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
@@ -105,7 +104,7 @@ def compute(
         y_true: np.ndarray,
         y_pred: np.ndarray,
         y_past: np.ndarray,
-        y_past_indptr: np.ndarray,
+        y_past_lengths: np.ndarray,
         q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
@@ -133,12 +132,12 @@ def compute(
         y_true: np.ndarray,
         y_pred: np.ndarray,
         y_past: np.ndarray,
-        y_past_indptr: np.ndarray,
+        y_past_lengths: np.ndarray,
         q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
     ) -> float:
-        seasonal_error = _abs_seasonal_error(y_past=y_past, indptr=y_past_indptr, seasonality=seasonality)  # [N, D]
+        seasonal_error = _abs_seasonal_error_per_item(y_past=y_past, lengths=y_past_lengths, seasonality=seasonality)  # [N, D]
         seasonal_error = np.clip(seasonal_error, self.epsilon, None)
         scaled = np.abs(y_true - y_pred) / seasonal_error[:, None, :]  # [N, H, D]
         return float(np.mean(self._safemean(scaled, axis=(0, 1))))
@@ -153,7 +152,7 @@ def compute(
         y_true: np.ndarray,
         y_pred: np.ndarray,
         y_past: np.ndarray,
-        y_past_indptr: np.ndarray,
+        y_past_lengths: np.ndarray,
         q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
@@ -179,12 +178,12 @@ def compute(
         y_true: np.ndarray,
         y_pred: np.ndarray,
         y_past: np.ndarray,
-        y_past_indptr: np.ndarray,
+        y_past_lengths: np.ndarray,
         q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
     ) -> float:
-        seasonal_error = _squared_seasonal_error(y_past=y_past, indptr=y_past_indptr, seasonality=seasonality)  # [N, D]
+        seasonal_error = _squared_seasonal_error_per_item(y_past=y_past, lengths=y_past_lengths, seasonality=seasonality)  # [N, D]
         seasonal_error = np.clip(seasonal_error, self.epsilon, None)
         scaled = (y_true - y_pred) ** 2 / seasonal_error[:, None, :]  # [N, H, D]
         return float(np.mean(np.sqrt(self._safemean(scaled, axis=(0, 1)))))
@@ -199,7 +198,7 @@ def compute(
         y_true: np.ndarray,
         y_pred: np.ndarray,
         y_past: np.ndarray,
-        y_past_indptr: np.ndarray,
+        y_past_lengths: np.ndarray,
         q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
@@ -217,7 +216,7 @@ def compute(
         y_true: np.ndarray,
         y_pred: np.ndarray,
         y_past: np.ndarray,
-        y_past_indptr: np.ndarray,
+        y_past_lengths: np.ndarray,
         q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
@@ -235,7 +234,7 @@ def compute(
         y_true: np.ndarray,
         y_pred: np.ndarray,
         y_past: np.ndarray,
-        y_past_indptr: np.ndarray,
+        y_past_lengths: np.ndarray,
         q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
@@ -253,7 +252,7 @@ def compute(
         y_true: np.ndarray,
         y_pred: np.ndarray,
         y_past: np.ndarray,
-        y_past_indptr: np.ndarray,
+        y_past_lengths: np.ndarray,
         q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
@@ -273,7 +272,7 @@ def compute(
         y_true: np.ndarray,
         y_pred: np.ndarray,
         y_past: np.ndarray,
-        y_past_indptr: np.ndarray,
+        y_past_lengths: np.ndarray,
         q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
@@ -304,14 +303,14 @@ def compute(
         y_true: np.ndarray,
         y_pred: np.ndarray,
         y_past: np.ndarray,
-        y_past_indptr: np.ndarray,
+        y_past_lengths: np.ndarray,
         q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
     ) -> float:
         ql = _quantile_loss(y_true=y_true, q_pred=q_pred, quantile_levels=quantile_levels)  # [N, H, D, Q]
         ql_avg_q = np.nanmean(ql, axis=3)  # [N, H, D]
-        seasonal_error = _abs_seasonal_error(y_past=y_past, indptr=y_past_indptr, seasonality=seasonality)  # [N, D]
+        seasonal_error = _abs_seasonal_error_per_item(y_past=y_past, lengths=y_past_lengths, seasonality=seasonality)  # [N, D]
         seasonal_error = np.clip(seasonal_error, self.epsilon, None)
         scaled = ql_avg_q / seasonal_error[:, None, :]  # [N, H, D]
         return float(np.mean(self._safemean(scaled, axis=(0, 1))))
@@ -331,7 +330,7 @@ def compute(
         y_true: np.ndarray,
         y_pred: np.ndarray,
         y_past: np.ndarray,
-        y_past_indptr: np.ndarray,
+        y_past_lengths: np.ndarray,
         q_pred: np.ndarray,
         seasonality: int,
         quantile_levels: list[float],
@@ -360,10 +359,10 @@ def _quantile_loss(
     return 2 * np.abs((y_true_expanded - q_pred) * ((y_true_expanded <= q_pred) - q_arr))
 
 
-def _seasonal_error(
+def _seasonal_error_per_item(
     *,
     y_past: np.ndarray,
-    indptr: np.ndarray,
+    lengths: np.ndarray,
     seasonality: int,
     aggregate_fn: Callable,
 ) -> np.ndarray:
@@ -373,8 +372,8 @@ def _seasonal_error(
     ----------
     y_past : np.ndarray [total_T, D]
         Concatenated past observations.
-    indptr : np.ndarray [N+1]
-        CSR-style index pointer. Item i has y_past[indptr[i]:indptr[i+1], :].
+    lengths : np.ndarray [N]
+        Number of observations per item.
     seasonality : int
         Seasonal period.
     aggregate_fn : Callable
@@ -384,13 +383,12 @@ def _seasonal_error(
     -------
     np.ndarray [N, D]
     """
-    num_series = len(indptr) - 1
+    num_series = len(lengths)
     num_dims = y_past.shape[1]
 
     if num_series == 0:
         return np.array([], dtype="float64").reshape(0, 0)
 
-    lengths = np.diff(indptr)
     num_diffs_per_series = np.maximum(lengths - seasonality, 0)
 
     if num_diffs_per_series.sum() == 0:
@@ -402,7 +400,10 @@ def _seasonal_error(
         np.cumsum(num_diffs_per_series) - num_diffs_per_series, num_diffs_per_series
     )
 
-    idx_current = indptr[series_ids] + seasonality + diff_offsets
+    offsets = np.empty(num_series + 1, dtype=np.int64)
+    offsets[0] = 0
+    np.cumsum(lengths, out=offsets[1:])
+    idx_current = offsets[series_ids] + seasonality + diff_offsets
     idx_lagged = idx_current - seasonality
 
     diffs = y_past[idx_current] - y_past[idx_lagged]  # [total_diffs, D]
@@ -419,14 +420,14 @@ def _seasonal_error(
     return result
 
 
-def _abs_seasonal_error(*, y_past: np.ndarray, indptr: np.ndarray, seasonality: int) -> np.ndarray:
+def _abs_seasonal_error_per_item(*, y_past: np.ndarray, lengths: np.ndarray, seasonality: int) -> np.ndarray:
     """Compute mean absolute seasonal error. Returns [N, D]."""
-    return _seasonal_error(y_past=y_past, indptr=indptr, seasonality=seasonality, aggregate_fn=np.abs)
+    return _seasonal_error_per_item(y_past=y_past, lengths=lengths, seasonality=seasonality, aggregate_fn=np.abs)
 
 
-def _squared_seasonal_error(*, y_past: np.ndarray, indptr: np.ndarray, seasonality: int) -> np.ndarray:
+def _squared_seasonal_error_per_item(*, y_past: np.ndarray, lengths: np.ndarray, seasonality: int) -> np.ndarray:
     """Compute mean squared seasonal error. Returns [N, D]."""
-    return _seasonal_error(y_past=y_past, indptr=indptr, seasonality=seasonality, aggregate_fn=np.square)
+    return _seasonal_error_per_item(y_past=y_past, lengths=lengths, seasonality=seasonality, aggregate_fn=np.square)
 
 
 AVAILABLE_METRICS: dict[str, Type[Metric]] = {
diff --git a/src/fev/task.py b/src/fev/task.py
index 763cf15..116fc13 100644
--- a/src/fev/task.py
+++ b/src/fev/task.py
@@ -176,15 +176,14 @@ def compute_metrics(
         else:
             q_pred = np.empty((N, H, D, 0), dtype=np.float64)
 
-        # y_past [total_T, D] + indptr [N+1] — CSR-style ragged layout
+        # y_past [total_T, D] + lengths [N]
         past_table = past_data.data.table
         y_past_flat = np.stack(
             [pc.list_flatten(past_table.column(col)).to_numpy(zero_copy_only=False) for col in self.target_columns],
             axis=1,
             dtype=np.float64,
         )
-        indptr = np.zeros(N + 1, dtype=np.int64)
-        np.cumsum(pc.list_value_length(past_table.column(self.target_columns[0])).to_numpy(), out=indptr[1:])
+        y_past_lengths = pc.list_value_length(past_table.column(self.target_columns[0])).to_numpy()
 
         test_scores: dict[str, float] = {}
         with warnings.catch_warnings():
@@ -194,7 +193,7 @@ def compute_metrics(
                     y_true=y_true,
                     y_pred=y_pred,
                     y_past=y_past_flat,
-                    y_past_indptr=indptr,
+                    y_past_lengths=y_past_lengths,
                     q_pred=q_pred,
                     seasonality=seasonality,
                     quantile_levels=quantile_levels,
diff --git a/test/test_metrics.py b/test/test_metrics.py
index 58714db..605e75e 100644
--- a/test/test_metrics.py
+++ b/test/test_metrics.py
@@ -5,7 +5,7 @@
 from autogluon.timeseries import TimeSeriesPredictor
 
 import fev
-from fev.metrics import AVAILABLE_METRICS, _seasonal_error
+from fev.metrics import AVAILABLE_METRICS, _seasonal_error_per_item
 
 
 # Include datasets with NaN values (nn5) and all-zero history values (covid deaths)
@@ -75,13 +75,11 @@ def _reference_seasonal_error_per_item(arrays, seasonality, aggregate_fn):
     return np.array(result, dtype="float64")
 
 
-def _arrays_to_indptr(arrays):
-    """Helper to convert list of 1D arrays to flat [total_T, 1] + indptr."""
+def _arrays_to_flat(arrays):
+    """Helper to convert list of 1D arrays to flat [total_T, 1] + lengths [N]."""
     lengths = np.array([len(a) for a in arrays], dtype=np.int64)
-    indptr = np.zeros(len(arrays) + 1, dtype=np.int64)
-    np.cumsum(lengths, out=indptr[1:])
     flat = np.concatenate(arrays).astype(np.float64).reshape(-1, 1) if arrays else np.empty((0, 1), dtype=np.float64)
-    return flat, indptr
+    return flat, lengths
 
 
 @pytest.mark.parametrize("aggregate_fn", [np.abs, np.square])
@@ -97,8 +95,8 @@ def test_seasonal_error_per_item(aggregate_fn):
     ]
     seasonality = 2
 
-    flat, indptr = _arrays_to_indptr(arrays)
-    result = _seasonal_error(y_past=flat, indptr=indptr, seasonality=seasonality, aggregate_fn=aggregate_fn)[:, 0]
+    flat, lengths = _arrays_to_flat(arrays)
+    result = _seasonal_error_per_item(y_past=flat, lengths=lengths, seasonality=seasonality, aggregate_fn=aggregate_fn)[:, 0]
     expected = _reference_seasonal_error_per_item(arrays, seasonality, aggregate_fn)
 
     np.testing.assert_allclose(result, expected)
@@ -106,7 +104,7 @@ def test_seasonal_error_per_item(aggregate_fn):
 
 def test_seasonal_error_per_item_empty():
     """Test with empty input."""
-    flat, indptr = _arrays_to_indptr([])
-    result = _seasonal_error(y_past=flat, indptr=indptr, seasonality=2, aggregate_fn=np.abs)
+    flat, lengths = _arrays_to_flat([])
+    result = _seasonal_error_per_item(y_past=flat, lengths=lengths, seasonality=2, aggregate_fn=np.abs)
     assert result.size == 0
     assert result.dtype == np.float64

From 9ecb4a10e38f3d5021882b1758abc1071ec19364 Mon Sep 17 00:00:00 2001
From: Oleksandr Shchur <shchuro@amazon.com>
Date: Fri, 15 May 2026 19:36:14 +0000
Subject: [PATCH 6/9] Fix format

---
 src/fev/metrics.py   | 12 +++++++++---
 test/test_metrics.py |  4 +++-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/fev/metrics.py b/src/fev/metrics.py
index d3718c5..dfafea7 100644
--- a/src/fev/metrics.py
+++ b/src/fev/metrics.py
@@ -137,7 +137,9 @@ def compute(
         seasonality: int,
         quantile_levels: list[float],
     ) -> float:
-        seasonal_error = _abs_seasonal_error_per_item(y_past=y_past, lengths=y_past_lengths, seasonality=seasonality)  # [N, D]
+        seasonal_error = _abs_seasonal_error_per_item(
+            y_past=y_past, lengths=y_past_lengths, seasonality=seasonality
+        )  # [N, D]
         seasonal_error = np.clip(seasonal_error, self.epsilon, None)
         scaled = np.abs(y_true - y_pred) / seasonal_error[:, None, :]  # [N, H, D]
         return float(np.mean(self._safemean(scaled, axis=(0, 1))))
@@ -183,7 +185,9 @@ def compute(
         seasonality: int,
         quantile_levels: list[float],
     ) -> float:
-        seasonal_error = _squared_seasonal_error_per_item(y_past=y_past, lengths=y_past_lengths, seasonality=seasonality)  # [N, D]
+        seasonal_error = _squared_seasonal_error_per_item(
+            y_past=y_past, lengths=y_past_lengths, seasonality=seasonality
+        )  # [N, D]
         seasonal_error = np.clip(seasonal_error, self.epsilon, None)
         scaled = (y_true - y_pred) ** 2 / seasonal_error[:, None, :]  # [N, H, D]
         return float(np.mean(np.sqrt(self._safemean(scaled, axis=(0, 1)))))
@@ -310,7 +314,9 @@ def compute(
     ) -> float:
         ql = _quantile_loss(y_true=y_true, q_pred=q_pred, quantile_levels=quantile_levels)  # [N, H, D, Q]
         ql_avg_q = np.nanmean(ql, axis=3)  # [N, H, D]
-        seasonal_error = _abs_seasonal_error_per_item(y_past=y_past, lengths=y_past_lengths, seasonality=seasonality)  # [N, D]
+        seasonal_error = _abs_seasonal_error_per_item(
+            y_past=y_past, lengths=y_past_lengths, seasonality=seasonality
+        )  # [N, D]
         seasonal_error = np.clip(seasonal_error, self.epsilon, None)
         scaled = ql_avg_q / seasonal_error[:, None, :]  # [N, H, D]
         return float(np.mean(self._safemean(scaled, axis=(0, 1))))
diff --git a/test/test_metrics.py b/test/test_metrics.py
index 605e75e..26d36e8 100644
--- a/test/test_metrics.py
+++ b/test/test_metrics.py
@@ -96,7 +96,9 @@ def test_seasonal_error_per_item(aggregate_fn):
     seasonality = 2
 
     flat, lengths = _arrays_to_flat(arrays)
-    result = _seasonal_error_per_item(y_past=flat, lengths=lengths, seasonality=seasonality, aggregate_fn=aggregate_fn)[:, 0]
+    result = _seasonal_error_per_item(
+        y_past=flat, lengths=lengths, seasonality=seasonality, aggregate_fn=aggregate_fn
+    )[:, 0]
     expected = _reference_seasonal_error_per_item(arrays, seasonality, aggregate_fn)
 
     np.testing.assert_allclose(result, expected)

From 4efd21ed96cb3fb3a3b42af41b74ceb60a5683c1 Mon Sep 17 00:00:00 2001
From: Oleksandr Shchur <shchuro@amazon.com>
Date: Fri, 15 May 2026 20:02:33 +0000
Subject: [PATCH 7/9] Fast path for seasonal error computation

---
 src/fev/metrics.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/fev/metrics.py b/src/fev/metrics.py
index dfafea7..649b9d3 100644
--- a/src/fev/metrics.py
+++ b/src/fev/metrics.py
@@ -400,6 +400,13 @@ def _seasonal_error_per_item(
     if num_diffs_per_series.sum() == 0:
         return np.full((num_series, num_dims), np.nan, dtype="float64")
 
+    # Fast path: all items have equal length — reshape + slice instead of fancy indexing
+    if np.all(lengths == lengths[0]):
+        T = int(lengths[0])
+        y_reshaped = y_past.reshape(num_series, T, num_dims)
+        diffs = y_reshaped[:, seasonality:, :] - y_reshaped[:, :-seasonality, :]
+        return np.nanmean(aggregate_fn(diffs), axis=1)
+
     total_diffs = int(num_diffs_per_series.sum())
     series_ids = np.repeat(np.arange(num_series, dtype=np.int64), num_diffs_per_series)
     diff_offsets = np.arange(total_diffs) - np.repeat(

From 28fd98d9af31317a315419cfb1330bfde60db3b1 Mon Sep 17 00:00:00 2001
From: Oleksandr Shchur <shchuro@amazon.com>
Date: Mon, 18 May 2026 15:14:21 +0000
Subject: [PATCH 8/9] Rename lengths->y_past_lengths

---
 src/fev/metrics.py   | 28 ++++++++++++++--------------
 test/test_metrics.py |  4 ++--
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/fev/metrics.py b/src/fev/metrics.py
index 649b9d3..b2685fd 100644
--- a/src/fev/metrics.py
+++ b/src/fev/metrics.py
@@ -138,7 +138,7 @@ def compute(
         quantile_levels: list[float],
     ) -> float:
         seasonal_error = _abs_seasonal_error_per_item(
-            y_past=y_past, lengths=y_past_lengths, seasonality=seasonality
+            y_past=y_past, y_past_lengths=y_past_lengths, seasonality=seasonality
         )  # [N, D]
         seasonal_error = np.clip(seasonal_error, self.epsilon, None)
         scaled = np.abs(y_true - y_pred) / seasonal_error[:, None, :]  # [N, H, D]
@@ -186,7 +186,7 @@ def compute(
         quantile_levels: list[float],
     ) -> float:
         seasonal_error = _squared_seasonal_error_per_item(
-            y_past=y_past, lengths=y_past_lengths, seasonality=seasonality
+            y_past=y_past, y_past_lengths=y_past_lengths, seasonality=seasonality
         )  # [N, D]
         seasonal_error = np.clip(seasonal_error, self.epsilon, None)
         scaled = (y_true - y_pred) ** 2 / seasonal_error[:, None, :]  # [N, H, D]
@@ -315,7 +315,7 @@ def compute(
         ql = _quantile_loss(y_true=y_true, q_pred=q_pred, quantile_levels=quantile_levels)  # [N, H, D, Q]
         ql_avg_q = np.nanmean(ql, axis=3)  # [N, H, D]
         seasonal_error = _abs_seasonal_error_per_item(
-            y_past=y_past, lengths=y_past_lengths, seasonality=seasonality
+            y_past=y_past, y_past_lengths=y_past_lengths, seasonality=seasonality
         )  # [N, D]
         seasonal_error = np.clip(seasonal_error, self.epsilon, None)
         scaled = ql_avg_q / seasonal_error[:, None, :]  # [N, H, D]
@@ -368,7 +368,7 @@ def _quantile_loss(
 def _seasonal_error_per_item(
     *,
     y_past: np.ndarray,
-    lengths: np.ndarray,
+    y_past_lengths: np.ndarray,
     seasonality: int,
     aggregate_fn: Callable,
 ) -> np.ndarray:
@@ -378,7 +378,7 @@ def _seasonal_error_per_item(
     ----------
     y_past : np.ndarray [total_T, D]
         Concatenated past observations.
-    lengths : np.ndarray [N]
+    y_past_lengths : np.ndarray [N]
         Number of observations per item.
     seasonality : int
         Seasonal period.
@@ -389,20 +389,20 @@ def _seasonal_error_per_item(
     -------
     np.ndarray [N, D]
     """
-    num_series = len(lengths)
+    num_series = len(y_past_lengths)
     num_dims = y_past.shape[1]
 
     if num_series == 0:
         return np.array([], dtype="float64").reshape(0, 0)
 
-    num_diffs_per_series = np.maximum(lengths - seasonality, 0)
+    num_diffs_per_series = np.maximum(y_past_lengths - seasonality, 0)
 
     if num_diffs_per_series.sum() == 0:
         return np.full((num_series, num_dims), np.nan, dtype="float64")
 
     # Fast path: all items have equal length — reshape + slice instead of fancy indexing
-    if np.all(lengths == lengths[0]):
-        T = int(lengths[0])
+    if np.all(y_past_lengths == y_past_lengths[0]):
+        T = int(y_past_lengths[0])
         y_reshaped = y_past.reshape(num_series, T, num_dims)
         diffs = y_reshaped[:, seasonality:, :] - y_reshaped[:, :-seasonality, :]
         return np.nanmean(aggregate_fn(diffs), axis=1)
@@ -415,7 +415,7 @@ def _seasonal_error_per_item(
 
     offsets = np.empty(num_series + 1, dtype=np.int64)
     offsets[0] = 0
-    np.cumsum(lengths, out=offsets[1:])
+    np.cumsum(y_past_lengths, out=offsets[1:])
     idx_current = offsets[series_ids] + seasonality + diff_offsets
     idx_lagged = idx_current - seasonality
 
@@ -433,14 +433,14 @@ def _seasonal_error_per_item(
     return result
 
 
-def _abs_seasonal_error_per_item(*, y_past: np.ndarray, lengths: np.ndarray, seasonality: int) -> np.ndarray:
+def _abs_seasonal_error_per_item(*, y_past: np.ndarray, y_past_lengths: np.ndarray, seasonality: int) -> np.ndarray:
     """Compute mean absolute seasonal error. Returns [N, D]."""
-    return _seasonal_error_per_item(y_past=y_past, lengths=lengths, seasonality=seasonality, aggregate_fn=np.abs)
+    return _seasonal_error_per_item(y_past=y_past, y_past_lengths=y_past_lengths, seasonality=seasonality, aggregate_fn=np.abs)
 
 
-def _squared_seasonal_error_per_item(*, y_past: np.ndarray, lengths: np.ndarray, seasonality: int) -> np.ndarray:
+def _squared_seasonal_error_per_item(*, y_past: np.ndarray, y_past_lengths: np.ndarray, seasonality: int) -> np.ndarray:
     """Compute mean squared seasonal error. Returns [N, D]."""
-    return _seasonal_error_per_item(y_past=y_past, lengths=lengths, seasonality=seasonality, aggregate_fn=np.square)
+    return _seasonal_error_per_item(y_past=y_past, y_past_lengths=y_past_lengths, seasonality=seasonality, aggregate_fn=np.square)
 
 
 AVAILABLE_METRICS: dict[str, Type[Metric]] = {
diff --git a/test/test_metrics.py b/test/test_metrics.py
index 26d36e8..0343be4 100644
--- a/test/test_metrics.py
+++ b/test/test_metrics.py
@@ -97,7 +97,7 @@ def test_seasonal_error_per_item(aggregate_fn):
 
     flat, lengths = _arrays_to_flat(arrays)
     result = _seasonal_error_per_item(
-        y_past=flat, lengths=lengths, seasonality=seasonality, aggregate_fn=aggregate_fn
+        y_past=flat, y_past_lengths=lengths, seasonality=seasonality, aggregate_fn=aggregate_fn
     )[:, 0]
     expected = _reference_seasonal_error_per_item(arrays, seasonality, aggregate_fn)
 
@@ -107,6 +107,6 @@ def test_seasonal_error_per_item(aggregate_fn):
 def test_seasonal_error_per_item_empty():
     """Test with empty input."""
     flat, lengths = _arrays_to_flat([])
-    result = _seasonal_error_per_item(y_past=flat, lengths=lengths, seasonality=2, aggregate_fn=np.abs)
+    result = _seasonal_error_per_item(y_past=flat, y_past_lengths=lengths, seasonality=2, aggregate_fn=np.abs)
     assert result.size == 0
     assert result.dtype == np.float64

From 11080b7e0cac70e1203441d8de88a88fc9414406 Mon Sep 17 00:00:00 2001
From: Oleksandr Shchur <shchuro@amazon.com>
Date: Mon, 18 May 2026 15:15:03 +0000
Subject: [PATCH 9/9] Fix format

---
 src/fev/metrics.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/fev/metrics.py b/src/fev/metrics.py
index b2685fd..0091e73 100644
--- a/src/fev/metrics.py
+++ b/src/fev/metrics.py
@@ -435,12 +435,18 @@ def _seasonal_error_per_item(
 
 def _abs_seasonal_error_per_item(*, y_past: np.ndarray, y_past_lengths: np.ndarray, seasonality: int) -> np.ndarray:
     """Compute mean absolute seasonal error. Returns [N, D]."""
-    return _seasonal_error_per_item(y_past=y_past, y_past_lengths=y_past_lengths, seasonality=seasonality, aggregate_fn=np.abs)
+    return _seasonal_error_per_item(
+        y_past=y_past, y_past_lengths=y_past_lengths, seasonality=seasonality, aggregate_fn=np.abs
+    )
 
 
-def _squared_seasonal_error_per_item(*, y_past: np.ndarray, y_past_lengths: np.ndarray, seasonality: int) -> np.ndarray:
+def _squared_seasonal_error_per_item(
+    *, y_past: np.ndarray, y_past_lengths: np.ndarray, seasonality: int
+) -> np.ndarray:
     """Compute mean squared seasonal error. Returns [N, D]."""
-    return _seasonal_error_per_item(y_past=y_past, y_past_lengths=y_past_lengths, seasonality=seasonality, aggregate_fn=np.square)
+    return _seasonal_error_per_item(
+        y_past=y_past, y_past_lengths=y_past_lengths, seasonality=seasonality, aggregate_fn=np.square
+    )
 
 
 AVAILABLE_METRICS: dict[str, Type[Metric]] = {