From 338c7eadcd52a902f23fb955a9d59d16ef456de5 Mon Sep 17 00:00:00 2001
From: Oleksandr Shchur <shchuro@amazon.com>
Date: Tue, 19 May 2026 16:11:09 +0000
Subject: [PATCH 1/3] Update docs for the new ForecastingModel class

---
 README.md                                     |    5 +-
 docs/tutorials/05-add-your-model.ipynb        |  270 ++-
 examples/README.md                            |    6 +
 examples/chronos-2/evaluate_model.py          |   66 -
 examples/chronos-2/requirements.txt           |    7 -
 examples/chronos/evaluate_model.py            |  113 --
 examples/chronos/requirements.txt             |    2 -
 examples/mlforecast/evaluate_model.py         |  439 -----
 examples/mlforecast/requirements.txt          |    5 -
 examples/moirai/evaluate_model.py             |   93 --
 examples/moirai/requirements.txt              |    2 -
 examples/seasonal_naive/evaluate_model.py     |   59 -
 examples/seasonal_naive/requirements.txt      |    1 -
 examples/statsforecast/evaluate_model.py      |  125 --
 examples/statsforecast/requirements.txt       |    1 -
 examples/timecopilot/.python-version          |    1 -
 examples/timecopilot/README.md                |   69 -
 examples/timecopilot/pyproject.toml           |   17 -
 examples/timecopilot/requirements.txt         | 1458 -----------------
 examples/timecopilot/src/__init__.py          |    0
 examples/timecopilot/src/download_results.py  |   19 -
 examples/timecopilot/src/evaluate_model.py    |   99 --
 .../timecopilot/src/evaluate_model_modal.py   |   56 -
 examples/timesfm-2.0/evaluate_model.py        |  139 --
 examples/timesfm-2.0/requirements.txt         |    4 -
 examples/timesfm-2.5/evaluate_model.py        |  133 --
 examples/timesfm-2.5/requirements.txt         |    3 -
 examples/tirex/README.md                      |    9 -
 examples/tirex/evaluate_model.py              |   82 -
 examples/tirex/freezed_test_environment.yaml  |  284 ----
 examples/tirex/requirements.txt               |    2 -
 examples/toto/evaluate_model.py               |  247 ---
 examples/toto/requirements.txt                |    1 -
 examples/ttm-r2/evaluate_model.py             |  150 --
 examples/ttm-r2/requirements.txt              |    3 -
 src/fev/__about__.py                          |    2 +-
 36 files changed, 134 insertions(+), 3838 deletions(-)
 create mode 100644 examples/README.md
 delete mode 100644 examples/chronos-2/evaluate_model.py
 delete mode 100644 examples/chronos-2/requirements.txt
 delete mode 100644 examples/chronos/evaluate_model.py
 delete mode 100644 examples/chronos/requirements.txt
 delete mode 100644 examples/mlforecast/evaluate_model.py
 delete mode 100644 examples/mlforecast/requirements.txt
 delete mode 100644 examples/moirai/evaluate_model.py
 delete mode 100644 examples/moirai/requirements.txt
 delete mode 100644 examples/seasonal_naive/evaluate_model.py
 delete mode 100644 examples/seasonal_naive/requirements.txt
 delete mode 100644 examples/statsforecast/evaluate_model.py
 delete mode 100644 examples/statsforecast/requirements.txt
 delete mode 100644 examples/timecopilot/.python-version
 delete mode 100644 examples/timecopilot/README.md
 delete mode 100644 examples/timecopilot/pyproject.toml
 delete mode 100644 examples/timecopilot/requirements.txt
 delete mode 100644 examples/timecopilot/src/__init__.py
 delete mode 100644 examples/timecopilot/src/download_results.py
 delete mode 100644 examples/timecopilot/src/evaluate_model.py
 delete mode 100644 examples/timecopilot/src/evaluate_model_modal.py
 delete mode 100644 examples/timesfm-2.0/evaluate_model.py
 delete mode 100644 examples/timesfm-2.0/requirements.txt
 delete mode 100644 examples/timesfm-2.5/evaluate_model.py
 delete mode 100644 examples/timesfm-2.5/requirements.txt
 delete mode 100644 examples/tirex/README.md
 delete mode 100644 examples/tirex/evaluate_model.py
 delete mode 100644 examples/tirex/freezed_test_environment.yaml
 delete mode 100644 examples/tirex/requirements.txt
 delete mode 100644 examples/toto/evaluate_model.py
 delete mode 100644 examples/toto/requirements.txt
 delete mode 100644 examples/ttm-r2/evaluate_model.py
 delete mode 100644 examples/ttm-r2/requirements.txt

diff --git a/README.md b/README.md
index b09e348..5a6614e 100644
--- a/README.md
+++ b/README.md
@@ -26,9 +26,6 @@ Existing forecasting benchmarks usually fall into one of two categories:
 
 `fev` aims for the middle ground - it provides the core benchmarking functionality without introducing unnecessary constraints or bloated dependencies. The library supports point & probabilistic forecasting, different types of covariates, as well as all popular forecasting metrics.
 
-## 📝 Updates
-- **2025-09-16**: The new version `0.6.0` contains major new functionality, [updated documentation](https://autogluon.github.io/fev/latest/), as well as some breaking changes to the `Task` API. Please check the [release notes](https://github.com/autogluon/fev/releases) for more details.
-
 ## ⚙️ Installation
 ```
 pip install fev
@@ -123,7 +120,7 @@ fev.leaderboard(summaries)
     - [Models](https://autogluon.github.io/fev/latest/tutorials/05-add-your-model/): Evaluate your models and submit results to the leaderboard.
 - [API reference](https://autogluon.github.io/fev/latest/api/task/)
 
-Examples of model implementations compatible with `fev` are available in [`examples/`](./examples/).
+Model wrappers and instructions for contributing models are available in [`models/`](./models/).
 
 
 ## 🏅 Leaderboards
diff --git a/docs/tutorials/05-add-your-model.ipynb b/docs/tutorials/05-add-your-model.ipynb
index 6b169c2..d577e46 100644
--- a/docs/tutorials/05-add-your-model.ipynb
+++ b/docs/tutorials/05-add-your-model.ipynb
@@ -5,7 +5,7 @@
    "metadata": {},
    "source": [
     "This notebook covers the following topics:\n",
-    "1. Adding a wrapper for your model to [fev/examples](https://github.com/autogluon/fev/tree/main/examples).\n",
+    "1. Adding a wrapper for your model to [fev/models](https://github.com/autogluon/fev/tree/main/models).\n",
     "2. Submitting the results for your model to the [fev-leaderboard](https://huggingface.co/spaces/autogluon/fev-leaderboard)."
    ]
   },
@@ -13,52 +13,96 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Adding a wrapper for your model to [fev/examples](https://github.com/autogluon/fev/tree/main/examples).\n",
-    "To add a wrapper for your library to `fev/examples`, you need to create a folder under `fev/examples/{YOUR_MODEL_NAME}` that contains:\n",
-    "- A Python file `evaluate_model.py` that contains a method `predict_with_model` with signature\n",
-    "    ```python\n",
-    "    def predict_with_model(task: fev.Task, **kwargs) -> tuple[list[datasets.DatasetDict], float, dict]:\n",
-    "        \"\"\"Returns model predictions, inference time and potentially extra information about the model.\"\"\"\n",
-    "        ...\n",
-    "    ```\n",
-    "- `requirements.txt` file containing the required dependencies for your model.\n",
-    "\n",
-    "Defining the method `predict_with_model` is the most complex part of this process. We recommend looking at the implementations of some existing models to see how this can be done.\n",
-    "\n",
-    "The only hard requirement for this method is that it should return a tuple consisting of 3 elements:\n",
-    "1. `predictions` (`list[datasets.DatasetDict]`) object containing the model predictions for each evaluation window.  \n",
-    "2. `inference_time` (`float`) inference time of the model for the entire task (in seconds).\n",
-    "3. `extra_info` (`dict | None`) optional information about the model such as model configuration.\n",
-    "\n",
-    "Predictions should follow the schema provided by `task.predictions_schema`.\n",
-    "\n",
-    "Each entry of `predictions` must contain a list of length `task.horizon`"
+    "## Adding a wrapper for your model\n",
+    "\n",
+    "Each model wrapper lives in its own subfolder under `models/`. The evaluation harness (`models/evaluate.py`) discovers and runs them automatically.\n",
+    "\n",
+    "### Step 1: Create the folder\n",
+    "\n",
+    "Create a folder `models/<name>/`, where `<name>` is the identifier you will pass to `models/evaluate.py` via the `-m` flag.\n",
+    "\n",
+    "### Step 2: Add `model.py`\n",
+    "\n",
+    "Implement a subclass of `fev.ForecastingModel`. The `model_name` class attribute must match the folder name."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```python\n",
+    "# models/my-model/model.py\n",
+    "import datasets\n",
+    "\n",
+    "import fev\n",
+    "\n",
+    "\n",
+    "class MyModel(fev.ForecastingModel):\n",
+    "    model_name = \"my-model\"  # must match the folder name\n",
+    "\n",
+    "    # List HF dataset configs (from autogluon/fev_datasets) used during pretraining.\n",
+    "    # Used to flag potential data leakage. Leave empty for models that train from scratch.\n",
+    "    trained_on_datasets = [\"kdd_cup_2022_10T\", \"m5_1D\"]\n",
+    "\n",
+    "    def __init__(self, model_size: str = \"small\"):\n",
+    "        super().__init__()\n",
+    "        self.model_size = model_size\n",
+    "\n",
+    "    def _fit_predict(self, task: fev.Task) -> list[datasets.DatasetDict]:\n",
+    "        predictions_per_window = []\n",
+    "        for window in task.iter_windows():\n",
+    "            past_data, future_data = window.get_input_data()\n",
+    "\n",
+    "            with self._record_inference_time():\n",
+    "                # Generate predictions for each time series\n",
+    "                predictions = {\"predictions\": [...]}\n",
+    "\n",
+    "            predictions_per_window.append(predictions)\n",
+    "        return predictions_per_window\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Key points about `_fit_predict`:\n",
+    "- Called once per task. Must return predictions for **all** evaluation windows.\n",
+    "- Use `self._record_inference_time()` context manager to track inference time.\n",
+    "- Use `self._record_training_time()` if your model has a training step.\n",
+    "- Each call should be independent — don't carry over state from prior tasks.\n",
+    "- Caching expensive resources (weights, tokenizers) on `self` across calls is fine."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 3: Add `requirements.txt`\n",
+    "\n",
+    "List pinned dependencies for your model. These are installed automatically in an ephemeral environment when running `evaluate.py` — your project environment is not modified.\n",
+    "\n",
+    "```\n",
+    "# models/my-model/requirements.txt\n",
+    "my-forecasting-lib==1.2.3\n",
+    "torch==2.7.1\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Predictions format\n",
+    "\n",
+    "Predictions must follow the schema provided by `task.predictions_schema`."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/shchuro/envs/fev/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "{'predictions': Sequence(feature=Value(dtype='float64', id=None), length=30, id=None)}"
-      ]
-     },
-     "execution_count": 1,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "import fev\n",
     "\n",
@@ -75,28 +119,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "  For a probabilistic forecasting task (if `task.quantile_levels` are provided), each entry of `predictions` must additionally contain the quantile forecasts. For example"
+    "For probabilistic forecasting tasks (when `task.quantile_levels` is set), predictions must additionally contain quantile forecasts:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'predictions': Sequence(feature=Value(dtype='float64', id=None), length=30, id=None),\n",
-       " '0.1': Sequence(feature=Value(dtype='float64', id=None), length=30, id=None),\n",
-       " '0.5': Sequence(feature=Value(dtype='float64', id=None), length=30, id=None),\n",
-       " '0.9': Sequence(feature=Value(dtype='float64', id=None), length=30, id=None)}"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "task = fev.Task(\n",
     "    dataset_path=\"autogluon/chronos_datasets\",\n",
@@ -112,116 +142,68 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The `predictions` cannot contain any missing values represented by `NaN`, otherwise an exception will be raised.\n",
+    "Predictions cannot contain any `NaN` values.\n",
     "\n",
-    "Other than what's described above, there are no hard restrictions on how the `predict_with_model` method needs to be implemented. For example, it's completely up to you whether the method uses any datasets columns except the target or how the data is preprocessed.\n",
+    "### Tips\n",
     "\n",
-    "Still, here is some general advice:\n",
-    "- If your model is capable of generating probabilistic forecasts, make sure that you correct the \"optimal\" forecast for the `task.eval_metric`. For example, metrics like `\"MSE\"` or `\"RMSSE\"`, the mean forecast is preferred, while metrics like `\"MASE\"` are optimized by the median forecast.\n",
-    "- Use `fev.convert_input_data()` to take advantage of the adapters and reduce the boilerplate preprocessing code.\n",
-    "- Make sure that your wrapper can deal with missing values (or at least imputes them before passing the data to your model).\n",
-    "- Make sure that your wrapper takes advantage of the extra features of the task. For example, the following attributes might be useful:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "task.static_columns=[]\n",
-      "task.dynamic_columns=[]\n",
-      "task.known_dynamic_columns=[]\n",
-      "task.past_dynamic_columns=[]\n",
-      "task.freq='h'\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(f\"{task.static_columns=}\")\n",
-    "print(f\"{task.dynamic_columns=}\")\n",
-    "print(f\"{task.known_dynamic_columns=}\")\n",
-    "print(f\"{task.past_dynamic_columns=}\")\n",
-    "# Attributes available after `task.load_full_dataset` is called\n",
-    "task.load_full_dataset()\n",
-    "print(f\"{task.freq=}\")"
+    "- If your model generates probabilistic forecasts, choose the \"optimal\" point forecast for the `task.eval_metric`. For example, metrics like `\"MSE\"` prefer the mean, while `\"MASE\"` is optimized by the median.\n",
+    "- Use `fev.convert_input_data()` to take advantage of adapters and reduce boilerplate preprocessing.\n",
+    "- Make sure your wrapper handles missing values (or imputes them before passing data to the model).\n",
+    "- Take advantage of extra features available via `task.static_columns`, `task.dynamic_columns`, `task.known_dynamic_columns`, and `task.past_dynamic_columns`."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Submitting the results for your model to the [fev-leaderboard](https://huggingface.co/spaces/autogluon/fev-leaderboard)\n",
-    "After you've implemented the wrapper for your model in `fev/examples`, complete the following steps:\n",
-    "1. Fork [`autogluon/fev`](https://github.com/autogluon/fev) and clone your fork to your machine.\n",
-    "2. Implement your model's wrapper in `fev/examples`.\n",
-    "3. Run the model on all tasks from the benchmark and save the results to `fev/benchmarks/fev_bench/results/{model_name}.csv`.\n",
-    "4. Open a pull request to `autogluon/fev` containing the following files:\n",
-    "    - `fev/examples/{model_name}/evaluate_model.py`\n",
-    "    - `fev/examples/{model_name}/requirements.txt`\n",
-    "    - `fev/benchmarks/fev_bench/results/{model_name}.csv`\n",
-    "5. We will independently reproduce the results using the code you provided and add the results to the leaderboard."
+    "## Running evaluation\n",
+    "\n",
+    "```bash\n",
+    "python models/evaluate.py -m my-model\n",
+    "```\n",
+    "\n",
+    "Options:\n",
+    "- `-m` — model name (must match a subfolder in `models/`)\n",
+    "- `-b` — path or URL to benchmark YAML (default: `fev_bench_mini`)\n",
+    "- `-n` — display name for results (default: same as `-m`)\n",
+    "- `-k` — JSON dict of kwargs passed to the model constructor\n",
+    "- `-t` — limit number of tasks (useful for quick testing)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "```python\n",
-    "# Example code from fev/examples/my_amazing_model/evaluate_model.py\n",
-    "\n",
-    "def predict_with_model(task: fev.Task, **kwargs) -> tuple[list[datasets.DatasetDict], float, dict]:\n",
-    "    \"\"\"Wrapper for my_amazing_model\"\"\"\n",
-    "    ...\n",
-    "    return predictions_per_window, inference_time, extra_info\n",
-    "\n",
-    "if __name__ == \"__main__\":\n",
-    "    model_name = \"my_amazing_model\"\n",
-    "    benchmark = fev.Benchmark.from_yaml(\n",
-    "        \"https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/fev_bench/tasks.yaml\"\n",
-    "    )\n",
-    "\n",
-    "    summaries = []\n",
-    "    for task in benchmark.tasks:\n",
-    "        predictions_per_window, inference_time, extra_info = predict_with_model(task)\n",
-    "        evaluation_summary = task.evaluation_summary(\n",
-    "            predictions_per_window,\n",
-    "            model_name=model_name,\n",
-    "            inference_time_s=inference_time,\n",
-    "            extra_info=extra_info,\n",
-    "            trained_on_this_dataset=True,  # True if model has seen this dataset during training, False otherwise. Please try to be honest!\n",
-    "        )\n",
-    "        summaries.append(evaluation_summary)\n",
-    "\n",
-    "    summary_df = pd.DataFrame(summaries)\n",
-    "    print(summary_df)\n",
-    "    summary_df.to_csv(f\"{model_name}.csv\", index=False)\n",
-    "```"
+    "## Submitting results to the leaderboard\n",
+    "\n",
+    "After implementing your model wrapper, follow these steps to submit results to the [fev-leaderboard](https://huggingface.co/spaces/autogluon/fev-leaderboard):\n",
+    "\n",
+    "1. Fork [`autogluon/fev`](https://github.com/autogluon/fev) and clone your fork.\n",
+    "2. Implement your model wrapper in `models/<name>/`.\n",
+    "3. Run the model on all tasks from the benchmark and save results:\n",
+    "   ```bash\n",
+    "   python models/evaluate.py -m <name> -b benchmarks/fev_bench/tasks.yaml\n",
+    "   mv <name>.csv benchmarks/fev_bench/results/<name>.csv\n",
+    "   ```\n",
+    "4. Open a pull request to `autogluon/fev` containing:\n",
+    "   - `models/<name>/model.py`\n",
+    "   - `models/<name>/requirements.txt`\n",
+    "   - `benchmarks/fev_bench/results/<name>.csv`\n",
+    "5. We will independently reproduce the results using your code and add them to the leaderboard."
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "fev",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
   "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
    "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.11"
+   "version": "3.11.0"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..5af05f0
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,6 @@
+# Examples
+
+Model wrappers and instructions for contributing models are now in [`models/`](../models/).
+
+For archival purposes, the old wrappers from `examples/` remain available at
+<https://github.com/autogluon/fev/tree/v0.7.0/examples>.
diff --git a/examples/chronos-2/evaluate_model.py b/examples/chronos-2/evaluate_model.py
deleted file mode 100644
index 8a1500d..0000000
--- a/examples/chronos-2/evaluate_model.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import chronos
-import datasets
-import pandas as pd
-import torch
-from chronos import BaseChronosPipeline
-
-import fev
-
-datasets.disable_progress_bars()
-
-
-def predict_with_model(
-    task: fev.Task,
-    model_name: str = "amazon/chronos-2",
-    batch_size: int = 100,
-    device_map: str = "cuda",
-    torch_dtype: torch.dtype = torch.float32,
-    as_univariate: bool = False,
-    cross_learning: bool = True,
-    seed: int = 123,
-) -> tuple[list[datasets.DatasetDict], float, dict]:
-    pipeline = BaseChronosPipeline.from_pretrained(model_name, device_map=device_map, torch_dtype=torch_dtype)
-    torch.manual_seed(seed)
-
-    predictions_per_window, inference_time = pipeline.predict_fev(
-        task, batch_size=batch_size, as_univariate=as_univariate, cross_learning=cross_learning
-    )
-
-    extra_info = {
-        "framework_version": chronos.__version__,
-        "model_config": {
-            "model_name": model_name,
-            "batch_size": batch_size,
-            "device_map": device_map,
-            "torch_dtype": str(torch_dtype),
-            "as_univariate": as_univariate,
-            "cross_learning": cross_learning,
-            "seed": seed,
-        },
-    }
-    return predictions_per_window, inference_time, extra_info
-
-
-if __name__ == "__main__":
-    model_name = "amazon/chronos-2"
-    num_tasks = 2  # replace with `num_tasks = None` to run on all tasks
-
-    benchmark = fev.Benchmark.from_yaml(
-        "https://github.com/autogluon/fev/raw/refs/heads/main/benchmarks/fev_bench/tasks.yaml"
-    )
-    summaries = []
-    for task in benchmark.tasks[:num_tasks]:
-        predictions, inference_time, extra_info = predict_with_model(task, model_name=model_name)
-        evaluation_summary = task.evaluation_summary(
-            predictions,
-            model_name=model_name,
-            inference_time_s=inference_time,
-            extra_info=extra_info,
-        )
-        print(evaluation_summary)
-        summaries.append(evaluation_summary)
-
-    # Show and save the results
-    summary_df = pd.DataFrame(summaries)
-    print(summary_df)
-    summary_df.to_csv("chronos-2.csv", index=False)
diff --git a/examples/chronos-2/requirements.txt b/examples/chronos-2/requirements.txt
deleted file mode 100644
index 2167f82..0000000
--- a/examples/chronos-2/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-chronos-forecasting==2.0.0
-torch==2.7.1
-transformers==4.54.1
-accelerate==1.9.0
-numpy==2.1.3
-einops==0.8.1
-scikit-learn==1.7.2
diff --git a/examples/chronos/evaluate_model.py b/examples/chronos/evaluate_model.py
deleted file mode 100644
index 84b75c1..0000000
--- a/examples/chronos/evaluate_model.py
+++ /dev/null
@@ -1,113 +0,0 @@
-import time
-
-import datasets
-import numpy as np
-import pandas as pd
-import torch
-from chronos import BaseChronosPipeline, ForecastType
-from tqdm.auto import tqdm
-
-import fev
-
-datasets.disable_progress_bars()
-
-
-def batchify(lst: list, batch_size: int = 32):
-    """Convert list into batches of desired size."""
-    for i in range(0, len(lst), batch_size):
-        yield lst[i : i + batch_size]
-
-
-def predict_with_model(
-    task: fev.Task,
-    model_name: str = "amazon/chronos-bolt-small",
-    batch_size: int = 32,
-    device_map: str = "cuda",
-    torch_dtype: str = torch.bfloat16,
-    num_samples: int = 20,
-    seed: int = 123,
-) -> tuple[list[datasets.DatasetDict], float, dict]:
-    pipeline = BaseChronosPipeline.from_pretrained(model_name, device_map=device_map, torch_dtype=torch_dtype)
-    torch.manual_seed(seed)
-
-    inference_time = 0.0
-    quantile_levels = task.quantile_levels.copy()
-    if 0.5 not in quantile_levels:
-        quantile_levels.append(0.5)
-
-    predictions_per_window = []
-    for window in task.iter_windows():
-        past_data, _ = fev.convert_input_data(window, adapter="datasets", as_univariate=True)
-        past_data = past_data.with_format("torch").cast_column("target", datasets.Sequence(datasets.Value("float32")))
-
-        quantiles_all = []
-        mean_all = []
-
-        start_time = time.monotonic()
-        for batch in batchify(past_data["target"], batch_size=batch_size):
-            quantiles, mean = pipeline.predict_quantiles(
-                context=batch,
-                prediction_length=task.horizon,
-                limit_prediction_length=False,
-                quantile_levels=quantile_levels,
-            )
-
-            quantiles_all.append(quantiles.numpy())
-            mean_all.append(mean.numpy())
-        inference_time += time.monotonic() - start_time
-
-        quantiles_np = np.concatenate(quantiles_all, axis=0)  # [num_items, horizon, num_quantiles]
-        mean_np = np.concatenate(mean_all, axis=0)  # [num_items, horizon]
-
-        if task.eval_metric in ["MSE", "RMSE", "RMSSE"]:
-            point_forecast = mean_np  # [num_items, horizon]
-        else:
-            # use median as the point forecast
-            point_forecast = quantiles_np[:, :, quantile_levels.index(0.5)]  # [num_items, horizon]
-        predictions_dict = {"predictions": point_forecast}
-
-        for idx, level in enumerate(task.quantile_levels):
-            predictions_dict[str(level)] = quantiles_np[:, :, idx]
-
-        predictions_per_window.append(
-            fev.utils.combine_univariate_predictions_to_multivariate(
-                datasets.Dataset.from_dict(predictions_dict), target_columns=task.target_columns
-            )
-        )
-
-    extra_info = {
-        "model_config": {
-            "model_name": model_name,
-            "batch_size": batch_size,
-            "device_map": device_map,
-            "torch_dtype": str(torch_dtype),
-            "num_samples": num_samples,
-            "seed": seed,
-        }
-    }
-    return predictions_per_window, inference_time, extra_info
-
-
-if __name__ == "__main__":
-    model_name = "amazon/chronos-bolt-small"
-    num_tasks = 2  # replace with `num_tasks = None` to run on all tasks
-
-    benchmark = fev.Benchmark.from_yaml(
-        "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/tasks.yaml"
-    )
-    summaries = []
-    for task in benchmark.tasks[:num_tasks]:
-        predictions, inference_time, extra_info = predict_with_model(task, model_name=model_name)
-        evaluation_summary = task.evaluation_summary(
-            predictions,
-            model_name=model_name,
-            inference_time_s=inference_time,
-            extra_info=extra_info,
-        )
-        print(evaluation_summary)
-        summaries.append(evaluation_summary)
-
-    # Show and save the results
-    summary_df = pd.DataFrame(summaries)
-    print(summary_df)
-    summary_df.to_csv(f"{model_name}.csv", index=False)
diff --git a/examples/chronos/requirements.txt b/examples/chronos/requirements.txt
deleted file mode 100644
index f0d43bd..0000000
--- a/examples/chronos/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-chronos-forecasting==1.5.3
-torch==2.6.0
diff --git a/examples/mlforecast/evaluate_model.py b/examples/mlforecast/evaluate_model.py
deleted file mode 100644
index f42fbbf..0000000
--- a/examples/mlforecast/evaluate_model.py
+++ /dev/null
@@ -1,439 +0,0 @@
-"""MLForecast recursive models with LightGBM and CatBoost.
-
-Design notes:
-- Lightweight HPO tunes preprocessing only (differencing, scaling, lag transforms)
-- Lags and time features selected via heuristics based on frequency
-- No HPO of model hyperparameters to keep runtime manageable
-
-This is the simplest setup we found that produces reasonable results across all 100 tasks
-of fev-bench. Removing HPO or the heuristics significantly hurts performance. There's
-likely room for better automated logic - contributions welcome!
-"""
-
-import time
-import warnings
-from typing import Literal
-
-import datasets
-import pandas as pd
-from tqdm.auto import tqdm
-
-import fev
-
-datasets.disable_progress_bars()
-
-
-def _create_lgbm(fit_time_limit: float | None, **model_kwargs):
-    from lightgbm import LGBMRegressor
-    from lightgbm.callback import EarlyStopException
-
-    class _LGBMRegressor(LGBMRegressor):
-        def fit(self, X, y, **kwargs):
-            if fit_time_limit is not None:
-                start_time = time.time()
-
-                def _time_callback(env):
-                    if time.time() - start_time >= fit_time_limit:
-                        raise EarlyStopException(env.iteration, [])
-
-                _time_callback.order = 30
-                callbacks = kwargs.get("callbacks", []) or []
-                kwargs["callbacks"] = callbacks + [_time_callback]
-            return super().fit(X, y, **kwargs)
-
-    return _LGBMRegressor(objective="mae", verbose=-1, **model_kwargs)
-
-
-def _create_catboost(fit_time_limit: float | None, **model_kwargs):
-    from catboost import CatBoostRegressor
-
-    class _CatBoostRegressor(CatBoostRegressor):
-        def fit(self, X, y=None, **kwargs):
-            if "cat_features" not in kwargs:
-                cat_cols = list(X.select_dtypes(include=["object", "category"]).columns)
-                if cat_cols:
-                    kwargs["cat_features"] = cat_cols
-
-            if fit_time_limit is not None:
-                start_time = time.time()
-
-                class _TimeCallback:
-                    def __init__(self):
-                        self.time_end = start_time + fit_time_limit
-
-                    def after_iteration(self, info):
-                        time_cur = time.time()
-                        time_per_iter = (time_cur - start_time) / max(info.iteration, 1)
-                        return self.time_end >= (time_cur + 2 * time_per_iter)
-
-                callbacks = kwargs.get("callbacks", []) or []
-                kwargs["callbacks"] = callbacks + [_TimeCallback()]
-
-            return super().fit(X, y, **kwargs)
-
-    return _CatBoostRegressor(loss_function="MAE", verbose=False, allow_writing_files=False, **model_kwargs)
-
-
-class MLForecastModel:
-    """MLForecast with LightGBM or CatBoost regressor."""
-
-    def __init__(
-        self,
-        regressor: Literal["lightgbm", "catboost"] = "lightgbm",
-        lags: list[int] | None = None,
-        date_features: list | None = None,
-        differences: list[int] | None = None,
-        fit_time_limit: float | None = 600,
-        model_kwargs: dict | None = None,
-    ):
-        self.regressor = regressor
-        self.lags = lags
-        self.date_features = date_features
-        self.differences = differences
-        self.fit_time_limit = fit_time_limit
-        self.model_kwargs = model_kwargs or {}
-
-    def _create_model(self):
-        if self.regressor == "lightgbm":
-            return _create_lgbm(self.fit_time_limit, **self.model_kwargs)
-        if self.regressor == "catboost":
-            return _create_catboost(self.fit_time_limit, **self.model_kwargs)
-        raise ValueError(f"Unknown regressor: {self.regressor}")
-
-    def _get_lags(self, freq: str, median_series_len: int, seasonality: int = 1) -> list[int]:
-        if self.lags is not None:
-            return [lag for lag in self.lags if lag < median_series_len]
-
-        from autogluon.timeseries.utils.datetime import get_lags_for_frequency
-
-        # Limit max lag so that we have enough training samples even for short series.
-        # After differencing, the effective length decreases, and we need some rows left
-        # for training features; hence we reserve at least 10 rows for feature construction.
-        diff_cost = max(self.differences) if self.differences else seasonality
-        effective_len = median_series_len - diff_cost
-        max_lag = min(effective_len - 1, max(1, effective_len - 10))
-        lags = [lag for lag in get_lags_for_frequency(freq) if lag <= max_lag]
-
-        if effective_len < 30 and len(lags) > 5:
-            lags = lags[:5]
-
-        return lags if lags else [1]
-
-    def _get_date_features(self, freq: str) -> list:
-        if self.date_features is not None:
-            return self.date_features
-        from autogluon.timeseries.utils.datetime import get_time_features_for_frequency
-
-        return get_time_features_for_frequency(freq)
-
-    def _get_target_transforms(self, seasonality: int = 1, min_series_len: int | None = None):
-        from mlforecast.target_transforms import Differences, LocalStandardScaler
-
-        transforms = []
-        differences = self.differences if self.differences is not None else [seasonality]
-        if differences and (min_series_len is None or min_series_len > max(differences)):
-            transforms.append(Differences(differences))
-        transforms.append(LocalStandardScaler())
-        return transforms
-
-    def _prepare_data(self, window: fev.EvaluationWindow, task: fev.Task) -> tuple[pd.DataFrame, pd.DataFrame | None]:
-        train_df, future_df, static_df = fev.convert_input_data(window, adapter="nixtla", as_univariate=True)
-        train_df = train_df.copy()
-        future_df = future_df.copy()
-
-        if train_df["y"].isna().any():
-            train_df["y"] = train_df.groupby("unique_id", sort=False)["y"].ffill().fillna(0.0)
-
-        if task.past_dynamic_columns:
-            train_df = train_df.drop(columns=task.past_dynamic_columns)
-
-        if static_df is not None and len(static_df.columns) > 1:
-            train_df = train_df.merge(static_df, on="unique_id", how="left")
-            future_df = future_df.merge(static_df, on="unique_id", how="left")
-
-        cat_cols = list(train_df.select_dtypes(include=["object", "category"]).columns)
-        for col in cat_cols:
-            train_df[col] = train_df[col].fillna("_NA_")
-            future_df[col] = future_df[col].fillna("_NA_")
-            all_categories = pd.concat([train_df[col], future_df[col]]).unique()
-            cat_type = pd.CategoricalDtype(categories=all_categories)
-            train_df[col] = train_df[col].astype(cat_type)
-            future_df[col] = future_df[col].astype(cat_type)
-
-        for df in [train_df, future_df]:
-            num_cols = list(df.select_dtypes(include=["number"]).columns)
-            df[num_cols] = df[num_cols].astype("float32")
-
-        if len(future_df.columns) <= 2:
-            future_df = None
-
-        return train_df, future_df
-
-    def _create_mlforecast(self, freq: str, lags: list[int], date_features: list, target_transforms: list):
-        from mlforecast import MLForecast
-
-        return MLForecast(
-            models={self.regressor: self._create_model()},
-            freq=freq,
-            lags=lags,
-            date_features=date_features,
-            target_transforms=target_transforms,
-        )
-
-    def _format_predictions(
-        self, preds_df: pd.DataFrame, quantile_levels: list[float], target_columns: list[str]
-    ) -> datasets.DatasetDict:
-        preds_df[fev.constants.PREDICTIONS] = preds_df[self.regressor]
-        for q in quantile_levels:
-            preds_df[str(q)] = preds_df[self.regressor]
-
-        output_columns = [fev.constants.PREDICTIONS] + [str(q) for q in quantile_levels]
-        predictions = [group[output_columns].to_dict("list") for _, group in preds_df.groupby("unique_id", sort=True)]
-        return fev.utils.combine_univariate_predictions_to_multivariate(predictions, target_columns=target_columns)
-
-    def _predict_window(
-        self, window: fev.EvaluationWindow, task: fev.Task, tuned_config: dict | None = None
-    ) -> tuple[datasets.DatasetDict, float, float]:
-        train_df, future_df = self._prepare_data(window, task)
-
-        if tuned_config is not None:
-            lags = tuned_config["lags"]
-            date_features = tuned_config["date_features"]
-            target_transforms = tuned_config["target_transforms"]
-        else:
-            series_lengths = train_df.groupby("unique_id").size()
-            min_series_len = int(series_lengths.min())
-            median_series_len = int(series_lengths.median())
-            lags = self._get_lags(task.freq, median_series_len, task.seasonality)
-            date_features = self._get_date_features(task.freq)
-            target_transforms = self._get_target_transforms(task.seasonality, min_series_len)
-
-        forecaster = self._create_mlforecast(task.freq, lags, date_features, target_transforms)
-
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-
-            start_time = time.monotonic()
-            forecaster.fit(train_df, static_features=[])
-            training_time = time.monotonic() - start_time
-
-            start_time = time.monotonic()
-            preds_df = forecaster.predict(window.horizon, X_df=future_df)
-            inference_time = time.monotonic() - start_time
-
-        return (
-            self._format_predictions(preds_df, task.quantile_levels, window.target_columns),
-            training_time,
-            inference_time,
-        )
-
-    def fit_predict(self, task: fev.Task) -> tuple[list[datasets.DatasetDict], float, float, dict]:
-        """Fit and predict on all windows of a task.
-
-        Returns:
-            predictions: List of DatasetDict, one per evaluation window
-            training_time: Total time spent on fitting (seconds)
-            inference_time: Total time spent on predicting (seconds)
-            extra_info: Dictionary with model metadata
-        """
-        task.load_full_dataset()
-
-        predictions = []
-        total_training_time = 0.0
-        total_inference_time = 0.0
-
-        for window in task.iter_windows():
-            preds, train_time, infer_time = self._predict_window(window, task)
-            predictions.append(preds)
-            total_training_time += train_time
-            total_inference_time += infer_time
-
-        extra_info = {"regressor": self.regressor}
-        return predictions, total_training_time, total_inference_time, extra_info
-
-
-class MLForecastAutoModel(MLForecastModel):
-    """MLForecast with preprocessing tuning via Optuna."""
-
-    def __init__(
-        self,
-        regressor: Literal["lightgbm", "catboost"] = "lightgbm",
-        num_samples: int = 20,
-        n_windows: int = 3,
-        hpo_time_limit: float | None = 1800,
-        n_jobs: int = 1,
-        **kwargs,
-    ):
-        super().__init__(regressor=regressor, **kwargs)
-        self.num_samples = num_samples
-        self.n_windows = n_windows
-        self.hpo_time_limit = hpo_time_limit
-        self.n_jobs = n_jobs
-
-    def _filter_series_for_hpo(self, train_df: pd.DataFrame, horizon: int) -> pd.DataFrame | None:
-        min_required = (self.n_windows + 1) * horizon + 1
-        series_lengths = train_df.groupby("unique_id").size()
-        valid_series = series_lengths[series_lengths >= min_required].index
-
-        min_series_count = min(10, max(1, len(series_lengths) // 10))
-        if len(valid_series) < min_series_count:
-            return None
-
-        return train_df[train_df["unique_id"].isin(valid_series)]
-
-    def _get_preprocessing_search_space(
-        self, seasonality: int, default_lags: list[int], default_date_features: list, min_series_len: int
-    ):
-        from mlforecast.lag_transforms import ExponentiallyWeightedMean, RollingMean
-        from mlforecast.target_transforms import Differences, LocalStandardScaler
-
-        candidate_transforms = [[LocalStandardScaler()]]
-        if min_series_len > 1:
-            candidate_transforms.append([Differences([1]), LocalStandardScaler()])
-        if min_series_len > seasonality:
-            candidate_transforms.insert(0, [Differences([seasonality]), LocalStandardScaler()])
-        if seasonality > 1 and min_series_len > seasonality + 1:
-            candidate_transforms.append([Differences([1, seasonality]), LocalStandardScaler()])
-
-        candidate_lag_transforms = [None, {1: [ExponentiallyWeightedMean(0.9)]}]
-        if seasonality > 1:
-            candidate_lag_transforms.append({seasonality: [RollingMean(window_size=seasonality, min_samples=1)]})
-
-        def config(trial):
-            tfm_idx = trial.suggest_categorical("target_transforms_idx", range(len(candidate_transforms)))
-            lag_tfm_idx = trial.suggest_categorical("lag_transforms_idx", range(len(candidate_lag_transforms)))
-            return {
-                "target_transforms": candidate_transforms[tfm_idx],
-                "lags": default_lags,
-                "lag_transforms": candidate_lag_transforms[lag_tfm_idx],
-                "date_features": default_date_features,
-            }
-
-        return config
-
-    def _run_hpo(
-        self, train_df: pd.DataFrame, task: fev.Task, lags: list[int], date_features: list, min_series_len: int
-    ) -> dict:
-        """Run HPO and return the tuned config."""
-        import optuna
-        from mlforecast.auto import AutoMLForecast, AutoModel
-
-        optuna.logging.set_verbosity(optuna.logging.ERROR)
-
-        # MLForecast doesn't allow passing kwargs to model.fit(), so we use custom model wrappers
-        # to inject time limit callbacks and specify categorical features. We also construct custom
-        # search spaces since the default ones in MLForecast can lead to catastrophically bad performance.
-        forecaster = AutoMLForecast(
-            models={self.regressor: AutoModel(model=self._create_model(), config=lambda t: {})},
-            freq=task.freq,
-            init_config=self._get_preprocessing_search_space(task.seasonality, lags, date_features, min_series_len),
-            fit_config=lambda t: {"static_features": []},
-        )
-
-        optimize_kwargs = {}
-        if self.n_jobs != 1:
-            optimize_kwargs["n_jobs"] = self.n_jobs
-        if self.hpo_time_limit is not None:
-            optimize_kwargs["timeout"] = self.hpo_time_limit
-
-        forecaster.fit(
-            train_df,
-            n_windows=self.n_windows,
-            h=task.horizon,
-            num_samples=self.num_samples,
-            optimize_kwargs=optimize_kwargs or None,
-        )
-
-        best_mlf = forecaster.models_[self.regressor]
-        return {
-            "lags": list(best_mlf.ts.lags) if best_mlf.ts.lags is not None else [],
-            "date_features": best_mlf.ts.date_features if best_mlf.ts.date_features else [],
-            "target_transforms": best_mlf.ts.target_transforms if best_mlf.ts.target_transforms else [],
-        }
-
-    def _compute_global_min_series_len(self, task: fev.Task) -> int:
-        min_len = float("inf")
-        for window in task.iter_windows():
-            train_df, _ = self._prepare_data(window, task)
-            min_len = min(min_len, int(train_df.groupby("unique_id").size().min()))
-        return int(min_len)
-
-    def fit_predict(self, task: fev.Task) -> tuple[list[datasets.DatasetDict], float, float, dict]:
-        """Fit and predict on all windows of a task.
-
-        Returns:
-            predictions: List of DatasetDict, one per evaluation window
-            training_time: Total time spent on fitting including HPO (seconds)
-            inference_time: Total time spent on predicting (seconds)
-            extra_info: Dictionary with model metadata
-        """
-        task.load_full_dataset()
-
-        min_series_len = self._compute_global_min_series_len(task)
-
-        first_window = task.get_window(0)
-        train_df, _ = self._prepare_data(first_window, task)
-        hpo_train_df = self._filter_series_for_hpo(train_df, task.horizon)
-
-        tuned_config = None
-        hpo_time = 0.0
-
-        if hpo_train_df is not None:
-            series_lengths = hpo_train_df.groupby("unique_id").size()
-            effective_series_len = int(series_lengths.median()) - self.n_windows * task.horizon
-            lags = self._get_lags(task.freq, effective_series_len, task.seasonality)
-            date_features = self._get_date_features(task.freq)
-
-            try:
-                with warnings.catch_warnings():
-                    warnings.simplefilter("ignore")
-                    start_time = time.monotonic()
-                    tuned_config = self._run_hpo(hpo_train_df, task, lags, date_features, min_series_len)
-                    hpo_time = time.monotonic() - start_time
-            except Exception:
-                pass
-
-        predictions = []
-        total_training_time = hpo_time
-        total_inference_time = 0.0
-
-        for window in task.iter_windows():
-            preds, train_time, infer_time = self._predict_window(window, task, tuned_config)
-            predictions.append(preds)
-            total_training_time += train_time
-            total_inference_time += infer_time
-
-        return predictions, total_training_time, total_inference_time, {}
-
-
-if __name__ == "__main__":
-    # Configuration
-    use_auto = True  # Set to False for fixed preprocessing
-    model_name = "lightgbm"  # "lightgbm" or "catboost"
-    num_tasks = None  # Set to small number for testing, None for full benchmark
-
-    benchmark = fev.Benchmark.from_yaml(
-        "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/fev_bench/tasks.yaml"
-    )
-
-    if use_auto:
-        model = MLForecastAutoModel(regressor=model_name)
-    else:
-        model = MLForecastModel(regressor=model_name)
-
-    summaries = []
-    for task in tqdm(benchmark.tasks[:num_tasks]):
-        predictions, training_time, inference_time, extra_info = model.fit_predict(task)
-        evaluation_summary = task.evaluation_summary(
-            predictions,
-            model_name=model_name,
-            training_time_s=training_time,
-            inference_time_s=inference_time,
-            extra_info=extra_info,
-        )
-        print(evaluation_summary)
-        summaries.append(evaluation_summary)
-
-    summary_df = pd.DataFrame(summaries)
-    print(summary_df)
-    summary_df.to_csv(f"{model_name}.csv", index=False)
diff --git a/examples/mlforecast/requirements.txt b/examples/mlforecast/requirements.txt
deleted file mode 100644
index d0970b9..0000000
--- a/examples/mlforecast/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-mlforecast==0.14.0
-optuna==4.6.0
-lightgbm==4.6.0
-autogluon.timeseries==1.5.0
-catboost==1.2.8
diff --git a/examples/moirai/evaluate_model.py b/examples/moirai/evaluate_model.py
deleted file mode 100644
index 0df54de..0000000
--- a/examples/moirai/evaluate_model.py
+++ /dev/null
@@ -1,93 +0,0 @@
-import logging
-import time
-import warnings
-
-import datasets
-import numpy as np
-import pandas as pd
-import torch
-from uni2ts.model.moirai2 import Moirai2Forecast, Moirai2Module
-
-import fev
-
-datasets.disable_progress_bars()
-
-
-def predict_with_model(
-    task: fev.Task,
-    model_name: str = "Salesforce/moirai-2.0-R-small",
-    context_length: int = 500,
-    batch_size: int = 128,
-    device: str = "cpu",
-    seed: int = 123,
-) -> tuple[list[datasets.DatasetDict], float, dict]:
-    torch.manual_seed(seed)
-    # Disable GluonTS warnings when accessing forecast.mean
-    gts_logger = logging.getLogger("gluonts")
-    gts_logger.setLevel(100)
-
-    model = Moirai2Forecast(
-        module=Moirai2Module.from_pretrained(model_name).to(device),
-        prediction_length=task.horizon,
-        context_length=context_length,
-        target_dim=1,
-        feat_dynamic_real_dim=0,
-        past_feat_dynamic_real_dim=0,
-    )
-    predictor = model.create_predictor(batch_size=batch_size)
-
-    inference_time = 0.0
-    predictions_per_window = []
-    for window in task.iter_windows():
-        _, prediction_dataset = fev.convert_input_data(window, adapter="gluonts", as_univariate=True)
-        start_time = time.monotonic()
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore", RuntimeWarning)
-            forecasts = list(predictor.predict(prediction_dataset))
-        inference_time += time.monotonic() - start_time
-
-        predictions_dict = {"predictions": np.stack([f.mean for f in forecasts])}
-        for q in task.quantile_levels:
-            predictions_dict[str(q)] = np.stack([f.quantile(q) for f in forecasts])
-        predictions_per_window.append(
-            fev.utils.combine_univariate_predictions_to_multivariate(
-                datasets.Dataset.from_dict(predictions_dict), target_columns=task.target_columns
-            )
-        )
-
-    extra_info = {
-        "model_config": {
-            "context_length": context_length,
-            "model_name": model_name,
-            "batch_size": batch_size,
-            "device": device,
-            "seed": seed,
-        }
-    }
-
-    return predictions_per_window, inference_time, extra_info
-
-
-if __name__ == "__main__":
-    model_name = "Salesforce/moirai-2.0-R-small"
-    num_tasks = 2  # replace with `num_tasks = None` to run on all tasks
-
-    benchmark = fev.Benchmark.from_yaml(
-        "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/tasks.yaml"
-    )
-    summaries = []
-    for task in benchmark.tasks[:num_tasks]:
-        predictions, inference_time, extra_info = predict_with_model(task, model_name=model_name)
-        evaluation_summary = task.evaluation_summary(
-            predictions,
-            model_name=model_name,
-            inference_time_s=inference_time,
-            extra_info=extra_info,
-        )
-        print(evaluation_summary)
-        summaries.append(evaluation_summary)
-
-    # Show and save the results
-    summary_df = pd.DataFrame(summaries)
-    print(summary_df)
-    summary_df.to_csv("moirai-2.0.csv", index=False)
diff --git a/examples/moirai/requirements.txt b/examples/moirai/requirements.txt
deleted file mode 100644
index 73dd6e2..0000000
--- a/examples/moirai/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-uni2ts @ git+https://github.com/SalesforceAIResearch/uni2ts@b472ef6dcf770b96a1b04bd98cd11e893058573a
-torch<2.5.0
diff --git a/examples/seasonal_naive/evaluate_model.py b/examples/seasonal_naive/evaluate_model.py
deleted file mode 100644
index 8346d06..0000000
--- a/examples/seasonal_naive/evaluate_model.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import time
-
-import datasets
-import numpy as np
-import pandas as pd
-from gluonts.model.seasonal_naive import SeasonalNaivePredictor
-
-import fev
-
-
-def predict_with_model(task: fev.Task) -> tuple[list[datasets.DatasetDict], float, dict]:
-    predictor = SeasonalNaivePredictor(prediction_length=task.horizon, season_length=task.seasonality)
-
-    inference_time = 0.0
-    predictions_per_window = []
-    for window in task.iter_windows(trust_remote_code=True):
-        _, prediction_dataset = fev.convert_input_data(window, adapter="gluonts", as_univariate=True)
-        start_time = time.monotonic()
-        forecast = np.stack([f.samples for f in predictor.predict(prediction_dataset)]).squeeze(
-            1
-        )  # [num_items, horizon]
-        inference_time += time.monotonic() - start_time
-
-        predictions_dict = {"predictions": forecast}
-        for q in task.quantile_levels:
-            predictions_dict[str(q)] = forecast
-
-        predictions_per_window.append(
-            fev.combine_univariate_predictions_to_multivariate(
-                datasets.Dataset.from_dict(predictions_dict), target_columns=task.target_columns
-            )
-        )
-
-    return predictions_per_window, inference_time, {}
-
-
-if __name__ == "__main__":
-    model_name = "seasonal_naive"
-    num_tasks = 2  # replace with `num_tasks = None` to run on all tasks
-
-    benchmark = fev.Benchmark.from_yaml(
-        "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/tasks.yaml"
-    )
-    summaries = []
-    for task in benchmark.tasks[:num_tasks]:
-        predictions, inference_time, extra_info = predict_with_model(task)
-        evaluation_summary = task.evaluation_summary(
-            predictions,
-            model_name=model_name,
-            inference_time_s=inference_time,
-            extra_info=extra_info,
-        )
-        print(evaluation_summary)
-        summaries.append(evaluation_summary)
-
-    # Show and save the results
-    summary_df = pd.DataFrame(summaries)
-    print(summary_df)
-    summary_df.to_csv(f"{model_name}.csv", index=False)
diff --git a/examples/seasonal_naive/requirements.txt b/examples/seasonal_naive/requirements.txt
deleted file mode 100644
index 9b25113..0000000
--- a/examples/seasonal_naive/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-gluonts==0.16.0
diff --git a/examples/statsforecast/evaluate_model.py b/examples/statsforecast/evaluate_model.py
deleted file mode 100644
index 1294b5a..0000000
--- a/examples/statsforecast/evaluate_model.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import inspect
-import os
-import time
-import warnings
-from typing import Type
-
-import datasets
-import pandas as pd
-from statsforecast import StatsForecast
-from statsforecast.models import AutoARIMA, AutoCES, AutoETS, AutoTheta, Naive, SeasonalNaive
-
-import fev
-
-datasets.disable_progress_bars()
-
-
-model_name_to_class = {
-    "naive": Naive,
-    "seasonal_naive": SeasonalNaive,
-    "auto_arima": AutoARIMA,
-    "auto_ces": AutoCES,
-    "auto_ets": AutoETS,
-    "auto_theta": AutoTheta,
-}
-
-
-def filter_kwargs(cls: Type, kwargs: dict) -> dict:
-    """Remove kwargs that are not expected by the given class object."""
-    sig = inspect.signature(cls.__init__)
-    filtered_kwargs = {k: v for k, v in kwargs.items() if k in sig.parameters}
-    return filtered_kwargs
-
-
-def quantile_to_level(q: float) -> str:
-    """Convert a numeric quantile value to the level suffix used by StatsForecast."""
-    if q < 0.5:
-        prefix = "-lo-"
-        level = 100 - 200 * q
-    else:
-        prefix = "-hi-"
-        level = 200 * q - 100
-    return prefix + str(int(level))
-
-
-def predict_with_model(
-    task: fev.Task,
-    model_name: str = "seasonal_naive",
-    model_kwargs: dict | None = None,
-    n_jobs: int = -1,
-    context_length: int | None = 2500,
-) -> tuple[list[datasets.DatasetDict], float, dict]:
-    default_model_kwargs = {"season_length": task.seasonality}
-    if model_kwargs is not None:
-        default_model_kwargs.update(model_kwargs)
-
-    model_cls = model_name_to_class[model_name]
-    model = model_cls(**filter_kwargs(model_cls, default_model_kwargs))
-
-    sf = StatsForecast(
-        models=[model],
-        freq="D",  # we use a placeholder freq since we anyway ignore the forecast timestamps
-        n_jobs=n_jobs,
-        fallback_model=SeasonalNaive(season_length=default_model_kwargs["season_length"]),
-        verbose=True,
-    )
-    levels = sorted(set([round(abs(q - 0.5) * 200) for q in task.quantile_levels]))
-
-    inference_time = 0.0
-    predictions_per_window = []
-    for window in task.iter_windows(trust_remote_code=True):
-        past_df, *_ = fev.convert_input_data(window, "nixtla", as_univariate=True)
-        # Forward fill NaNs + zero-fill leading NaNs
-        past_df = past_df.set_index("unique_id").groupby("unique_id").ffill().reset_index().fillna(0.0)
-        if context_length is not None:
-            past_df = past_df.groupby("unique_id").tail(context_length).reset_index(drop=True)
-
-        start_time = time.monotonic()
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            os.environ["PYTHONWARNINGS"] = "ignore"
-            forecast_df = sf.forecast(df=past_df, h=task.horizon, level=levels)
-        inference_time += time.monotonic() - start_time
-
-        forecast_df["predictions"] = forecast_df[str(model)]
-        for q in task.quantile_levels:
-            forecast_df[str(q)] = forecast_df[str(model) + quantile_to_level(q)]
-
-        selected_columns = ["predictions"] + [str(q) for q in task.quantile_levels]
-        predictions_list = []
-        for _, forecast in forecast_df.groupby("unique_id"):
-            predictions_list.append(forecast[selected_columns].to_dict("list"))
-        predictions_per_window.append(
-            fev.combine_univariate_predictions_to_multivariate(
-                datasets.Dataset.from_list(predictions_list), target_columns=task.target_columns
-            )
-        )
-
-    extra_info = {"model_config": {"context_length": context_length, **default_model_kwargs}}
-
-    return predictions_per_window, inference_time, extra_info
-
-
-if __name__ == "__main__":
-    model_name = "seasonal_naive"
-    num_tasks = 2  # replace with `num_tasks = None` to run on all tasks
-
-    benchmark = fev.Benchmark.from_yaml(
-        "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/tasks.yaml"
-    )
-    summaries = []
-    for task in benchmark.tasks[:num_tasks]:
-        predictions, inference_time, extra_info = predict_with_model(task, model_name=model_name)
-        evaluation_summary = task.evaluation_summary(
-            predictions,
-            model_name=model_name,
-            inference_time_s=inference_time,
-            extra_info=extra_info,
-        )
-        print(evaluation_summary)
-        summaries.append(evaluation_summary)
-
-    # Show and save the results
-    summary_df = pd.DataFrame(summaries)
-    print(summary_df)
-    summary_df.to_csv(f"{model_name}.csv", index=False)
diff --git a/examples/statsforecast/requirements.txt b/examples/statsforecast/requirements.txt
deleted file mode 100644
index 7eaaabf..0000000
--- a/examples/statsforecast/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-statsforecast==2.0.0
diff --git a/examples/timecopilot/.python-version b/examples/timecopilot/.python-version
deleted file mode 100644
index 2c07333..0000000
--- a/examples/timecopilot/.python-version
+++ /dev/null
@@ -1 +0,0 @@
-3.11
diff --git a/examples/timecopilot/README.md b/examples/timecopilot/README.md
deleted file mode 100644
index 7c16647..0000000
--- a/examples/timecopilot/README.md
+++ /dev/null
@@ -1,69 +0,0 @@
-# TimeCopilot `fev` Experiments
-
-This project demonstrates the evaluation of a foundation model ensemble built using the [TimeCopilot](https://timecopilot.dev) library on the [fev](https://github.com/autogluon/fev/) benchmark.
-
-TimeCopilot is an open‑source AI agent for time series forecasting that provides a unified interface to multiple forecasting approaches, from foundation models to classical statistical, machine learning, and deep learning methods, along with built‑in ensemble capabilities for robust and explainable forecasting.
-
-## Model Description
-
-This ensemble leverages [**TimeCopilot's MedianEnsemble**](https://timecopilot.dev/api/models/ensembles/#timecopilot.models.ensembles.median.MedianEnsemble) feature, which combines two state-of-the-art foundation models:
-
-- [**TiRex** (NX-AI)](https://timecopilot.dev/api/models/foundation/models/#timecopilot.models.foundation.tirex.TiRex)
-- [**Chronos** (AWS AI Labs)](https://timecopilot.dev/api/models/foundation/models/#timecopilot.models.foundation.chronos.Chronos)
-
-## Setup
-
-### Prerequisites
-- Python 3.11+
-- [uv](https://docs.astral.sh/uv/) package manager
-- AWS CLI configured (for distributed evaluation)
-- [Modal](https://modal.com/) account (for distributed evaluation)
-
-### Installation
-
-```bash
-# Install dependencies
-uv pip sync requirements.txt
-```
-
-## Evaluation Methods
-
-### 1. Local Evaluation
-
-Run evaluation sequentially (locally):
-
-```bash
-uv run -m src.evaluate_model --num-tasks 2
-```
-
-Remove `--num-tasks` parameter to run on all tasks. Results are saved to `timecopilot.csv` in `fev` format.
-
-### 2. Distributed Evaluation (Recommended)
-
-#### 2.1 Evaluate ensemble
-
-Evaluate all dataset configurations in parallel using [modal](https://modal.com/):
-
-```bash
-# Run distributed evaluation on Modal cloud
-uv run modal run --detach -m src.evaluate_model_modal
-```
-
-This creates one GPU job per dataset configuration, significantly reducing evaluation time.
-
-**Infrastructure:**
-- **GPU**: A10G per job
-- **CPU**: 8 cores per job  
-- **Timeout**: 3 hours per job
-- **Storage**: S3 bucket for data and results
-
-#### 2.2 Collect Results
-
-Download and consolidate results from distributed evaluation:
-
-```bash
-# Download all results from S3 and create consolidated CSV
-uv run python -m src.download_results
-```
-
-Results are saved to `timecopilot.csv` in `fev` format.
diff --git a/examples/timecopilot/pyproject.toml b/examples/timecopilot/pyproject.toml
deleted file mode 100644
index 874928d..0000000
--- a/examples/timecopilot/pyproject.toml
+++ /dev/null
@@ -1,17 +0,0 @@
-[project]
-dependencies = [
-  "fev>=0.6.0",
-  "modal>=1.0.5",
-  "pyarrow<=20.0.0",
-  "s3fs>=2023.12.1",
-  "timecopilot>=0.0.17",
-  "typer>=0.16.0",
-]
-description = "TimeCopilot fev experiments"
-name = "timecopilot-fev"
-readme = "README.md"
-requires-python = ">=3.11"
-version = "0.1.0"
-
-[tool.uv]
-override-dependencies = ["datasets[s3]>=2.15,<4.0"]
diff --git a/examples/timecopilot/requirements.txt b/examples/timecopilot/requirements.txt
deleted file mode 100644
index 708fe5c..0000000
--- a/examples/timecopilot/requirements.txt
+++ /dev/null
@@ -1,1458 +0,0 @@
-# This file was autogenerated by uv via the following command:
-#    uv export --format requirements.txt --no-hashes
-absl-py==2.3.1
-    # via
-    #   einshape
-    #   tensorboard
-    #   timecopilot-timesfm
-accelerate==1.9.0
-    # via timecopilot-chronos-forecasting
-adagio==0.2.6
-    # via fugue
-ag-ui-protocol==0.1.8
-    # via pydantic-ai-slim
-aioboto3==15.0.0
-    # via timecopilot-toto
-aiobotocore==2.23.0
-    # via
-    #   aioboto3
-    #   s3fs
-aiofiles==24.1.0
-    # via aioboto3
-aiohappyeyeballs==2.6.1
-    # via aiohttp
-aiohttp==3.12.15
-    # via
-    #   aiobotocore
-    #   fsspec
-    #   huggingface-hub
-    #   modal
-    #   s3fs
-aioitertools==0.12.0
-    # via aiobotocore
-aiosignal==1.4.0
-    # via aiohttp
-alembic==1.16.5
-    # via optuna
-annotated-types==0.7.0
-    # via
-    #   nixtla
-    #   pydantic
-anthropic==0.60.0
-    # via pydantic-ai-slim
-antlr4-python3-runtime==4.9.3
-    # via
-    #   hydra-core
-    #   omegaconf
-antropy==0.1.9
-    # via tsfeatures
-anyio==4.9.0
-    # via
-    #   anthropic
-    #   google-genai
-    #   groq
-    #   httpx
-    #   jupyter-server
-    #   mcp
-    #   openai
-    #   pydantic-evals
-    #   sse-starlette
-    #   starlette
-    #   watchfiles
-appdirs==1.4.4
-    # via fs
-appnope==0.1.4 ; sys_platform == 'darwin'
-    # via ipykernel
-arch==7.2.0
-    # via tsfeatures
-argcomplete==3.6.2
-    # via pydantic-ai-slim
-argon2-cffi==25.1.0
-    # via jupyter-server
-argon2-cffi-bindings==21.2.0 ; python_full_version >= '3.14'
-    # via argon2-cffi
-argon2-cffi-bindings==25.1.0 ; python_full_version < '3.14'
-    # via argon2-cffi
-arrow==1.3.0
-    # via isoduration
-asttokens==3.0.0
-    # via stack-data
-async-lru==2.0.5
-    # via jupyterlab
-attrs==25.3.0
-    # via
-    #   aiohttp
-    #   jsonschema
-    #   referencing
-    #   sigtools
-babel==2.17.0
-    # via jupyterlab-server
-beartype==0.21.0
-    # via timecopilot-toto
-beautifulsoup4==4.13.4
-    # via nbconvert
-black==25.1.0
-    # via timecopilot-toto
-bleach==6.2.0
-    # via nbconvert
-blinker==1.9.0
-    # via flask
-boto3==1.38.27
-    # via
-    #   aiobotocore
-    #   pydantic-ai-slim
-    #   timecopilot-toto
-botocore==1.38.27
-    # via
-    #   aiobotocore
-    #   boto3
-    #   s3transfer
-cachetools==5.5.2
-    # via google-auth
-certifi==2025.7.14
-    # via
-    #   httpcore
-    #   httpx
-    #   modal
-    #   requests
-    #   sentry-sdk
-cffi==1.17.1
-    # via
-    #   argon2-cffi-bindings
-    #   pyzmq
-    #   zstandard
-charset-normalizer==3.4.2
-    # via
-    #   reportlab
-    #   requests
-click==8.1.8
-    # via
-    #   black
-    #   flask
-    #   modal
-    #   ray
-    #   typer
-    #   uvicorn
-    #   wandb
-cloudpickle==3.1.1
-    # via
-    #   mlforecast
-    #   statsforecast
-cmdstanpy==1.2.5
-    # via prophet
-cohere==5.16.1 ; sys_platform != 'emscripten'
-    # via pydantic-ai-slim
-colorama==0.4.6
-    # via
-    #   click
-    #   colorlog
-    #   griffe
-    #   ipython
-    #   pytest
-    #   tqdm
-colorlog==6.9.0
-    # via optuna
-comm==0.2.3
-    # via
-    #   ipykernel
-    #   ipywidgets
-contourpy==1.3.3
-    # via matplotlib
-coreforecast==0.0.16
-    # via
-    #   mlforecast
-    #   neuralforecast
-    #   statsforecast
-cycler==0.12.1
-    # via matplotlib
-dacite==1.9.2
-    # via
-    #   mlstm-kernels
-    #   timecopilot-tirex
-    #   xlstm
-dash==3.2.0
-    # via plotly-resampler
-datasets==3.6.0
-    # via
-    #   fev
-    #   tabpfn-time-series
-    #   timecopilot-toto
-    #   timecopilot-uni2ts
-debugpy==1.8.15
-    # via ipykernel
-decorator==5.2.1
-    # via ipython
-defusedxml==0.7.1
-    # via nbconvert
-dill==0.3.8
-    # via
-    #   datasets
-    #   multiprocess
-    #   timecopilot-toto
-distro==1.9.0
-    # via
-    #   anthropic
-    #   groq
-    #   openai
-einops==0.7.0
-    # via
-    #   mlstm-kernels
-    #   rotary-embedding-torch
-    #   tabpfn
-    #   timecopilot-tirex
-    #   timecopilot-toto
-    #   timecopilot-uni2ts
-    #   xlstm
-einshape==1.0
-    # via timecopilot-timesfm
-eval-type-backport==0.2.2
-    # via
-    #   mistralai
-    #   pydantic-ai-slim
-executing==2.2.0
-    # via
-    #   logfire
-    #   stack-data
-fastavro==1.11.1 ; sys_platform != 'emscripten'
-    # via cohere
-fastjsonschema==2.21.1
-    # via nbformat
-fev==0.5.0
-    # via timecopilot-fev
-filelock==3.18.0
-    # via
-    #   datasets
-    #   huggingface-hub
-    #   ray
-    #   torch
-    #   transformers
-fire==0.7.0
-    # via timecopilot
-flask==3.1.2
-    # via dash
-fonttools==4.59.0
-    # via matplotlib
-fqdn==1.5.1
-    # via jsonschema
-frozenlist==1.7.0
-    # via
-    #   aiohttp
-    #   aiosignal
-fs==2.4.16
-    # via triad
-fsspec==2025.3.0
-    # via
-    #   datasets
-    #   huggingface-hub
-    #   lightning
-    #   mlforecast
-    #   neuralforecast
-    #   pytorch-lightning
-    #   ray
-    #   s3fs
-    #   torch
-    #   triad
-ftfy==6.3.1
-    # via xlstm
-fugue==0.9.1
-    # via statsforecast
-gitdb==4.0.12
-    # via gitpython
-gitpython==3.1.45
-    # via wandb
-gluonts==0.16.2
-    # via
-    #   tabpfn-time-series
-    #   timecopilot
-    #   timecopilot-toto
-    #   timecopilot-uni2ts
-google-auth==2.40.3
-    # via
-    #   google-genai
-    #   pydantic-ai-slim
-google-genai==1.28.0
-    # via pydantic-ai-slim
-googleapis-common-protos==1.70.0
-    # via opentelemetry-exporter-otlp-proto-http
-greenlet==3.2.4 ; (python_full_version < '3.14' and platform_machine == 'AMD64') or (python_full_version < '3.14' and platform_machine == 'WIN32') or (python_full_version < '3.14' and platform_machine == 'aarch64') or (python_full_version < '3.14' and platform_machine == 'amd64') or (python_full_version < '3.14' and platform_machine == 'ppc64le') or (python_full_version < '3.14' and platform_machine == 'win32') or (python_full_version < '3.14' and platform_machine == 'x86_64')
-    # via sqlalchemy
-griffe==1.9.0
-    # via pydantic-ai-slim
-groq==0.30.0
-    # via pydantic-ai-slim
-grpcio==1.74.0 ; python_full_version < '3.14'
-    # via tensorboard
-grpclib==0.4.8
-    # via modal
-h11==0.16.0
-    # via
-    #   httpcore
-    #   uvicorn
-h2==4.2.0
-    # via grpclib
-hf-xet==1.1.5 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'
-    # via huggingface-hub
-holidays==0.77
-    # via prophet
-hpack==4.1.0
-    # via h2
-httpcore==1.0.9
-    # via httpx
-httpx==0.28.1
-    # via
-    #   anthropic
-    #   cohere
-    #   google-genai
-    #   groq
-    #   jupyterlab
-    #   mcp
-    #   mistralai
-    #   nixtla
-    #   openai
-    #   pydantic-ai-slim
-    #   pydantic-graph
-    #   tabpfn-client
-httpx-sse==0.4.0
-    # via
-    #   cohere
-    #   mcp
-huggingface-hub==0.34.3
-    # via
-    #   accelerate
-    #   datasets
-    #   pydantic-ai-slim
-    #   tabpfn
-    #   timecopilot-timesfm
-    #   timecopilot-tirex
-    #   timecopilot-uni2ts
-    #   tokenizers
-    #   transformers
-    #   xlstm
-hydra-core==1.3.2 ; python_full_version < '3.14'
-    # via timecopilot-uni2ts
-hyperframe==6.1.0
-    # via h2
-idna==3.10
-    # via
-    #   anyio
-    #   httpx
-    #   jsonschema
-    #   requests
-    #   yarl
-importlib-metadata==8.7.0
-    # via
-    #   dash
-    #   opentelemetry-api
-importlib-resources==6.5.2
-    # via prophet
-iniconfig==2.1.0
-    # via pytest
-inquirerpy==0.3.4
-    # via huggingface-hub
-ipykernel==6.30.0
-    # via
-    #   jupyter
-    #   jupyter-console
-    #   jupyterlab
-    #   mlstm-kernels
-    #   xlstm
-ipython==9.4.0
-    # via
-    #   ipykernel
-    #   ipywidgets
-    #   jupyter-console
-ipython-pygments-lexers==1.1.1
-    # via ipython
-ipywidgets==8.1.7
-    # via jupyter
-isoduration==20.11.0
-    # via jsonschema
-isort==6.0.1
-    # via timecopilot-toto
-itsdangerous==2.2.0
-    # via flask
-jax==0.7.0 ; python_full_version < '3.14'
-    # via timecopilot-uni2ts
-jaxlib==0.7.0 ; python_full_version < '3.14'
-    # via jax
-jaxtyping==0.2.38
-    # via
-    #   timecopilot-toto
-    #   timecopilot-uni2ts
-jedi==0.19.2
-    # via ipython
-jinja2==3.1.6
-    # via
-    #   flask
-    #   jupyter-server
-    #   jupyterlab
-    #   jupyterlab-server
-    #   nbconvert
-    #   torch
-jiter==0.10.0
-    # via
-    #   anthropic
-    #   openai
-jmespath==1.0.1
-    # via
-    #   aiobotocore
-    #   boto3
-    #   botocore
-joblib==1.5.1
-    # via scikit-learn
-joypy==0.2.6
-    # via xlstm
-json5==0.12.0
-    # via jupyterlab-server
-jsonpointer==3.0.0
-    # via jsonschema
-jsonschema==4.25.0
-    # via
-    #   jupyter-events
-    #   jupyterlab-server
-    #   mcp
-    #   nbformat
-    #   ray
-jsonschema-specifications==2025.4.1
-    # via jsonschema
-jupyter==1.1.1
-    # via timecopilot-toto
-jupyter-client==8.6.3
-    # via
-    #   ipykernel
-    #   jupyter-console
-    #   jupyter-server
-    #   nbclient
-jupyter-console==6.6.3
-    # via jupyter
-jupyter-core==5.8.1
-    # via
-    #   ipykernel
-    #   jupyter-client
-    #   jupyter-console
-    #   jupyter-server
-    #   jupyterlab
-    #   nbclient
-    #   nbconvert
-    #   nbformat
-jupyter-events==0.12.0
-    # via jupyter-server
-jupyter-lsp==2.2.6
-    # via jupyterlab
-jupyter-server==2.16.0
-    # via
-    #   jupyter-lsp
-    #   jupyterlab
-    #   jupyterlab-server
-    #   notebook
-    #   notebook-shim
-jupyter-server-terminals==0.5.3
-    # via jupyter-server
-jupyterlab==4.4.5
-    # via
-    #   jupyter
-    #   notebook
-jupyterlab-pygments==0.3.0
-    # via nbconvert
-jupyterlab-server==2.27.3
-    # via
-    #   jupyterlab
-    #   notebook
-jupyterlab-widgets==3.0.15
-    # via ipywidgets
-kiwisolver==1.4.8
-    # via matplotlib
-lark==1.2.2
-    # via rfc3987-syntax
-lightgbm==4.6.0
-    # via timecopilot
-lightning==2.4.0
-    # via
-    #   gluonts
-    #   timecopilot-tirex
-    #   timecopilot-uni2ts
-lightning-utilities==0.15.0
-    # via
-    #   lightning
-    #   pytorch-lightning
-    #   torchmetrics
-llvmlite==0.44.0
-    # via numba
-logfire==4.0.0
-    # via pydantic-ai
-logfire-api==4.0.0
-    # via
-    #   pydantic-evals
-    #   pydantic-graph
-mako==1.3.10
-    # via alembic
-markdown==3.8.2 ; python_full_version < '3.14'
-    # via tensorboard
-markdown-it-py==3.0.0
-    # via rich
-markupsafe==3.0.2
-    # via
-    #   flask
-    #   jinja2
-    #   mako
-    #   nbconvert
-    #   werkzeug
-matplotlib==3.10.3
-    # via
-    #   joypy
-    #   mlstm-kernels
-    #   pandas
-    #   prophet
-    #   seaborn
-    #   timecopilot-toto
-matplotlib-inline==0.1.7
-    # via
-    #   ipykernel
-    #   ipython
-mcp==1.12.2
-    # via pydantic-ai-slim
-mdurl==0.1.2
-    # via markdown-it-py
-mistralai==1.9.3
-    # via pydantic-ai-slim
-mistune==3.1.3
-    # via nbconvert
-ml-dtypes==0.5.3 ; python_full_version < '3.14'
-    # via
-    #   jax
-    #   jaxlib
-mlforecast==1.0.2
-    # via timecopilot
-mlstm-kernels==2.0.1
-    # via xlstm
-modal==1.1.0
-    # via timecopilot-fev
-mpmath==1.3.0
-    # via sympy
-msgpack==1.1.1
-    # via ray
-multidict==6.6.3
-    # via
-    #   aiobotocore
-    #   aiohttp
-    #   grpclib
-    #   yarl
-multiprocess==0.70.16
-    # via
-    #   datasets
-    #   timecopilot-uni2ts
-mypy==1.17.0
-    # via timecopilot-toto
-mypy-extensions==1.1.0
-    # via
-    #   black
-    #   mypy
-narwhals==2.3.0
-    # via plotly
-nbclient==0.10.2
-    # via nbconvert
-nbconvert==7.16.6
-    # via
-    #   jupyter
-    #   jupyter-server
-nbformat==5.10.4
-    # via
-    #   jupyter-server
-    #   nbclient
-    #   nbconvert
-nest-asyncio==1.6.0
-    # via
-    #   dash
-    #   ipykernel
-networkx==3.5
-    # via torch
-neuralforecast==3.0.2
-    # via timecopilot
-ninja==1.11.1.4
-    # via
-    #   timecopilot-tirex
-    #   xlstm
-nixtla==0.6.6
-    # via timecopilot
-notebook==7.4.4
-    # via jupyter
-notebook-shim==0.2.4
-    # via
-    #   jupyterlab
-    #   notebook
-numba==0.61.2
-    # via
-    #   antropy
-    #   statsforecast
-numpy==1.26.4 ; python_full_version < '3.13'
-    # via
-    #   accelerate
-    #   antropy
-    #   arch
-    #   cmdstanpy
-    #   contourpy
-    #   coreforecast
-    #   datasets
-    #   einshape
-    #   fev
-    #   gluonts
-    #   jax
-    #   jaxlib
-    #   joypy
-    #   lightgbm
-    #   matplotlib
-    #   ml-dtypes
-    #   mlstm-kernels
-    #   neuralforecast
-    #   numba
-    #   optuna
-    #   pandas
-    #   patsy
-    #   plotly-resampler
-    #   prophet
-    #   scikit-learn
-    #   scipy
-    #   seaborn
-    #   stanio
-    #   statsforecast
-    #   statsmodels
-    #   tensorboard
-    #   tensorboardx
-    #   timecopilot-timesfm
-    #   timecopilot-tirex
-    #   timecopilot-uni2ts
-    #   torchmetrics
-    #   torchvision
-    #   transformers
-    #   triad
-    #   tsdownsample
-    #   utilsforecast
-    #   xlstm
-numpy==2.1.3 ; python_full_version >= '3.13'
-    # via
-    #   accelerate
-    #   antropy
-    #   arch
-    #   cmdstanpy
-    #   contourpy
-    #   coreforecast
-    #   datasets
-    #   einshape
-    #   fev
-    #   gluonts
-    #   jax
-    #   jaxlib
-    #   joypy
-    #   lightgbm
-    #   matplotlib
-    #   ml-dtypes
-    #   mlstm-kernels
-    #   neuralforecast
-    #   numba
-    #   optuna
-    #   pandas
-    #   patsy
-    #   plotly-resampler
-    #   prophet
-    #   scikit-learn
-    #   scipy
-    #   seaborn
-    #   stanio
-    #   statsforecast
-    #   statsmodels
-    #   tensorboard
-    #   tensorboardx
-    #   timecopilot-timesfm
-    #   timecopilot-tirex
-    #   timecopilot-uni2ts
-    #   torchmetrics
-    #   torchvision
-    #   transformers
-    #   triad
-    #   tsdownsample
-    #   utilsforecast
-    #   xlstm
-nvidia-cublas-cu12==12.4.5.8 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via
-    #   nvidia-cudnn-cu12
-    #   nvidia-cusolver-cu12
-    #   torch
-nvidia-cuda-cupti-cu12==12.4.127 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cuda-nvrtc-cu12==12.4.127 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cuda-runtime-cu12==12.4.127 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cudnn-cu12==9.1.0.70 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cufft-cu12==11.2.1.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-curand-cu12==10.3.5.147 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cusolver-cu12==11.6.1.9 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cusparse-cu12==12.3.1.170 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via
-    #   nvidia-cusolver-cu12
-    #   torch
-nvidia-cusparselt-cu12==0.6.2 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-nccl-cu12==2.21.5 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-nvjitlink-cu12==12.4.127 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via
-    #   nvidia-cufft-cu12
-    #   nvidia-cusolver-cu12
-    #   nvidia-cusparse-cu12
-    #   torch
-nvidia-nvtx-cu12==12.4.127 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-omegaconf==2.3.0
-    # via
-    #   hydra-core
-    #   mlstm-kernels
-    #   tabpfn-client
-    #   xlstm
-openai==1.102.0
-    # via
-    #   pydantic-ai-slim
-    #   timecopilot
-opentelemetry-api==1.35.0
-    # via
-    #   opentelemetry-exporter-otlp-proto-http
-    #   opentelemetry-instrumentation
-    #   opentelemetry-sdk
-    #   opentelemetry-semantic-conventions
-    #   pydantic-ai-slim
-opentelemetry-exporter-otlp-proto-common==1.35.0
-    # via opentelemetry-exporter-otlp-proto-http
-opentelemetry-exporter-otlp-proto-http==1.35.0
-    # via logfire
-opentelemetry-instrumentation==0.56b0
-    # via logfire
-opentelemetry-proto==1.35.0
-    # via
-    #   opentelemetry-exporter-otlp-proto-common
-    #   opentelemetry-exporter-otlp-proto-http
-opentelemetry-sdk==1.35.0
-    # via
-    #   logfire
-    #   opentelemetry-exporter-otlp-proto-http
-opentelemetry-semantic-conventions==0.56b0
-    # via
-    #   opentelemetry-instrumentation
-    #   opentelemetry-sdk
-opt-einsum==3.4.0
-    # via
-    #   jax
-    #   xlstm
-optuna==4.5.0
-    # via
-    #   mlforecast
-    #   neuralforecast
-orjson==3.11.1
-    # via
-    #   nixtla
-    #   plotly-resampler
-    #   timecopilot-uni2ts
-overrides==7.7.0
-    # via jupyter-server
-packaging==24.2
-    # via
-    #   accelerate
-    #   black
-    #   datasets
-    #   huggingface-hub
-    #   hydra-core
-    #   ipykernel
-    #   jupyter-events
-    #   jupyter-server
-    #   jupyterlab
-    #   jupyterlab-server
-    #   lightning
-    #   lightning-utilities
-    #   matplotlib
-    #   nbconvert
-    #   opentelemetry-instrumentation
-    #   optuna
-    #   plotly
-    #   pytest
-    #   pytorch-lightning
-    #   ray
-    #   statsmodels
-    #   tensorboard
-    #   tensorboardx
-    #   torchmetrics
-    #   transformers
-    #   utilsforecast
-    #   wandb
-pandas==2.1.4 ; python_full_version < '3.13'
-    # via
-    #   arch
-    #   cmdstanpy
-    #   datasets
-    #   gluonts
-    #   joypy
-    #   mlforecast
-    #   neuralforecast
-    #   nixtla
-    #   plotly-resampler
-    #   prophet
-    #   ray
-    #   seaborn
-    #   statsforecast
-    #   statsmodels
-    #   tabpfn
-    #   tabpfn-client
-    #   tabpfn-time-series
-    #   timecopilot-timesfm
-    #   timecopilot-tirex
-    #   timecopilot-toto
-    #   triad
-    #   tsfeatures
-    #   utilsforecast
-pandas==2.3.1 ; python_full_version >= '3.13'
-    # via
-    #   arch
-    #   cmdstanpy
-    #   datasets
-    #   gluonts
-    #   joypy
-    #   mlforecast
-    #   neuralforecast
-    #   nixtla
-    #   plotly-resampler
-    #   prophet
-    #   ray
-    #   seaborn
-    #   statsforecast
-    #   statsmodels
-    #   timecopilot
-    #   timecopilot-timesfm
-    #   timecopilot-tirex
-    #   timecopilot-toto
-    #   triad
-    #   tsfeatures
-    #   utilsforecast
-pandocfilters==1.5.1
-    # via nbconvert
-parso==0.8.4
-    # via jedi
-password-strength==0.0.3.post2 ; python_full_version < '3.13'
-    # via tabpfn-client
-pathspec==0.12.1
-    # via
-    #   black
-    #   mypy
-patsy==1.0.1
-    # via statsmodels
-pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32'
-    # via ipython
-pfzy==0.3.4
-    # via inquirerpy
-pillow==11.3.0
-    # via
-    #   matplotlib
-    #   reportlab
-    #   tensorboard
-    #   torchvision
-platformdirs==4.3.8
-    # via
-    #   black
-    #   jupyter-core
-    #   wandb
-plotly==6.3.0
-    # via
-    #   dash
-    #   plotly-resampler
-    #   utilsforecast
-plotly-resampler==0.11.0
-    # via utilsforecast
-pluggy==1.6.0
-    # via pytest
-prometheus-client==0.22.1
-    # via jupyter-server
-prompt-toolkit==3.0.51
-    # via
-    #   inquirerpy
-    #   ipython
-    #   jupyter-console
-    #   pydantic-ai-slim
-propcache==0.3.2
-    # via
-    #   aiohttp
-    #   yarl
-prophet==1.1.7
-    # via timecopilot
-protobuf==6.31.1
-    # via
-    #   googleapis-common-protos
-    #   logfire
-    #   modal
-    #   opentelemetry-proto
-    #   ray
-    #   tensorboard
-    #   tensorboardx
-    #   wandb
-psutil==7.0.0
-    # via
-    #   accelerate
-    #   ipykernel
-ptyprocess==0.7.0 ; os_name != 'nt' or (sys_platform != 'emscripten' and sys_platform != 'win32')
-    # via
-    #   pexpect
-    #   terminado
-pure-eval==0.2.3
-    # via stack-data
-pyarrow==20.0.0
-    # via
-    #   datasets
-    #   ray
-    #   timecopilot-fev
-    #   triad
-pyasn1==0.6.1
-    # via
-    #   pyasn1-modules
-    #   rsa
-pyasn1-modules==0.4.2
-    # via google-auth
-pycparser==2.22
-    # via cffi
-pydantic==2.11.7
-    # via
-    #   ag-ui-protocol
-    #   anthropic
-    #   cohere
-    #   fev
-    #   gluonts
-    #   google-genai
-    #   groq
-    #   mcp
-    #   mistralai
-    #   nixtla
-    #   openai
-    #   pydantic-ai-slim
-    #   pydantic-evals
-    #   pydantic-graph
-    #   pydantic-settings
-    #   wandb
-pydantic-ai==0.4.10
-    # via timecopilot
-pydantic-ai-slim==0.4.10
-    # via
-    #   pydantic-ai
-    #   pydantic-evals
-pydantic-core==2.33.2
-    # via
-    #   cohere
-    #   pydantic
-pydantic-evals==0.4.10
-    # via pydantic-ai-slim
-pydantic-graph==0.4.10
-    # via pydantic-ai-slim
-pydantic-settings==2.10.1
-    # via mcp
-pygments==2.19.2
-    # via
-    #   ipython
-    #   ipython-pygments-lexers
-    #   jupyter-console
-    #   nbconvert
-    #   pytest
-    #   rich
-pyparsing==3.2.3
-    # via matplotlib
-pytest==8.4.1
-    # via
-    #   pytest-env
-    #   timecopilot-toto
-pytest-env==1.1.5
-    # via timecopilot-toto
-python-dateutil==2.9.0.post0
-    # via
-    #   aiobotocore
-    #   arrow
-    #   botocore
-    #   holidays
-    #   jupyter-client
-    #   matplotlib
-    #   mistralai
-    #   pandas
-python-dotenv==1.1.1
-    # via
-    #   pydantic-settings
-    #   tabpfn-time-series
-    #   timecopilot-uni2ts
-python-json-logger==3.3.0
-    # via jupyter-events
-python-multipart==0.0.20
-    # via mcp
-pytorch-lightning==2.4.0
-    # via
-    #   gluonts
-    #   lightning
-    #   neuralforecast
-pytz==2025.2
-    # via pandas
-pywin32==311 ; sys_platform == 'win32'
-    # via
-    #   jupyter-core
-    #   mcp
-pywinpty==2.0.15 ; os_name == 'nt' and sys_platform != 'linux'
-    # via
-    #   jupyter-server
-    #   jupyter-server-terminals
-    #   terminado
-pyyaml==6.0.2
-    # via
-    #   accelerate
-    #   datasets
-    #   huggingface-hub
-    #   jupyter-events
-    #   lightning
-    #   omegaconf
-    #   optuna
-    #   pydantic-evals
-    #   pytorch-lightning
-    #   ray
-    #   tabpfn-time-series
-    #   timecopilot-toto
-    #   transformers
-    #   wandb
-pyzmq==27.0.0
-    # via
-    #   ipykernel
-    #   jupyter-client
-    #   jupyter-console
-    #   jupyter-server
-ray==2.49.0
-    # via neuralforecast
-referencing==0.36.2
-    # via
-    #   jsonschema
-    #   jsonschema-specifications
-    #   jupyter-events
-regex==2025.7.34
-    # via transformers
-reportlab==4.4.3
-    # via xlstm
-requests==2.32.4
-    # via
-    #   cohere
-    #   dash
-    #   datasets
-    #   google-genai
-    #   huggingface-hub
-    #   jupyterlab-server
-    #   opentelemetry-exporter-otlp-proto-http
-    #   pydantic-ai-slim
-    #   ray
-    #   transformers
-    #   wandb
-retrying==1.4.2
-    # via dash
-rfc3339-validator==0.1.4
-    # via
-    #   jsonschema
-    #   jupyter-events
-rfc3986-validator==0.1.1
-    # via
-    #   jsonschema
-    #   jupyter-events
-rfc3987-syntax==1.1.0
-    # via jsonschema
-rich==14.1.0
-    # via
-    #   logfire
-    #   mlstm-kernels
-    #   modal
-    #   pydantic-ai-slim
-    #   pydantic-evals
-    #   typer
-    #   xlstm
-rotary-embedding-torch==0.8.9
-    # via timecopilot-toto
-rpds-py==0.26.0
-    # via
-    #   jsonschema
-    #   referencing
-rsa==4.9.1
-    # via google-auth
-s3fs==2025.3.0
-    # via
-    #   datasets
-    #   timecopilot-fev
-s3transfer==0.13.1
-    # via boto3
-safetensors==0.5.3
-    # via
-    #   accelerate
-    #   timecopilot-timesfm
-    #   timecopilot-toto
-    #   timecopilot-uni2ts
-    #   transformers
-scikit-learn==1.6.1 ; python_full_version < '3.13'
-    # via
-    #   antropy
-    #   mlforecast
-    #   tabpfn
-    #   tabpfn-client
-    #   timecopilot-timesfm
-    #   timecopilot-toto
-    #   tsfeatures
-scikit-learn==1.7.1 ; python_full_version >= '3.13'
-    # via
-    #   antropy
-    #   mlforecast
-    #   timecopilot-timesfm
-    #   timecopilot-toto
-    #   tsfeatures
-scipy==1.15.3
-    # via
-    #   antropy
-    #   arch
-    #   fev
-    #   gluonts
-    #   jax
-    #   jaxlib
-    #   joypy
-    #   lightgbm
-    #   scikit-learn
-    #   statsforecast
-    #   statsmodels
-    #   tabpfn
-    #   timecopilot
-    #   timecopilot-uni2ts
-seaborn==0.13.2
-    # via xlstm
-send2trash==1.8.3
-    # via jupyter-server
-sentry-sdk==2.34.1
-    # via wandb
-setuptools==80.9.0
-    # via
-    #   dash
-    #   fs
-    #   jupyterlab
-    #   lightning-utilities
-    #   tensorboard
-    #   torch
-shellingham==1.5.4
-    # via typer
-sigtools==4.0.1
-    # via synchronicity
-six==1.17.0
-    # via
-    #   fs
-    #   password-strength
-    #   python-dateutil
-    #   rfc3339-validator
-    #   triad
-smmap==5.0.2
-    # via gitdb
-sniffio==1.3.1
-    # via
-    #   anthropic
-    #   anyio
-    #   groq
-    #   openai
-soupsieve==2.7
-    # via beautifulsoup4
-sqlalchemy==2.0.43
-    # via
-    #   alembic
-    #   optuna
-sse-starlette==3.0.2
-    # via mcp
-sseclient-py==1.8.0 ; python_full_version < '3.13'
-    # via tabpfn-client
-stack-data==0.6.3
-    # via ipython
-stanio==0.5.1
-    # via cmdstanpy
-starlette==0.47.2
-    # via
-    #   mcp
-    #   pydantic-ai-slim
-statsforecast==2.0.2
-    # via timecopilot
-statsmodels==0.14.5
-    # via
-    #   arch
-    #   statsforecast
-    #   tabpfn-time-series
-    #   tsfeatures
-supersmoother==0.4
-    # via tsfeatures
-sympy==1.13.1
-    # via torch
-synchronicity==0.10.2
-    # via modal
-tabpfn==2.1.0 ; python_full_version < '3.13'
-    # via tabpfn-time-series
-tabpfn-client==0.1.9 ; python_full_version < '3.13'
-    # via tabpfn-time-series
-tabpfn-time-series==1.0.3 ; python_full_version < '3.13'
-    # via timecopilot
-tabulate==0.9.0
-    # via timecopilot-toto
-tenacity==8.5.0
-    # via
-    #   google-genai
-    #   nixtla
-    #   pydantic-ai-slim
-tensorboard==2.20.0 ; python_full_version < '3.14'
-    # via timecopilot-uni2ts
-tensorboard-data-server==0.7.2 ; python_full_version < '3.14'
-    # via tensorboard
-tensorboardx==2.6.4
-    # via ray
-termcolor==3.1.0
-    # via fire
-terminado==0.18.1
-    # via
-    #   jupyter-server
-    #   jupyter-server-terminals
-threadpoolctl==3.6.0
-    # via
-    #   scikit-learn
-    #   statsforecast
-timecopilot==0.0.17
-    # via timecopilot-fev
-timecopilot-chronos-forecasting==0.1.0
-    # via timecopilot
-timecopilot-timesfm==0.1.0
-    # via timecopilot
-timecopilot-tirex==0.1.0
-    # via timecopilot
-timecopilot-toto==0.1.3
-    # via timecopilot
-timecopilot-uni2ts==0.1.2 ; python_full_version < '3.14'
-    # via timecopilot
-tinycss2==1.4.0
-    # via bleach
-tokenizers==0.19.1 ; python_full_version < '3.13'
-    # via
-    #   cohere
-    #   transformers
-    #   xlstm
-tokenizers==0.21.4 ; python_full_version >= '3.13'
-    # via
-    #   cohere
-    #   transformers
-    #   xlstm
-toml==0.10.2
-    # via modal
-toolz==0.12.1
-    # via gluonts
-torch==2.6.0
-    # via
-    #   accelerate
-    #   gluonts
-    #   lightning
-    #   mlstm-kernels
-    #   neuralforecast
-    #   pytorch-lightning
-    #   rotary-embedding-torch
-    #   tabpfn
-    #   timecopilot-chronos-forecasting
-    #   timecopilot-timesfm
-    #   timecopilot-tirex
-    #   timecopilot-toto
-    #   timecopilot-uni2ts
-    #   torchmetrics
-    #   torchvision
-    #   xlstm
-torchmetrics==1.8.0
-    # via
-    #   lightning
-    #   pytorch-lightning
-torchvision==0.21.0
-    # via timecopilot-tirex
-tornado==6.5.1
-    # via
-    #   ipykernel
-    #   jupyter-client
-    #   jupyter-server
-    #   jupyterlab
-    #   notebook
-    #   terminado
-tqdm==4.67.1
-    # via
-    #   cmdstanpy
-    #   datasets
-    #   gluonts
-    #   huggingface-hub
-    #   lightning
-    #   mlstm-kernels
-    #   nixtla
-    #   openai
-    #   optuna
-    #   prophet
-    #   pytorch-lightning
-    #   statsforecast
-    #   tabpfn-client
-    #   tabpfn-time-series
-    #   timecopilot-tirex
-    #   timecopilot-toto
-    #   transformers
-    #   xlstm
-traitlets==5.14.3
-    # via
-    #   ipykernel
-    #   ipython
-    #   ipywidgets
-    #   jupyter-client
-    #   jupyter-console
-    #   jupyter-core
-    #   jupyter-events
-    #   jupyter-server
-    #   jupyterlab
-    #   matplotlib-inline
-    #   nbclient
-    #   nbconvert
-    #   nbformat
-transformers==4.40.1 ; python_full_version < '3.13'
-    # via
-    #   timecopilot
-    #   timecopilot-chronos-forecasting
-    #   xlstm
-transformers==4.53.3 ; python_full_version >= '3.13'
-    # via
-    #   timecopilot
-    #   timecopilot-chronos-forecasting
-    #   xlstm
-triad==0.9.8
-    # via
-    #   adagio
-    #   fugue
-triton==3.2.0 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-tsdownsample==0.1.4.1
-    # via plotly-resampler
-tsfeatures==0.4.5
-    # via timecopilot
-typer==0.16.0
-    # via
-    #   modal
-    #   timecopilot-fev
-    #   timecopilot-timesfm
-types-certifi==2021.10.8.3
-    # via modal
-types-python-dateutil==2.9.0.20250708
-    # via arrow
-types-pyyaml==6.0.12.20250516
-    # via timecopilot-toto
-types-requests==2.32.4.20250611 ; sys_platform != 'emscripten'
-    # via cohere
-types-tabulate==0.9.0.20241207
-    # via timecopilot-toto
-types-toml==0.10.8.20240310
-    # via modal
-typing-extensions==4.14.1
-    # via
-    #   aiosignal
-    #   alembic
-    #   anthropic
-    #   anyio
-    #   beautifulsoup4
-    #   cohere
-    #   dash
-    #   gluonts
-    #   google-genai
-    #   groq
-    #   huggingface-hub
-    #   ipython
-    #   lightning
-    #   lightning-utilities
-    #   logfire
-    #   modal
-    #   mypy
-    #   openai
-    #   opentelemetry-api
-    #   opentelemetry-exporter-otlp-proto-http
-    #   opentelemetry-sdk
-    #   opentelemetry-semantic-conventions
-    #   pydantic
-    #   pydantic-core
-    #   pytorch-lightning
-    #   referencing
-    #   sqlalchemy
-    #   starlette
-    #   synchronicity
-    #   tabpfn
-    #   tabpfn-client
-    #   torch
-    #   typer
-    #   typing-inspection
-    #   wandb
-typing-inspection==0.4.1
-    # via
-    #   mistralai
-    #   pydantic
-    #   pydantic-ai-slim
-    #   pydantic-graph
-    #   pydantic-settings
-tzdata==2025.2
-    # via pandas
-uri-template==1.3.0
-    # via jsonschema
-urllib3==2.5.0
-    # via
-    #   botocore
-    #   requests
-    #   sentry-sdk
-    #   types-requests
-utilsforecast==0.2.12
-    # via
-    #   mlforecast
-    #   neuralforecast
-    #   nixtla
-    #   statsforecast
-    #   timecopilot
-    #   timecopilot-timesfm
-uvicorn==0.35.0 ; sys_platform != 'emscripten'
-    # via mcp
-wadler-lindig==0.1.7
-    # via jaxtyping
-wandb==0.21.0
-    # via timecopilot-timesfm
-watchfiles==1.1.0
-    # via modal
-wcwidth==0.2.13
-    # via
-    #   ftfy
-    #   prompt-toolkit
-webcolors==24.11.1
-    # via jsonschema
-webencodings==0.5.1
-    # via
-    #   bleach
-    #   tinycss2
-websocket-client==1.8.0
-    # via jupyter-server
-websockets==15.0.1
-    # via google-genai
-werkzeug==3.1.3
-    # via
-    #   dash
-    #   flask
-    #   tensorboard
-widgetsnbextension==4.0.14
-    # via ipywidgets
-wrapt==1.17.2
-    # via
-    #   aiobotocore
-    #   opentelemetry-instrumentation
-xlstm==2.0.4
-    # via timecopilot-tirex
-xxhash==3.5.0
-    # via
-    #   datasets
-    #   tabpfn-client
-yarl==1.20.1
-    # via aiohttp
-zipp==3.23.0
-    # via importlib-metadata
-zstandard==0.23.0
-    # via httpx
diff --git a/examples/timecopilot/src/__init__.py b/examples/timecopilot/src/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/examples/timecopilot/src/download_results.py b/examples/timecopilot/src/download_results.py
deleted file mode 100644
index b83297d..0000000
--- a/examples/timecopilot/src/download_results.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import pandas as pd
-
-from .evaluate_model import tasks
-
-
-def download_results():
-    summaries = []
-    for task in tasks():
-        csv_path = f"s3://timecopilot-fev/results/{task.dataset_config}.csv"
-        df = pd.read_csv(csv_path)
-        summaries.append(df)
-    # Show and save the results
-    df = pd.concat(summaries)
-    print(df)
-    df.to_csv("timecopilot.csv", index=False)
-
-
-if __name__ == "__main__":
-    download_results()
diff --git a/examples/timecopilot/src/evaluate_model.py b/examples/timecopilot/src/evaluate_model.py
deleted file mode 100644
index 98492bc..0000000
--- a/examples/timecopilot/src/evaluate_model.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import logging
-import time
-import warnings
-
-import datasets
-import pandas as pd
-import typer
-from timecopilot.models.ensembles.median import MedianEnsemble
-from timecopilot.models.foundation.chronos import Chronos
-from timecopilot.models.foundation.tirex import TiRex
-
-import fev
-
-app = typer.Typer()
-logging.basicConfig(level=logging.INFO)
-datasets.disable_progress_bars()
-
-
-def predict_with_model(task: fev.Task) -> tuple[list[datasets.DatasetDict], float, dict]:
-    forecaster = MedianEnsemble(
-        models=[
-            Chronos(
-                repo_id="amazon/chronos-bolt-base",
-                batch_size=256,
-            ),
-            TiRex(batch_size=256),
-        ],
-        alias="TimeCopilot",
-    )
-
-    renamer = {
-        forecaster.alias: "predictions",
-    }
-    renamer.update({f"{forecaster.alias}-q-{int(100 * q)}": str(q) for q in task.quantile_levels})
-    inference_time = 0.0
-    predictions_per_window = []
-    for window in task.iter_windows(trust_remote_code=True):
-        past_df, *_ = fev.convert_input_data(window, "nixtla", as_univariate=True)
-        # Forward fill NaNs + zero-fill leading NaNs
-        past_df = past_df.set_index("unique_id").groupby("unique_id").ffill().reset_index().fillna(0.0)
-
-        start_time = time.monotonic()
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            forecast_df = forecaster.forecast(
-                df=past_df,
-                h=task.horizon,
-                quantiles=task.quantile_levels,
-                freq=task.freq,
-            )
-        inference_time += time.monotonic() - start_time
-        forecast_df = forecast_df.rename(columns=renamer)
-        selected_columns = ["predictions"] + [str(q) for q in task.quantile_levels]
-        predictions_list = []
-        for _, forecast in forecast_df.groupby("unique_id"):
-            predictions_list.append(forecast[selected_columns].to_dict("list"))
-        predictions_per_window.append(
-            fev.combine_univariate_predictions_to_multivariate(
-                datasets.Dataset.from_list(predictions_list), target_columns=task.target_columns
-            )
-        )
-    return predictions_per_window, inference_time, {}
-
-
-def evaluate_task(task: fev.Task):
-    predictions, inference_time, extra_info = predict_with_model(task)
-    evaluation_summary = task.evaluation_summary(
-        predictions,
-        model_name="timecopilot",
-        inference_time_s=inference_time,
-        extra_info=extra_info,
-    )
-    print(evaluation_summary)
-    return pd.DataFrame([evaluation_summary])
-
-
-def tasks():
-    benchmark = fev.Benchmark.from_yaml(
-        "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/tasks.yaml"
-    )
-    return benchmark.tasks
-
-
-@app.command()
-def main(num_tasks: int | None = None):
-    _tasks = tasks()[:num_tasks]
-    logging.info(f"Evaluating {len(_tasks)} tasks")
-    summaries = []
-    for task in _tasks:
-        evaluation_summary = evaluate_task(task)
-        summaries.append(evaluation_summary)
-    # Show and save the results
-    summary_df = pd.concat(summaries)
-    print(summary_df)
-    summary_df.to_csv("timecopilot.csv", index=False)
-
-
-if __name__ == "__main__":
-    app()
diff --git a/examples/timecopilot/src/evaluate_model_modal.py b/examples/timecopilot/src/evaluate_model_modal.py
deleted file mode 100644
index 3264a75..0000000
--- a/examples/timecopilot/src/evaluate_model_modal.py
+++ /dev/null
@@ -1,56 +0,0 @@
-from pathlib import Path
-
-import fev
-import modal
-
-app = modal.App(name="timecopilot-fev")
-image = (
-    modal.Image.from_registry(
-        "nvidia/cuda:12.8.1-devel-ubuntu24.04",
-        add_python="3.11",
-    )
-    # uploaded to s3 by makefile
-    .apt_install("git")
-    .pip_install("uv")
-    .add_local_file("pyproject.toml", "/root/pyproject.toml", copy=True)
-    .add_local_file(".python-version", "/root/.python-version", copy=True)
-    .add_local_file("uv.lock", "/root/uv.lock", copy=True)
-    .workdir("/root")
-    .run_commands("uv pip install . --system --compile-bytecode")
-)
-aws_secret = modal.Secret.from_name(
-    "aws-secret",
-    required_keys=["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"],
-)
-volume = {
-    "/s3-bucket": modal.CloudBucketMount(
-        bucket_name="timecopilot-fev",
-        secret=aws_secret,
-    )
-}
-
-
-@app.function(
-    image=image,
-    volumes=volume,
-    # 3 hours timeout
-    timeout=60 * 60 * 3,
-    gpu="A10G",
-    # as my local
-    cpu=8,
-    secrets=[modal.Secret.from_name("hf-secret")],
-)
-def evaluate_task_modal(task: fev.Task):
-    from .evaluate_model import evaluate_task
-
-    evaluation_summary = evaluate_task(task=task)
-    save_path = Path(f"/s3-bucket/results/{task.dataset_config}.csv")
-    save_path.parent.mkdir(parents=True, exist_ok=True)
-    evaluation_summary.to_csv(save_path, index=False)
-
-
-@app.local_entrypoint()
-def main():
-    from .evaluate_model import tasks
-
-    list(evaluate_task_modal.map(tasks()[:2]))
diff --git a/examples/timesfm-2.0/evaluate_model.py b/examples/timesfm-2.0/evaluate_model.py
deleted file mode 100644
index 82c9581..0000000
--- a/examples/timesfm-2.0/evaluate_model.py
+++ /dev/null
@@ -1,139 +0,0 @@
-import time
-
-import datasets
-import numpy as np
-import pandas as pd
-import timesfm
-from gluonts.transform import LastValueImputation
-from timesfm.timesfm_torch import TimesFmTorch
-from tqdm.auto import tqdm
-
-import fev
-
-datasets.disable_progress_bars()
-
-
-def get_frequency_indicator(freqstr: str) -> int:
-    base_freqstr = pd.tseries.frequencies.to_offset(freqstr).base.freqstr
-    if base_freqstr[0] in ["Q", "Y"]:
-        return 2
-    elif base_freqstr[0] in ["W", "M"]:
-        return 1
-    else:
-        return 0
-
-
-def batchify(lst: list, batch_size: int = 32):
-    """Convert list into batches of desired size."""
-    for i in range(0, len(lst), batch_size):
-        yield lst[i : i + batch_size]
-
-
-VERSION_TO_HYPERPARAMETERS = {
-    "google/timesfm-1.0-200m-pytorch": {
-        "context_len": 512,
-    },
-    "google/timesfm-2.0-500m-pytorch": {
-        "num_layers": 50,
-        "use_positional_embedding": False,
-        "context_len": 2048,
-    },
-}
-
-
-def predict_with_model(
-    task: fev.Task,
-    model_name: str = "google/timesfm-2.0-500m-pytorch",
-    backend: str = "gpu",
-    batch_size: int = 256,
-) -> tuple[list[datasets.DatasetDict], float, dict]:
-    if model_name not in VERSION_TO_HYPERPARAMETERS:
-        raise ValueError(f"model_name must be one of {list(VERSION_TO_HYPERPARAMETERS)} (got {model_name})")
-    model_hparams = VERSION_TO_HYPERPARAMETERS[model_name]
-
-    tfm = TimesFmTorch(
-        hparams=timesfm.TimesFmHparams(
-            backend=backend,
-            horizon_len=task.horizon,
-            per_core_batch_size=32,
-            **model_hparams,
-        ),
-        checkpoint=timesfm.TimesFmCheckpoint(huggingface_repo_id=model_name),
-    )
-
-    quantile_to_index = {}
-    # Ensure that 0.5 quantile is predicted
-    task_quantiles = [0.5] + task.quantile_levels
-    for q in task_quantiles:
-        # We add 1 below to account for the first prediction which is the mean
-        quantile_to_index[q] = int(np.argmin(np.abs(np.array(tfm.quantiles) - q))) + 1
-    imputation = LastValueImputation()
-    task.load_full_dataset()  # ensure that task.freq is available
-    frequency_indicator = get_frequency_indicator(task.freq)
-
-    inference_time = 0.0
-    predictions_per_window = []
-    for window in task.iter_windows(trust_remote_code=True):
-        past_data, _ = fev.convert_input_data(window, adapter="datasets", as_univariate=True)
-        past_data = past_data.with_format("numpy").cast_column("target", datasets.Sequence(datasets.Value("float32")))
-        inputs = [imputation(t) for t in past_data["target"]]
-
-        forecast_batches = []
-        start_time = time.monotonic()
-        for batch in tqdm(batchify(inputs, batch_size=batch_size), total=len(inputs) // batch_size):
-            mean_forecast, full_forecast = tfm.forecast(batch, freq=[frequency_indicator for _ in batch])
-
-            if task.eval_metric in ["MSE", "RMSE", "RMSSE"]:
-                forecast = {"predictions": mean_forecast}
-            else:
-                forecast = {"predictions": full_forecast[:, :, quantile_to_index[0.5]]}
-            for q in task.quantile_levels:
-                forecast[str(q)] = full_forecast[:, :, quantile_to_index[q]]
-            forecast_batches.append(forecast)
-        inference_time += time.monotonic() - start_time
-
-        predictions = datasets.Dataset.from_dict(
-            {
-                k: np.concatenate([batch[k] for batch in forecast_batches], axis=0)
-                for k in task.predictions_schema.keys()
-            }
-        )
-        predictions_per_window.append(
-            fev.combine_univariate_predictions_to_multivariate(predictions, target_columns=task.target_columns)
-        )
-
-    extra_info = {
-        "model_config": {
-            "batch_size": batch_size,
-            "backend": backend,
-            "frequency_indicator": frequency_indicator,
-            **model_hparams,
-        }
-    }
-
-    return predictions_per_window, inference_time, extra_info
-
-
-if __name__ == "__main__":
-    model_name = "google/timesfm-2.0-500m-pytorch"
-    num_tasks = 2  # replace with `num_tasks = None` to run on all tasks
-
-    benchmark = fev.Benchmark.from_yaml(
-        "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/tasks.yaml"
-    )
-    summaries = []
-    for task in benchmark.tasks[:num_tasks]:
-        predictions, inference_time, extra_info = predict_with_model(task, model_name=model_name)
-        evaluation_summary = task.evaluation_summary(
-            predictions,
-            model_name=model_name,
-            inference_time_s=inference_time,
-            extra_info=extra_info,
-        )
-        print(evaluation_summary)
-        summaries.append(evaluation_summary)
-
-    # Show and save the results
-    summary_df = pd.DataFrame(summaries)
-    print(summary_df)
-    summary_df.to_csv("timesfm-2.0.csv", index=False)
diff --git a/examples/timesfm-2.0/requirements.txt b/examples/timesfm-2.0/requirements.txt
deleted file mode 100644
index 7c89ece..0000000
--- a/examples/timesfm-2.0/requirements.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-timesfm[torch]==1.3.0
-torch==2.5.1
-# GluonTS used for missing value imputation
-gluonts==0.16.0
diff --git a/examples/timesfm-2.5/evaluate_model.py b/examples/timesfm-2.5/evaluate_model.py
deleted file mode 100644
index 09d1794..0000000
--- a/examples/timesfm-2.5/evaluate_model.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import time
-
-import datasets
-import numpy as np
-import pandas as pd
-import timesfm
-import torch
-from gluonts.transform import LastValueImputation
-
-import fev
-
-torch.set_float32_matmul_precision("high")
-
-TIMESFM_MODEL_QUANTILES = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
-
-
-def batchify(lst: list, batch_size: int):
-    """Convert list into batches of desired size."""
-    for i in range(0, len(lst), batch_size):
-        yield lst[i : i + batch_size]
-
-
-def predict_window(
-    window: fev.EvaluationWindow,
-    model: timesfm.TimesFM_2p5_200M_torch,
-    quantile_levels: list[float],
-    batch_size: int,
-    return_mean: bool,
-) -> tuple[datasets.DatasetDict, float]:
-    quantile_to_index = {}
-    # Ensure that 0.5 quantile is predicted
-    task_quantiles = [0.5] + quantile_levels
-    for q in task_quantiles:
-        # We add 1 below to account for the first prediction which is the mean
-        quantile_to_index[q] = int(np.argmin(np.abs(np.array(TIMESFM_MODEL_QUANTILES) - q))) + 1
-
-    past_data, _ = fev.convert_input_data(window, adapter="datasets", as_univariate=True)
-    past_data = past_data.with_format("numpy").cast_column("target", datasets.Sequence(datasets.Value("float32")))
-
-    imputation = LastValueImputation()
-    # We copy the array because datasets sometimes returns numpy arrays which are not writeable
-    # See: https://github.com/huggingface/datasets/issues/616
-    inputs = [imputation(t.copy()) for t in past_data["target"]]
-
-    forecast_batches = []
-    start_time = time.monotonic()
-    for batch in batchify(inputs, batch_size=batch_size):
-        mean_forecast, full_forecast = model.forecast(inputs=batch, horizon=window.horizon)
-        if return_mean:
-            forecast = {"predictions": mean_forecast}
-        else:
-            forecast = {"predictions": full_forecast[:, :, quantile_to_index[0.5]]}
-        for q in quantile_levels:
-            forecast[str(q)] = full_forecast[:, :, quantile_to_index[q]]
-        forecast_batches.append(forecast)
-    window_inference_time = time.monotonic() - start_time
-
-    predictions = datasets.Dataset.from_dict(
-        {
-            k: np.concatenate([batch[k] for batch in forecast_batches], axis=0)
-            for k in ["predictions"] + [str(q) for q in quantile_levels]
-        }
-    )
-
-    return fev.utils.combine_univariate_predictions_to_multivariate(
-        predictions, target_columns=window.target_columns
-    ), window_inference_time
-
-
-def predict_with_model(
-    task: fev.Task,
-    model_name: str = "google/timesfm-2.5-200m-pytorch",
-    batch_size: int = 256,
-    context_length: int = 16_000,
-    per_core_batch_size: int = 64,
-) -> tuple[list[datasets.DatasetDict], float, dict]:
-    context_length = min(context_length, max([len(t) for t in task.load_full_dataset()[task.timestamp_column]]))
-    print(f"Setting context_length={context_length}")
-
-    model = timesfm.TimesFM_2p5_200M_torch.from_pretrained(model_name)
-    model_hparams = dict(
-        max_context=context_length,
-        max_horizon=task.horizon,
-        normalize_inputs=True,
-        use_continuous_quantile_head=True,
-        force_flip_invariance=True,
-        infer_is_positive=True,
-        fix_quantile_crossing=True,
-        per_core_batch_size=per_core_batch_size,
-    )
-    model.compile(timesfm.ForecastConfig(**model_hparams))
-
-    inference_time = 0.0
-    predictions_per_window = []
-    for window in task.iter_windows():
-        predictions, window_inference_time = predict_window(
-            window,
-            model=model,
-            quantile_levels=task.quantile_levels,
-            batch_size=batch_size,
-            return_mean=task.eval_metric in ["MSE", "RMSE", "RMSSE"],
-        )
-        predictions_per_window.append(predictions)
-        inference_time += window_inference_time
-
-    extra_info = {"model_config": {"batch_size": batch_size, **model_hparams}}
-
-    return predictions_per_window, inference_time, extra_info
-
-
-if __name__ == "__main__":
-    model_name = "google/timesfm-2.5-200m-pytorch"
-    num_tasks = 2  # replace with `num_tasks = None` to run on all tasks
-
-    benchmark = fev.Benchmark.from_yaml(
-        "https://github.com/autogluon/fev/raw/refs/heads/main/benchmarks/fev_bench/tasks.yaml"
-    )
-    summaries = []
-    for task in benchmark.tasks[:num_tasks]:
-        predictions, inference_time, extra_info = predict_with_model(task, model_name=model_name)
-        evaluation_summary = task.evaluation_summary(
-            predictions,
-            model_name=model_name,
-            inference_time_s=inference_time,
-            extra_info=extra_info,
-        )
-        print(evaluation_summary)
-        summaries.append(evaluation_summary)
-
-    # Show and save the results
-    summary_df = pd.DataFrame(summaries)
-    print(summary_df)
-    summary_df.to_csv("timesfm-2.5.csv", index=False)
diff --git a/examples/timesfm-2.5/requirements.txt b/examples/timesfm-2.5/requirements.txt
deleted file mode 100644
index f128b6c..0000000
--- a/examples/timesfm-2.5/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-"timesfm[torch] @ git+https://github.com/google-research/timesfm@58e01ad82fec7f4e975d708409d51fd5c0131bbd"
-# GluonTS used for missing value imputation
-gluonts==0.16.0
diff --git a/examples/tirex/README.md b/examples/tirex/README.md
deleted file mode 100644
index 2df2945..0000000
--- a/examples/tirex/README.md
+++ /dev/null
@@ -1,9 +0,0 @@
-# TiRex: Zero-Shot Forecasting across Long and Short Horizions
-
-TiRex is a pre-trained time series forecasting model bases on xLSTM.
-
-Please follow the installation instruction in [TiRex's repository](https://github.com/NX-AI/tirex?tab=readme-ov-file#installation).
-We recommend to install TiRex within the conda environment specified in the repository.
-
-For more information check the [repository](https://github.com/NX-AI/tirex)
-or the [HuggingFace model card](https://huggingface.co/NX-AI/TiRex).
\ No newline at end of file
diff --git a/examples/tirex/evaluate_model.py b/examples/tirex/evaluate_model.py
deleted file mode 100644
index 4d9bcd9..0000000
--- a/examples/tirex/evaluate_model.py
+++ /dev/null
@@ -1,82 +0,0 @@
-import time
-
-import datasets
-import pandas as pd
-import torch
-from tirex import ForecastModel, load_model
-from tirex.models.mixed_stack import skip_cuda
-
-import fev
-
-datasets.disable_progress_bars()
-
-
-def predict_with_model(
-    task: fev.Task,
-    model_name: str = "NX-AI/TiRex",
-    batch_size: int = 512,
-    device_map: str = "cuda",
-    compile: bool = False,
-) -> tuple[list[datasets.DatasetDict], float, dict]:
-    model: ForecastModel = load_model(model_name, device=device_map)
-    if compile:
-        model = torch.compile(model)
-
-    inference_time = 0.0
-    predictions_per_window = []
-    for window in task.iter_windows(trust_remote_code=True):
-        past_data, _ = fev.convert_input_data(window, adapter="datasets", as_univariate=True)
-        past_data = past_data.with_format("torch").cast_column("target", datasets.Sequence(datasets.Value("float32")))
-        loaded_targets = [t for t in past_data["target"]]
-
-        start_time = time.monotonic()
-        quantiles, means = model.forecast(
-            loaded_targets, quantile_levels=task.quantile_levels, prediction_length=task.horizon, batch_size=batch_size
-        )
-        inference_time += time.monotonic() - start_time
-
-        predictions_dict = {"predictions": means}
-        for idx, level in enumerate(task.quantile_levels):
-            predictions_dict[str(level)] = quantiles[:, :, idx]
-
-        predictions_per_window.append(
-            fev.combine_univariate_predictions_to_multivariate(
-                datasets.Dataset.from_dict(predictions_dict), target_columns=task.target_columns
-            )
-        )
-
-    extra_info = {
-        "model_config": {
-            "model_name": "tirex",
-            "batch_size": batch_size,
-            "device_map": device_map,
-            "compile": compile,
-            "cuda_kernel": not skip_cuda(),
-        }
-    }
-    return predictions_per_window, inference_time, extra_info
-
-
-if __name__ == "__main__":
-    model_name = "NX-AI/TiRex"
-    num_tasks = None  # replace with `num_tasks = None` to run on all tasks
-
-    benchmark = fev.Benchmark.from_yaml(
-        "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/tasks.yaml"
-    )
-    summaries = []
-    for task in benchmark.tasks[:num_tasks]:
-        predictions, inference_time, extra_info = predict_with_model(task, model_name=model_name)
-        evaluation_summary = task.evaluation_summary(
-            predictions,
-            model_name="tirex",
-            inference_time_s=inference_time,
-            extra_info=extra_info,
-        )
-        print(evaluation_summary)
-        summaries.append(evaluation_summary)
-
-    # Show and save the results
-    summary_df = pd.DataFrame(summaries)
-    print(summary_df)
-    summary_df.to_csv("tirex.csv", index=False)
diff --git a/examples/tirex/freezed_test_environment.yaml b/examples/tirex/freezed_test_environment.yaml
deleted file mode 100644
index 7eb8364..0000000
--- a/examples/tirex/freezed_test_environment.yaml
+++ /dev/null
@@ -1,284 +0,0 @@
-name: freezed_test_environment
-channels:
-  - nvidia
-  - conda-forge
-dependencies:
-  - _libgcc_mutex=0.1
-  - _openmp_mutex=4.5
-  - binutils_impl_linux-64=2.40
-  - binutils_linux-64=2.40
-  - bzip2=1.0.8
-  - c-ares=1.34.5
-  - ca-certificates=2025.4.26
-  - cmake=4.0.2
-  - cuda=12.6.3
-  - cuda-cccl=12.6.77
-  - cuda-cccl_linux-64=12.6.77
-  - cuda-command-line-tools=12.6.3
-  - cuda-compiler=12.6.3
-  - cuda-crt-dev_linux-64=12.6.85
-  - cuda-crt-tools=12.6.85
-  - cuda-cudart=12.6.77
-  - cuda-cudart-dev=12.6.77
-  - cuda-cudart-dev_linux-64=12.6.77
-  - cuda-cudart-static=12.6.77
-  - cuda-cudart-static_linux-64=12.6.77
-  - cuda-cudart_linux-64=12.6.77
-  - cuda-cuobjdump=12.6.77
-  - cuda-cupti=12.6.80
-  - cuda-cupti-dev=12.6.80
-  - cuda-cuxxfilt=12.6.77
-  - cuda-driver-dev=12.6.77
-  - cuda-driver-dev_linux-64=12.6.77
-  - cuda-gdb=12.6.77
-  - cuda-libraries=12.6.3
-  - cuda-libraries-dev=12.6.3
-  - cuda-nsight=12.6.77
-  - cuda-nvcc=12.6.85
-  - cuda-nvcc-dev_linux-64=12.6.85
-  - cuda-nvcc-impl=12.6.85
-  - cuda-nvcc-tools=12.6.85
-  - cuda-nvcc_linux-64=12.6.85
-  - cuda-nvdisasm=12.6.77
-  - cuda-nvml-dev=12.6.77
-  - cuda-nvprof=12.6.80
-  - cuda-nvprune=12.6.77
-  - cuda-nvrtc=12.6.85
-  - cuda-nvrtc-dev=12.6.85
-  - cuda-nvtx=12.6.77
-  - cuda-nvvm-dev_linux-64=12.6.85
-  - cuda-nvvm-impl=12.6.85
-  - cuda-nvvm-tools=12.6.85
-  - cuda-nvvp=12.6.80
-  - cuda-opencl=12.6.77
-  - cuda-opencl-dev=12.6.77
-  - cuda-profiler-api=12.6.77
-  - cuda-runtime=12.6.3
-  - cuda-sanitizer-api=12.6.77
-  - cuda-toolkit=12.6.3
-  - cuda-tools=12.6.3
-  - cuda-version=12.6
-  - cuda-visual-tools=12.6.3
-  - dbus=1.16.2
-  - expat=2.7.0
-  - fontconfig=2.15.0
-  - freetype=2.13.3
-  - gcc_impl_linux-64=13.2.0
-  - gcc_linux-64=13.2.0
-  - gds-tools=1.11.1.6
-  - gmp=6.3.0
-  - gxx_impl_linux-64=13.2.0
-  - gxx_linux-64=13.2.0
-  - icu=75.1
-  - kernel-headers_linux-64=3.10.0
-  - keyutils=1.6.1
-  - krb5=1.21.3
-  - ld_impl_linux-64=2.40
-  - libcublas=12.6.4.1
-  - libcublas-dev=12.6.4.1
-  - libcufft=11.3.0.4
-  - libcufft-dev=11.3.0.4
-  - libcufile=1.11.1.6
-  - libcufile-dev=1.11.1.6
-  - libcurand=10.3.7.77
-  - libcurand-dev=10.3.7.77
-  - libcurl=8.13.0
-  - libcusolver=11.7.1.2
-  - libcusolver-dev=11.7.1.2
-  - libcusparse=12.5.4.2
-  - libcusparse-dev=12.5.4.2
-  - libedit=3.1.20250104
-  - libev=4.33
-  - libexpat=2.7.0
-  - libffi=3.4.6
-  - libfreetype=2.13.3
-  - libfreetype6=2.13.3
-  - libgcc=15.1.0
-  - libgcc-devel_linux-64=13.2.0
-  - libgcc-ng=15.1.0
-  - libglib=2.84.2
-  - libgomp=15.1.0
-  - libiconv=1.18
-  - liblzma=5.8.1
-  - libnghttp2=1.64.0
-  - libnpp=12.2.5.30
-  - libnpp-dev=12.2.5.30
-  - libnsl=2.0.1
-  - libnvfatbin=12.6.77
-  - libnvfatbin-dev=12.6.77
-  - libnvjitlink=12.6.85
-  - libnvjitlink-dev=12.6.85
-  - libnvjpeg=12.3.1.117
-  - libnvjpeg-dev=12.3.1.117
-  - libpng=1.6.47
-  - libsanitizer=13.2.0
-  - libsqlite=3.49.2
-  - libssh2=1.11.1
-  - libstdcxx=15.1.0
-  - libstdcxx-devel_linux-64=13.2.0
-  - libstdcxx-ng=15.1.0
-  - libuuid=2.38.1
-  - libuv=1.50.0
-  - libxcb=1.17.0
-  - libxcrypt=4.4.36
-  - libxkbcommon=1.10.0
-  - libxml2=2.13.8
-  - libzlib=1.3.1
-  - ncurses=6.5
-  - nsight-compute=2024.3.2.3
-  - nspr=4.36
-  - nss=3.111
-  - openssl=3.5.0
-  - pcre2=10.45
-  - pip=25.1.1
-  - pthread-stubs=0.4
-  - python=3.11.12
-  - readline=8.2
-  - rhash=1.4.6
-  - setuptools=80.8.0
-  - sysroot_linux-64=2.17
-  - tk=8.6.13
-  - wheel=0.45.1
-  - xkeyboard-config=2.44
-  - xorg-libx11=1.8.12
-  - xorg-libxau=1.0.12
-  - xorg-libxdmcp=1.1.5
-  - zstd=1.5.7
-  - pip:
-      - aiobotocore==2.22.0
-      - aiohappyeyeballs==2.6.1
-      - aiohttp==3.12.0
-      - aioitertools==0.12.0
-      - aiosignal==1.3.2
-      - annotated-types==0.7.0
-      - antlr4-python3-runtime==4.9.3
-      - asttokens==3.0.0
-      - attrs==25.3.0
-      - botocore==1.37.3
-      - certifi==2025.4.26
-      - cfgv==3.4.0
-      - chardet==5.2.0
-      - charset-normalizer==3.4.2
-      - comm==0.2.2
-      - contourpy==1.3.2
-      - cycler==0.12.1
-      - dacite==1.9.2
-      - datasets==3.6.0
-      - debugpy==1.8.14
-      - decorator==5.2.1
-      - dill==0.3.8
-      - distlib==0.3.9
-      - dotenv==0.9.9
-      - einops==0.8.1
-      - executing==2.2.0
-      - fev==0.4.1
-      - filelock==3.18.0
-      - fonttools==4.58.0
-      - frozenlist==1.6.0
-      - fsspec==2025.3.0
-      - ftfy==6.3.1
-      - gluonts==0.16.1
-      - hf-xet==1.1.2
-      - huggingface-hub==0.32.0
-      - identify==2.6.12
-      - idna==3.10
-      - iniconfig==2.1.0
-      - ipykernel==6.29.5
-      - ipython==9.2.0
-      - ipython-pygments-lexers==1.1.1
-      - jedi==0.19.2
-      - jinja2==3.1.6
-      - jmespath==1.0.1
-      - joypy==0.2.6
-      - jupyter-client==8.6.3
-      - jupyter-core==5.7.2
-      - kiwisolver==1.4.8
-      - lightning==2.5.1.post0
-      - lightning-utilities==0.14.3
-      - markdown-it-py==3.0.0
-      - markupsafe==3.0.2
-      - matplotlib==3.10.3
-      - matplotlib-inline==0.1.7
-      - mdurl==0.1.2
-      - mlstm-kernels==2.0.0
-      - mpmath==1.3.0
-      - multidict==6.4.4
-      - multiprocess==0.70.16
-      - nest-asyncio==1.6.0
-      - networkx==3.4.2
-      - ninja==1.11.1.4
-      - nodeenv==1.9.1
-      - numpy==2.1.3
-      - nvidia-cublas-cu12==12.6.4.1
-      - nvidia-cuda-cupti-cu12==12.6.80
-      - nvidia-cuda-nvrtc-cu12==12.6.77
-      - nvidia-cuda-runtime-cu12==12.6.77
-      - nvidia-cudnn-cu12==9.5.1.17
-      - nvidia-cufft-cu12==11.3.0.4
-      - nvidia-curand-cu12==10.3.7.77
-      - nvidia-cusolver-cu12==11.7.1.2
-      - nvidia-cusparse-cu12==12.5.4.2
-      - nvidia-cusparselt-cu12==0.6.3
-      - nvidia-nccl-cu12==2.21.5
-      - nvidia-nvjitlink-cu12==12.6.85
-      - nvidia-nvtx-cu12==12.6.77
-      - omegaconf==2.3.0
-      - opt-einsum==3.4.0
-      - packaging==24.2
-      - pandas==2.2.3
-      - parso==0.8.4
-      - pexpect==4.9.0
-      - pillow==11.2.1
-      - platformdirs==4.3.8
-      - pluggy==1.6.0
-      - pre-commit==4.2.0
-      - prompt-toolkit==3.0.51
-      - propcache==0.3.1
-      - psutil==7.0.0
-      - ptyprocess==0.7.0
-      - pure-eval==0.2.3
-      - pyarrow==20.0.0
-      - pydantic==2.11.5
-      - pydantic-core==2.33.2
-      - pygments==2.19.1
-      - pyparsing==3.2.3
-      - pytest==8.3.5
-      - python-dateutil==2.9.0.post0
-      - python-dotenv==1.1.0
-      - pytorch-lightning==2.5.1.post0
-      - pytz==2025.2
-      - pyyaml==6.0.2
-      - pyzmq==26.4.0
-      - regex==2024.11.6
-      - reportlab==4.4.1
-      - requests==2.32.3
-      - rich==14.0.0
-      - s3fs==2025.3.0
-      - safetensors==0.5.3
-      - scipy==1.15.3
-      - seaborn==0.13.2
-      - six==1.17.0
-      - stack-data==0.6.3
-      - sympy==1.13.1
-      - tirex==1.0.0
-      - tokenizers==0.21.1
-      - toolz==0.12.1
-      - torch==2.6.0+cu126
-      - torchaudio==2.6.0+cu126
-      - torchmetrics==1.7.1
-      - torchvision==0.21.0+cu126
-      - tornado==6.5.1
-      - tqdm==4.67.1
-      - traitlets==5.14.3
-      - transformers==4.52.3
-      - triton==3.2.0
-      - typing-extensions==4.13.2
-      - typing-inspection==0.4.1
-      - tzdata==2025.2
-      - urllib3==2.4.0
-      - virtualenv==20.31.2
-      - wcwidth==0.2.13
-      - wrapt==1.17.2
-      - xlstm==2.0.3
-      - xxhash==3.5.0
-      - yarl==1.20.0
diff --git a/examples/tirex/requirements.txt b/examples/tirex/requirements.txt
deleted file mode 100644
index c0b92af..0000000
--- a/examples/tirex/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-torch~=2.6.0
-tirex @ git+https://github.com/NX-AI/tirex@main
\ No newline at end of file
diff --git a/examples/toto/evaluate_model.py b/examples/toto/evaluate_model.py
deleted file mode 100644
index 923ab19..0000000
--- a/examples/toto/evaluate_model.py
+++ /dev/null
@@ -1,247 +0,0 @@
-import math
-import time
-
-import datasets
-import numpy as np
-import pandas as pd
-import torch
-from toto.data.util.dataset import MaskedTimeseries
-from toto.inference.forecaster import TotoForecaster
-from toto.model.toto import Toto
-from tqdm.auto import tqdm
-
-import fev
-
-datasets.disable_progress_bars()
-
-
-def batchify(lst: list, batch_size: int):
-    """Convert list into batches of desired size."""
-    for i in range(0, len(lst), batch_size):
-        yield lst[i : i + batch_size]
-
-
-def freq_to_seconds(freq: pd.offsets.BaseOffset | str) -> float:
-    if isinstance(freq, str):
-        freq = pd.tseries.frequencies.to_offset(freq)
-    try:
-        return freq.nanos / 1e9
-    except ValueError:
-        if isinstance(freq, pd.offsets.BusinessDay):
-            return freq.n * 24 * 60 * 60
-        elif isinstance(freq, pd.offsets.Week):
-            return freq.n * 7 * 24 * 60 * 60
-        elif isinstance(freq, pd.offsets.MonthBegin) or isinstance(freq, pd.offsets.MonthEnd):
-            return 30 * 24 * 60 * 60
-        elif isinstance(freq, pd.offsets.QuarterEnd) or isinstance(freq, pd.offsets.QuarterBegin):
-            return 90 * 24 * 60 * 60
-        elif isinstance(freq, pd.offsets.YearEnd) or isinstance(freq, pd.offsets.YearBegin):
-            return 365.25 * 24 * 60 * 60
-        else:
-            raise ValueError(f"Cannot handle frequency of type {type(freq)}: {freq}")
-
-
-def ffill(tensor: "torch.Tensor") -> "torch.Tensor":
-    """Forward fill along the last axis"""
-
-    assert tensor.ndim > 1
-    nan_mask = torch.isnan(tensor)
-    indices = torch.where(nan_mask, 0, torch.arange(tensor.shape[-1], device=tensor.device).expand_as(tensor))
-    last_valid = torch.cummax(indices, dim=-1).values
-    return torch.gather(tensor, dim=-1, index=last_valid)
-
-
-def left_pad_and_stack_2d(tensors: list["torch.Tensor"]) -> "torch.Tensor":
-    max_len = max(c.shape[-1] for c in tensors)
-    padded = []
-    for c in tensors:
-        assert isinstance(c, torch.Tensor)
-        assert c.ndim == 2
-        padding = torch.full(size=(c.shape[0], max_len - c.shape[-1]), fill_value=torch.nan, device=c.device)
-        padded.append(torch.concat((padding, c), dim=-1))
-    return torch.stack(padded)
-
-
-def predict_window(
-    window: fev.EvaluationWindow,
-    toto_forecaster: "TotoForecaster",
-    quantile_levels: list[float],
-    max_context_length: int,
-    max_batch_variate_size: int,
-    num_samples: int,
-    samples_per_batch: int,
-    time_delta_seconds: float,
-    as_univariate: bool,
-    return_mean: bool,
-    device: str,
-) -> tuple[datasets.DatasetDict, float]:
-    if as_univariate:
-        past_data, _ = fev.convert_input_data(window, adapter="datasets", as_univariate=True)
-        target_columns = ["target"]
-    else:
-        past_data, _ = window.get_input_data()
-        target_columns = window.target_columns
-
-    past_data_features = past_data.features
-    past_data_features.update({col: datasets.Sequence(datasets.Value("float32")) for col in target_columns})
-    past_data = past_data.cast(past_data_features)
-
-    num_variates = len(target_columns)
-    inputs = [
-        torch.tensor(np.stack(tuple(row.values()), axis=0), dtype=torch.float32)
-        for row in past_data.select_columns(target_columns)
-    ]
-
-    forecast_batches = []
-    batch_size = max(1, math.floor(max_batch_variate_size / num_variates))
-
-    start_time = time.monotonic()
-    for batch in tqdm(batchify(inputs, batch_size=batch_size), total=len(inputs) // batch_size):
-        stacked_batch = left_pad_and_stack_2d(batch)
-        stacked_batch = stacked_batch[..., -max_context_length:]
-        stacked_batch = stacked_batch.to(device=device)
-        # Impute missing values
-        stacked_batch = ffill(stacked_batch)
-        nan_mask = torch.isnan(stacked_batch)
-        stacked_batch[nan_mask] = 0.0
-
-        current_batch_size, _, context_length = stacked_batch.shape
-        # each item in the batch is assigned a unique ID from [0, ..., current_batch_size - 1]
-        id_mask = torch.arange(current_batch_size, dtype=torch.int, device=device)[:, None, None].repeat(
-            1, num_variates, context_length
-        )
-
-        # FIXME: technically this should be a tensor of unix epochs but it is not used
-        # by the current model (Datadog/Toto-Open-Base-1.0)
-        timestamp_seconds = torch.zeros_like(stacked_batch, dtype=torch.int)
-        time_interval_seconds = torch.full(
-            (current_batch_size, 1), fill_value=time_delta_seconds, device=device, dtype=torch.int
-        )
-
-        masked_timeseries = MaskedTimeseries(
-            series=stacked_batch,
-            padding_mask=~nan_mask,
-            id_mask=id_mask,
-            timestamp_seconds=timestamp_seconds,
-            time_interval_seconds=time_interval_seconds,
-        )
-
-        toto_forecast = toto_forecaster.forecast(
-            masked_timeseries,
-            prediction_length=window.horizon,
-            num_samples=num_samples,
-            samples_per_batch=samples_per_batch,
-        )
-
-        multivariate_forecast = {variate_name: {} for variate_name in target_columns}
-        if return_mean:
-            mean_forecast = toto_forecast.mean.cpu().numpy()
-            for i, variate_name in enumerate(target_columns):
-                multivariate_forecast[variate_name]["predictions"] = mean_forecast[:, i]
-        else:
-            median_forecast = toto_forecast.quantile(0.5).cpu().numpy()
-            for i, variate_name in enumerate(target_columns):
-                multivariate_forecast[variate_name]["predictions"] = median_forecast[:, i]
-
-        for q in quantile_levels:
-            quantile_forecast = toto_forecast.quantile(q).cpu().numpy()
-            for i, variate_name in enumerate(target_columns):
-                multivariate_forecast[variate_name][str(q)] = quantile_forecast[:, i]
-
-        forecast_batches.append(multivariate_forecast)
-
-    window_inference_time = time.monotonic() - start_time
-
-    predictions_dict: dict = {}
-    for variate_name in target_columns:
-        predictions_dict[variate_name] = datasets.Dataset.from_dict(
-            {
-                k: np.concatenate([batch[variate_name][k] for batch in forecast_batches], axis=0)
-                for k in ["predictions"] + [str(q) for q in quantile_levels]
-            }
-        )
-    predictions = datasets.DatasetDict(predictions_dict)
-    predictions.set_format("numpy")
-
-    return predictions, window_inference_time
-
-
-def predict_with_model(
-    task: fev.Task,
-    model_path: str = "Datadog/Toto-Open-Base-1.0",
-    max_batch_variate_size: int = 24,
-    num_samples: int = 256,
-    samples_per_batch: int = 32,
-    max_context_length: int = 4096,
-    as_univariate: bool = False,
-    compile_model: bool = False,
-    device: str = "auto",
-) -> tuple[list[datasets.DatasetDict], float, dict]:
-    if device == "auto":
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-
-    toto = Toto.from_pretrained(model_path)
-    toto.to(device)
-    if compile_model:
-        toto.compile()
-    toto_forecaster = TotoForecaster(toto.model)
-
-    inference_time = 0.0
-    predictions_per_window = []
-
-    for window in task.iter_windows():
-        predictions, window_inference_time = predict_window(
-            window=window,
-            toto_forecaster=toto_forecaster,
-            quantile_levels=task.quantile_levels,
-            max_context_length=max_context_length,
-            max_batch_variate_size=max_batch_variate_size,
-            num_samples=num_samples,
-            samples_per_batch=samples_per_batch,
-            return_mean=task.eval_metric in ["MSE", "RMSE", "RMSSE"],
-            time_delta_seconds=freq_to_seconds(task.freq),
-            as_univariate=as_univariate,
-            device=device,
-        )
-        predictions_per_window.append(predictions)
-        inference_time += window_inference_time
-
-    extra_info = {
-        "model_config": {
-            "model_path": model_path,
-            "max_batch_variate_size": max_batch_variate_size,
-            "num_samples": num_samples,
-            "samples_per_batch": samples_per_batch,
-            "max_context_length": max_context_length,
-            "as_univariate": as_univariate,
-            "compile_model": compile_model,
-            "device": device,
-        }
-    }
-
-    return predictions_per_window, inference_time, extra_info
-
-
-if __name__ == "__main__":
-    model_path = "Datadog/Toto-Open-Base-1.0"
-    num_tasks = 2  # replace with `num_tasks = None` to run on all tasks
-
-    benchmark = fev.Benchmark.from_yaml(
-        "https://github.com/autogluon/fev/raw/refs/heads/main/benchmarks/fev_bench/tasks.yaml"
-    )
-    summaries = []
-    for task in benchmark.tasks[:num_tasks]:
-        predictions, inference_time, extra_info = predict_with_model(task, model_path=model_path)
-        evaluation_summary = task.evaluation_summary(
-            predictions,
-            model_name=model_path,
-            inference_time_s=inference_time,
-            extra_info=extra_info,
-        )
-        print(evaluation_summary)
-        summaries.append(evaluation_summary)
-
-    # Show and save the results
-    summary_df = pd.DataFrame(summaries)
-    print(summary_df)
-    summary_df.to_csv("toto.csv", index=False)
diff --git a/examples/toto/requirements.txt b/examples/toto/requirements.txt
deleted file mode 100644
index 18da2c0..0000000
--- a/examples/toto/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-toto-ts @ git+https://github.com/DataDog/toto.git@66c885ef0cd345a41dedff0780fa0886e355b507
\ No newline at end of file
diff --git a/examples/ttm-r2/evaluate_model.py b/examples/ttm-r2/evaluate_model.py
deleted file mode 100644
index 4b83639..0000000
--- a/examples/ttm-r2/evaluate_model.py
+++ /dev/null
@@ -1,150 +0,0 @@
-import time
-
-import datasets
-import numpy as np
-import pandas as pd
-import torch
-from gluonts.transform import LastValueImputation
-from tqdm.auto import tqdm
-from tsfm_public import TinyTimeMixerForPrediction
-
-import fev
-
-datasets.disable_progress_bars()
-
-
-class TinyTimeMixerPipeline:
-    def __init__(self, model):
-        self.model = model
-        self.max_context_length = model.config.context_length
-        self.max_prediction_length = self.model.config.prediction_length
-
-    def _left_pad_and_stack_1D(self, tensors: list[torch.Tensor]) -> torch.Tensor:
-        max_len = max(self.max_context_length, *(len(c) for c in tensors))
-        padded = []
-        for c in tensors:
-            assert isinstance(c, torch.Tensor)
-            assert c.ndim == 1
-            padding = torch.full(size=(max_len - len(c),), fill_value=torch.nan, device=c.device)
-            padded.append(torch.concat((padding, c), dim=-1))
-        return torch.stack(padded)
-
-    def _prepare_and_validate_context(self, context: torch.Tensor | list[torch.Tensor]):
-        if isinstance(context, list):
-            context = self._left_pad_and_stack_1D(context)
-        assert isinstance(context, torch.Tensor)
-        if context.ndim == 1:
-            context = context.unsqueeze(0)
-        assert context.ndim == 2
-
-        if context.size(-1) > self.max_context_length:
-            context = context[..., -self.max_context_length :]
-
-        # Add channel dimension
-        context = context.unsqueeze(dim=-1)
-
-        return context
-
-    def predict(
-        self,
-        context: torch.Tensor | list[torch.Tensor],
-        prediction_length: int,
-    ):
-        assert prediction_length <= self.max_prediction_length, (
-            f"Only prediction lengths up to {self.max_prediction_length} are supported"
-        )
-
-        context = self._prepare_and_validate_context(context).float()
-        nan_mask = torch.isnan(context)
-        context[nan_mask] = 0.0
-
-        with torch.no_grad():
-            output = self.model(
-                past_values=context.to(self.model.device),
-                past_observed_mask=~nan_mask.to(self.model.device),
-            ).prediction_outputs.cpu()  # (batch, prediction_length, 1)
-
-        # truncate predictions
-        return output.squeeze(dim=-1)[:, :prediction_length]
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        model = TinyTimeMixerForPrediction.from_pretrained(*args, **kwargs)
-        model.eval()
-        return cls(model=model)
-
-
-def batchify(lst: list, batch_size: int = 32):
-    """Convert list into batches of desired size."""
-    for i in range(0, len(lst), batch_size):
-        yield lst[i : i + batch_size]
-
-
-def predict_with_model(
-    task: fev.Task,
-    model_name: str = "ibm-granite/granite-timeseries-ttm-r2",
-    device: str = "cuda",
-    batch_size: int = 256,
-) -> tuple[list[datasets.DatasetDict], float, dict]:
-    pipeline = TinyTimeMixerPipeline.from_pretrained(model_name, device_map=device)
-    imputation = LastValueImputation()
-    inference_time = 0.0
-    predictions_per_window = []
-    for window in task.iter_windows(trust_remote_code=True):
-        past_data, _ = fev.convert_input_data(window, adapter="datasets", as_univariate=True)
-        past_data = past_data.with_format("numpy").cast_column("target", datasets.Sequence(datasets.Value("float32")))
-        inputs = [imputation(t) for t in past_data["target"]]
-
-        forecast_per_batch = []
-        start_time = time.monotonic()
-        for batch in tqdm(batchify(inputs, batch_size=batch_size), total=len(inputs) // batch_size):
-            forecast_per_batch.append(
-                pipeline.predict([torch.tensor(x) for x in batch], prediction_length=task.horizon).cpu().numpy()
-            )
-        inference_time += time.monotonic() - start_time
-
-        forecast = np.concatenate(forecast_per_batch, axis=0)  # [num_items, horizon]
-        predictions_dict = {"predictions": forecast}
-        # Probabilistic forecasting not supported, so we repeat the point forecast for each quantile
-        for q in task.quantile_levels:
-            predictions_dict[str(q)] = forecast
-        predictions_per_window.append(
-            fev.combine_univariate_predictions_to_multivariate(
-                datasets.Dataset.from_dict(predictions_dict), target_columns=task.target_columns
-            )
-        )
-
-    extra_info = {
-        "model_config": {
-            "context_length": pipeline.max_context_length,
-            "model_name": model_name,
-            "batch_size": batch_size,
-            "device": device,
-        }
-    }
-    return predictions_per_window, inference_time, extra_info
-
-
-if __name__ == "__main__":
-    model_name = "ibm-granite/granite-timeseries-ttm-r2"
-    num_tasks = 2  # replace with `num_tasks = None` to run on all tasks
-
-    benchmark = fev.Benchmark.from_yaml(
-        "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/tasks.yaml"
-    )
-    summaries = []
-    for task in benchmark.tasks[:num_tasks]:
-        predictions, inference_time, extra_info = predict_with_model(task, model_name=model_name)
-        evaluation_summary = task.evaluation_summary(
-            predictions,
-            model_name=model_name,
-            inference_time_s=inference_time,
-            extra_info=extra_info,
-        )
-        print(evaluation_summary)
-        summaries.append(evaluation_summary)
-
-    # Show and save the results
-    summary_df = pd.DataFrame(summaries)
-    print(summary_df)
-    summary_df.to_csv("ttm-r2.csv", index=False)
diff --git a/examples/ttm-r2/requirements.txt b/examples/ttm-r2/requirements.txt
deleted file mode 100644
index 93d3776..0000000
--- a/examples/ttm-r2/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-tsfm-public @ git+https://github.com/ibm-granite/granite-tsfm.git@9484de7a81e9d202ca024ee7924b83a796661dca
-# GluonTS used for missing value imputation
-gluonts==0.16.0
diff --git a/src/fev/__about__.py b/src/fev/__about__.py
index 2852f8e..55600b8 100644
--- a/src/fev/__about__.py
+++ b/src/fev/__about__.py
@@ -1,2 +1,2 @@
 # We cannot store __version__ in fev/__init__.py since that will introduce circular dependencies
-__version__ = "0.8.0rc1"
+__version__ = "0.8.0"

From 8cb9bbaec54ddb7ffb4ee51a57aa2bdcfa34b36e Mon Sep 17 00:00:00 2001
From: Oleksandr Shchur <shchuro@amazon.com>
Date: Tue, 19 May 2026 16:17:54 +0000
Subject: [PATCH 2/3] Update tutorial

---
 docs/tutorials/05-add-your-model.ipynb | 32 ++++++++------------------
 1 file changed, 10 insertions(+), 22 deletions(-)

diff --git a/docs/tutorials/05-add-your-model.ipynb b/docs/tutorials/05-add-your-model.ipynb
index d577e46..113a8f5 100644
--- a/docs/tutorials/05-add-your-model.ipynb
+++ b/docs/tutorials/05-add-your-model.ipynb
@@ -77,17 +77,7 @@
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": [
-    "### Step 3: Add `requirements.txt`\n",
-    "\n",
-    "List pinned dependencies for your model. These are installed automatically in an ephemeral environment when running `evaluate.py` — your project environment is not modified.\n",
-    "\n",
-    "```\n",
-    "# models/my-model/requirements.txt\n",
-    "my-forecasting-lib==1.2.3\n",
-    "torch>=2.0\n",
-    "```"
-   ]
+   "source": "### Step 3: Add `requirements.txt`\n\nList pinned dependencies for your model. Pin the main packages to exact versions for reproducibility. These are installed automatically in an ephemeral environment when running `evaluate.py` — your project environment is not modified.\n\n```\n# models/my-model/requirements.txt\nmy-forecasting-lib==1.2.3\ntorch==2.7\n```"
   },
   {
    "cell_type": "markdown",
@@ -141,16 +131,14 @@
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": [
-    "Predictions cannot contain any `NaN` values.\n",
-    "\n",
-    "### Tips\n",
-    "\n",
-    "- If your model generates probabilistic forecasts, choose the \"optimal\" point forecast for the `task.eval_metric`. For example, metrics like `\"MSE\"` prefer the mean, while `\"MASE\"` is optimized by the median.\n",
-    "- Use `fev.convert_input_data()` to take advantage of adapters and reduce boilerplate preprocessing.\n",
-    "- Make sure your wrapper handles missing values (or imputes them before passing data to the model).\n",
-    "- Take advantage of extra features available via `task.static_columns`, `task.dynamic_columns`, `task.known_dynamic_columns`, and `task.past_dynamic_columns`."
-   ]
+   "source": "Predictions cannot contain any missing values represented by `NaN`, otherwise an exception will be raised.\n\nOther than what's described above, there are no hard restrictions on how `_fit_predict` needs to be implemented. For example, it's completely up to you whether the method uses any dataset columns except the target or how the data is preprocessed.\n\nStill, here is some general advice:\n\n- If your model is capable of generating probabilistic forecasts, make sure that you use the \"optimal\" point forecast for the `task.eval_metric`. For example, for metrics like `\"MSE\"` or `\"RMSSE\"`, the mean forecast is preferred, while metrics like `\"MASE\"` are optimized by the median forecast.\n- Use `fev.convert_input_data()` to take advantage of the adapters and reduce the boilerplate preprocessing code.\n- Make sure that your wrapper can deal with missing values (or at least imputes them before passing the data to your model).\n- Make sure that your wrapper takes advantage of the extra features of the task. For example, the following attributes might be useful:"
+  },
+  {
+   "cell_type": "code",
+   "source": "print(f\"{task.static_columns=}\")\nprint(f\"{task.dynamic_columns=}\")\nprint(f\"{task.known_dynamic_columns=}\")\nprint(f\"{task.past_dynamic_columns=}\")\n# Attributes available after `task.load_full_dataset` is called\ntask.load_full_dataset()\nprint(f\"{task.freq=}\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
   },
   {
    "cell_type": "markdown",
@@ -206,4 +194,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
\ No newline at end of file

From f6ce15135e09347956e604039363c8d6a50ef052 Mon Sep 17 00:00:00 2001
From: Oleksandr Shchur <shchuro@amazon.com>
Date: Tue, 19 May 2026 16:30:57 +0000
Subject: [PATCH 3/3] Update tutorial

---
 docs/tutorials/05-add-your-model.ipynb | 118 +++++++++++++++++--------
 1 file changed, 80 insertions(+), 38 deletions(-)

diff --git a/docs/tutorials/05-add-your-model.ipynb b/docs/tutorials/05-add-your-model.ipynb
index 113a8f5..52e73b2 100644
--- a/docs/tutorials/05-add-your-model.ipynb
+++ b/docs/tutorials/05-add-your-model.ipynb
@@ -6,7 +6,7 @@
    "source": [
     "This notebook covers the following topics:\n",
     "1. Adding a wrapper for your model to [fev/models](https://github.com/autogluon/fev/tree/main/models).\n",
-    "2. Submitting the results for your model to the [fev-leaderboard](https://huggingface.co/spaces/autogluon/fev-leaderboard)."
+    "2. Submitting the results for your model to the [`fev-bench` leaderboard](https://huggingface.co/spaces/autogluon/fev-bench)."
    ]
   },
   {
@@ -77,7 +77,17 @@
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": "### Step 3: Add `requirements.txt`\n\nList pinned dependencies for your model. Pin the main packages to exact versions for reproducibility. These are installed automatically in an ephemeral environment when running `evaluate.py` — your project environment is not modified.\n\n```\n# models/my-model/requirements.txt\nmy-forecasting-lib==1.2.3\ntorch==2.7\n```"
+   "source": [
+    "### Step 3: Add `requirements.txt`\n",
+    "\n",
+    "List the dependencies for your model, pinning the main packages to exact versions for reproducibility. These are installed automatically in an ephemeral environment when running `evaluate.py` — your project environment is not modified.\n",
+    "\n",
+    "```\n",
+    "# models/my-model/requirements.txt\n",
+    "my-forecasting-lib==1.2.3\n",
+    "torch==2.7\n",
+    "```"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -90,18 +100,34 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'predictions': Sequence(feature=Value(dtype='float64', id=None), length=13, id=None),\n",
+       " '0.1': Sequence(feature=Value(dtype='float64', id=None), length=13, id=None),\n",
+       " '0.2': Sequence(feature=Value(dtype='float64', id=None), length=13, id=None),\n",
+       " '0.3': Sequence(feature=Value(dtype='float64', id=None), length=13, id=None),\n",
+       " '0.4': Sequence(feature=Value(dtype='float64', id=None), length=13, id=None),\n",
+       " '0.5': Sequence(feature=Value(dtype='float64', id=None), length=13, id=None),\n",
+       " '0.6': Sequence(feature=Value(dtype='float64', id=None), length=13, id=None),\n",
+       " '0.7': Sequence(feature=Value(dtype='float64', id=None), length=13, id=None),\n",
+       " '0.8': Sequence(feature=Value(dtype='float64', id=None), length=13, id=None),\n",
+       " '0.9': Sequence(feature=Value(dtype='float64', id=None), length=13, id=None)}"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "import fev\n",
     "\n",
-    "task = fev.Task(\n",
-    "    dataset_path=\"autogluon/chronos_datasets\",\n",
-    "    dataset_config=\"monash_rideshare\",\n",
-    "    target=\"price_mean\",\n",
-    "    horizon=30,\n",
-    ")\n",
+    "benchmark = fev.Benchmark.from_yaml(\"https://raw.githubusercontent.com/autogluon/fev/refs/tags/v0.7.0/benchmarks/fev_bench/tasks.yaml\")\n",
+    "task = [t for t in benchmark.tasks if t.task_name == \"rossmann_1W\"][0]\n",
     "task.predictions_schema"
    ]
   },
@@ -109,37 +135,45 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "For probabilistic forecasting tasks (when `task.quantile_levels` is set), predictions must additionally contain quantile forecasts:"
+    "Predictions cannot contain any missing values (represented by `NaN`); otherwise, an exception will be raised.\n",
+    "\n",
+    "Other than what's described above, there are no hard restrictions on how `_fit_predict` needs to be implemented. For example, it's completely up to you whether the method uses any dataset columns except the target or how the data is preprocessed.\n",
+    "\n",
+    "Still, here is some general advice:\n",
+    "\n",
+    "- If your model is capable of generating probabilistic forecasts, make sure that you use the \"optimal\" point forecast for the `task.eval_metric`. For example, for metrics like `\"MSE\"` or `\"RMSSE\"`, the mean forecast is preferred, while metrics like `\"MASE\"` are optimized by the median forecast.\n",
+    "- Use `fev.convert_input_data()` to take advantage of the adapters and reduce the boilerplate preprocessing code.\n",
+    "- Make sure that your wrapper can deal with missing values (or at least imputes them before passing the data to your model).\n",
+    "- Make sure that your wrapper takes advantage of the extra features of the task. For example, the following attributes might be useful:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "task.static_columns=['Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'Store', 'StoreType']\n",
+      "task.dynamic_columns=['Open', 'Promo', 'SchoolHoliday', 'StateHoliday', 'Customers']\n",
+      "task.known_dynamic_columns=['Open', 'Promo', 'SchoolHoliday', 'StateHoliday']\n",
+      "task.past_dynamic_columns=['Customers']\n",
+      "task.freq='W-SUN'\n"
+     ]
+    }
+   ],
    "source": [
-    "task = fev.Task(\n",
-    "    dataset_path=\"autogluon/chronos_datasets\",\n",
-    "    dataset_config=\"monash_rideshare\",\n",
-    "    target=\"price_mean\",\n",
-    "    horizon=30,\n",
-    "    quantile_levels=[0.1, 0.5, 0.9],\n",
-    ")\n",
-    "task.predictions_schema"
+    "print(f\"{task.static_columns=}\")\n",
+    "print(f\"{task.dynamic_columns=}\")\n",
+    "print(f\"{task.known_dynamic_columns=}\")\n",
+    "print(f\"{task.past_dynamic_columns=}\")\n",
+    "# Attributes available after `task.load_full_dataset` is called\n",
+    "task.load_full_dataset()\n",
+    "print(f\"{task.freq=}\")"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": "Predictions cannot contain any missing values represented by `NaN`, otherwise an exception will be raised.\n\nOther than what's described above, there are no hard restrictions on how `_fit_predict` needs to be implemented. For example, it's completely up to you whether the method uses any dataset columns except the target or how the data is preprocessed.\n\nStill, here is some general advice:\n\n- If your model is capable of generating probabilistic forecasts, make sure that you use the \"optimal\" point forecast for the `task.eval_metric`. For example, for metrics like `\"MSE\"` or `\"RMSSE\"`, the mean forecast is preferred, while metrics like `\"MASE\"` are optimized by the median forecast.\n- Use `fev.convert_input_data()` to take advantage of the adapters and reduce the boilerplate preprocessing code.\n- Make sure that your wrapper can deal with missing values (or at least imputes them before passing the data to your model).\n- Make sure that your wrapper takes advantage of the extra features of the task. For example, the following attributes might be useful:"
-  },
-  {
-   "cell_type": "code",
-   "source": "print(f\"{task.static_columns=}\")\nprint(f\"{task.dynamic_columns=}\")\nprint(f\"{task.known_dynamic_columns=}\")\nprint(f\"{task.past_dynamic_columns=}\")\n# Attributes available after `task.load_full_dataset` is called\ntask.load_full_dataset()\nprint(f\"{task.freq=}\")",
-   "metadata": {},
-   "execution_count": null,
-   "outputs": []
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -164,7 +198,7 @@
    "source": [
     "## Submitting results to the leaderboard\n",
     "\n",
-    "After implementing your model wrapper, follow these steps to submit results to the [fev-leaderboard](https://huggingface.co/spaces/autogluon/fev-leaderboard):\n",
+    "After implementing your model wrapper, follow these steps to submit results to the [`fev-bench` leaderboard](https://huggingface.co/spaces/autogluon/fev-bench):\n",
     "\n",
     "1. Fork [`autogluon/fev`](https://github.com/autogluon/fev) and clone your fork.\n",
     "2. Implement your model wrapper in `models/<name>/`.\n",
@@ -177,21 +211,29 @@
     "   - `models/<name>/model.py`\n",
     "   - `models/<name>/requirements.txt`\n",
     "   - `benchmarks/fev_bench/results/<name>.csv`\n",
-    "5. We will independently reproduce the results using your code and add them to the leaderboard."
+    "5. We will independently validate the results using your code and add them to the leaderboard."
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": ".venv",
    "language": "python",
    "name": "python3"
   },
   "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
    "name": "python",
-   "version": "3.11.0"
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.15"
   }
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
\ No newline at end of file
+}