diff --git a/.gitignore b/.gitignore
index 3565dc89c..ec69137bc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,4 +20,4 @@ tests/__pycache__
 tests/data/*
 .vscode/
 *ipynb_checkpoints/
-docs/user_guide/deepforestr.md
\ No newline at end of file
+docs/user_guide/deepforestr.md
diff --git a/docs/user_guide/11_training.md b/docs/user_guide/11_training.md
index 1845ae54d..95dc21c4c 100644
--- a/docs/user_guide/11_training.md
+++ b/docs/user_guide/11_training.md
@@ -429,17 +429,27 @@ for tile in tiles_to_predict:
 Usually creating this object does not cost too much computational time.
 
-#### Training across multiple nodes on a HPC system
+#### Training across multiple nodes/GPUs
 
-We have heard that this error can appear when trying to deep copy the pytorch lightning module. The trainer object is not pickleable.
-For example, on multi-gpu environments when trying to scale the deepforest model the entire module is copied leading to this error.
-Setting the trainer object to None and directly using the pytorch object is a reasonable workaround.
+If you have access to an HPC system or cluster, or simply a powerful desktop with multiple local GPUs, you may want to take advantage of them. Fortunately, DeepForest uses Lightning, which handles most of the distributed-processing issues for you. Let's call the number of nodes "N" and the number of GPUs per node "M". A common setup is a single node with up to `M=8` GPUs, but you may need to split processing between machines, in which case you'd have multiple nodes.
+
+If you're using a job manager like SLURM, you can express the number of GPUs via a configuration and the "allowed" device IDs will be passed to Lightning. On a local machine, Lightning will attempt to acquire whatever resources it can, unless you override this by specifying the `devices` argument, which can be a list. **On a managed cluster, do not do this: rely on 'auto' and let the scheduler inform Lightning which GPUs are available.** The reason is that some clusters cannot isolate GPU devices to jobs the way they can with CPU cores, and you can interfere with other people's jobs if you try to acquire a device that wasn't allocated to you.
+
+In most cases the only thing you need to set is the training strategy, "DDP" (distributed data parallel). PyTorch covers DDP in depth in its own documentation, but here is a brief summary with some practical tips. When training starts, `N*M` copies of your program will be created. In DDP, the training dataset is sharded/split between these processes, so each epoch will take `len(dataset)/batch_size/(N*M)` steps.
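+As a quick sanity check on that arithmetic, here is a hypothetical worked example (illustrative numbers only, not DeepForest API):
+
+```python
+# Two nodes with four GPUs each -> a world size of 8 processes.
+n_nodes, gpus_per_node = 2, 4
+world_size = n_nodes * gpus_per_node
+
+dataset_size, batch_size = 10_000, 8
+steps_per_epoch = dataset_size // batch_size // world_size
+print(steps_per_epoch)  # 156 steps per process per epoch
+```
+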
+At the end of each forward pass, all of the processes are synchronized and the gradients are combined.
 
-Replace
 ```python
 m = main.deepforest()
-m.create_trainer()
+m.create_trainer(
+    logger=loggers,
+    callbacks=callbacks,
+    gradient_clip_val=0.5,
+    accelerator=config.accelerator,
+    strategy="ddp_find_unused_parameters_true"
+    if torch.cuda.is_available()
+    else "auto",
+    devices='auto'
+)
 m.trainer.fit(m)
 ```
@@ -449,14 +459,16 @@ with
 m.trainer = None
 from pytorch_lightning import Trainer
 
-    trainer = Trainer(
-        accelerator="gpu",
-        strategy="ddp",
-        devices=model.config.devices,
-        enable_checkpointing=False,
-        max_epochs=model.config.train.epochs,
-        logger=comet_logger
-    )
+trainer = Trainer(
+    accelerator="gpu",
+    strategy="ddp_find_unused_parameters_true",
+    devices=model.config.devices,
+    enable_checkpointing=False,
+    max_epochs=model.config.train.epochs,
+    logger=comet_logger
+)
+
+
 trainer.fit(m)
 ```
 
@@ -464,6 +476,19 @@
 The added benefit of this is more control over the trainer object. The downside is that it doesn't align with the .config pattern, where a user now has to look into the config to create the trainer.
 We are open to changing this to be the default pattern in the future and welcome input from users.
 
+#### Visualization during training
+
+Visualizing images during training can be valuable for spotting augmentations that aren't working as you expected, catching label issues, and seeing whether the model is learning anything. To make this easy, we provide a Lightning callback that can be used with the trainer: `deepforest.callbacks.ImagesCallback`. You need to provide a directory path where the images will be saved, which can be a temporary path if you don't want to keep the images. To use it, create the callback object and pass it to `create_trainer` along with any other callbacks you need.
+
+```python
+from deepforest import callbacks
+
+im_callback = callbacks.ImagesCallback(save_dir=tmpdir, every_n_epochs=2)
+m.create_trainer(callbacks=[im_callback])
+```
+
+The callback will, by default, log images to disk. When training starts, it will save images for the training and validation dataset (if available). Then, at a user-specified interval (`every_n_epochs`), predictions will be logged with ground truth. If you have Comet or Tensorboard loggers (loggers which accept `add_image` or `log_image`), the callback will attempt to log to those. Due to auto-discovery behavior with Comet, the callback will preferentially log to Tensorboard if present, to avoid images being pushed to Comet twice. To adjust the number of samples saved, modify `dataset_samples` for dataset snapshots (set to 0 to disable) and `sample_batches`/`images_per_batch` for prediction snapshots.
+
 #### Training via command line
 
 We provide a basic script to trigger a training run via CLI. This script is installed as part of the standard DeepForest installation and is called `deepforest train`.
We use [Hydra](https://hydra.cc/docs/intro/) for configuration management and you can pass configuration parameters as command line arguments as follows:
diff --git a/pyproject.toml b/pyproject.toml
index f683e1b2a..da50ce8d2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
     "h5py",
     "huggingface_hub>=0.25.0",
     "hydra-core",
+    "geopandas>=1.0.0",
     "matplotlib",
     "numpy<2.0",
     "omegaconf",
@@ -46,21 +47,38 @@ dependencies = [
     "pillow>6.2.0",
     "psutil",
     "pycocotools",
-    "pytorch-lightning>=2.1.0,<3.0.0",
+    "pytorch-lightning>=2.5.5,<3.0.0",
     "pyyaml>=5.1.0",
     "rasterio",
-    "rtree",
     "safetensors<0.6.0",
+    "shapely>2.0.0",
     "setuptools",
     "slidingwindow",
     "supervision",
     "tensorboard",
     "timm",
-    "torch>=2.2.0,<2.3.0",
-    "torchvision>=0.17.0,<0.18.0",
+    "torch>=2.7.0",
+    "torchvision>=0.17.0",
     "tqdm",
-    "transformers",
+    "transformers>=4.56",
     "xmltodict",
+    "timm>=1.0.15",
+    "faster-coco-eval>=1.6.7",
+    "comet-ml>=3.51.0",
+]
+
+[[tool.uv.index]]
+name = "pytorch-cu128"
+url = "https://download.pytorch.org/whl/cu128"
+explicit = true
+
+[tool.uv.sources]
+torch = [
+    { index = "pytorch-cu128", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+]
+torchvision = [
+    { index = "pytorch-cu128", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
 ]
 
 [project.urls]
@@ -101,6 +119,7 @@ docs = [
 [project.scripts]
 deepforest = "deepforest.scripts.cli:main"
+deepforest-evaluate = "deepforest.scripts.evaluate:main"
 
 [build-system]
 requires = ["setuptools>=61.0", "wheel"]
diff --git a/src/deepforest/IoU.py b/src/deepforest/IoU.py
index fa1559924..bfa217ca1 100644
--- a/src/deepforest/IoU.py
+++ b/src/deepforest/IoU.py
@@ -2,122 +2,124 @@
 IoU Module, with help from
 https://github.com/SpaceNetChallenge/utilities/blob/spacenetV3/spacenetutilities/evalTools.py
 """
+import geopandas as gpd
 import numpy as np
 import pandas as pd
-import rtree
+import shapely
 from scipy.optimize import linear_sum_assignment
+from shapely import STRtree
 
 
-def create_rtree_from_poly(poly_list):
-    # create index
-    index = rtree.index.Index(interleaved=True)
-    for idx, geom in enumerate(poly_list):
-        index.insert(idx, geom.bounds)
-
-    return index
-
-
-def _overlap_(test_poly, truth_polys, rtree_index):
-    """Calculate overlap between one polygon and all ground truth by area."""
-    prediction_id = []
-    truth_id = []
-    area = []
-    matched_list = list(rtree_index.intersection(test_poly.geometry.bounds))
-    for index in truth_polys.index:
-        if index in matched_list:
-            # get the original index just to be sure
-            intersection_result = test_poly.geometry.intersection(
-                truth_polys.loc[index].geometry
-            )
-            intersection_area = intersection_result.area
-        else:
-            intersection_area = 0
-
-        prediction_id.append(test_poly.prediction_id)
-        truth_id.append(truth_polys.loc[index].truth_id)
-        area.append(intersection_area)
-
-    results = pd.DataFrame(
-        {"prediction_id": prediction_id, "truth_id": truth_id, "area": area}
-    )
-    return results
+def _overlap_all(test_polys: "gpd.GeoDataFrame", truth_polys: "gpd.GeoDataFrame"):
+    """Computes intersection and union areas for all polygons in the test/truth
+    dataframes.
-
-def _overlap_all(test_polys, truth_polys, rtree_index):
-    """Find area of overlap among all sets of ground truth and prediction."""
-    results = []
-    for _index, row in test_polys.iterrows():
-        result = _overlap_(
-            test_poly=row, truth_polys=truth_polys, rtree_index=rtree_index
+    Return NumPy arrays:
+        intersections : (n_truth, n_pred) intersection areas
+        unions : (n_truth, n_pred) union areas
+        truth_ids : (n_truth,) truth index values (order matches rows of intersections/unions)
+        pred_ids : (n_pred,) prediction index values (order matches cols of intersections/unions)
+    """
+    # geometry arrays
+    pred_geoms = np.asarray(test_polys.geometry.values, dtype=object)
+    truth_geoms = np.asarray(truth_polys.geometry.values, dtype=object)
+
+    pred_ids = test_polys.index.to_numpy()
+    truth_ids = truth_polys.index.to_numpy()
+
+    n_pred = pred_geoms.size
+    n_truth = truth_geoms.size
+
+    # empty cases
+    if n_pred == 0 or n_truth == 0:
+        return (
+            np.zeros((n_truth, n_pred), dtype=float),
+            np.zeros((n_truth, n_pred), dtype=float),
+            truth_ids,
+            pred_ids,
         )
-        results.append(result)
-    results = pd.concat(results, ignore_index=True)
-    return results
 
+    # spatial index on truth
+    tree = STRtree(truth_geoms)
+    p_idx, t_idx = tree.query(pred_geoms, predicate="intersects")  # shape (2, M)
 
-def _iou_(test_poly, truth_poly):
-    """Intersection over union."""
-    intersection_result = test_poly.intersection(truth_poly.geometry)
-    intersection_area = intersection_result.area
-    union_area = test_poly.union(truth_poly.geometry).area
-    return intersection_area / union_area
+    intersections = np.zeros((n_truth, n_pred), dtype=float)
+    unions = np.zeros((n_truth, n_pred), dtype=float)
 
+    if p_idx.size:
+        inter = shapely.intersection(truth_geoms[t_idx], pred_geoms[p_idx])
+        uni = shapely.union(truth_geoms[t_idx], pred_geoms[p_idx])
+        intersections[t_idx, p_idx] = shapely.area(inter)
+        unions[t_idx, p_idx] = shapely.area(uni)
 
-def compute_IoU(ground_truth, submission):
-    """
-    Args:
-        ground_truth: a projected geopandas dataframe with geoemtry
-        submission: a projected geopandas dataframe with geometry
-    Returns:
-        iou_df: dataframe of IoU scores
-    """
-    # Create index columns for ease
-    ground_truth["truth_id"] = ground_truth.index.values
-    submission["prediction_id"] = submission.index.values
-
-    # rtree_index
-    rtree_index = create_rtree_from_poly(ground_truth.geometry)
-
-    # find overlap among all sets
-    overlap_df = _overlap_all(
-        test_polys=submission, truth_polys=ground_truth, rtree_index=rtree_index
+    return intersections, unions, truth_ids, pred_ids
+
+
+def compute_IoU(ground_truth: "gpd.GeoDataFrame", submission: "gpd.GeoDataFrame"):
+    # Compute truth <> prediction overlaps
+    intersections, unions, truth_ids, pred_ids = _overlap_all(
+        test_polys=submission, truth_polys=ground_truth
     )
 
-    # Create cost matrix for assignment
-    matrix = overlap_df.pivot(
-        index="truth_id", columns="prediction_id", values="area"
-    ).values
+    # Cost matrix is the intersection area
+    matrix = intersections
+
+    if matrix.size == 0:
+        # No matches, early exit
+        return pd.DataFrame(
+            {
+                "prediction_id": pd.Series(dtype="float64"),
+                "truth_id": pd.Series(dtype=truth_ids.dtype),
+                "IoU": pd.Series(dtype="float64"),
+                "score": pd.Series(dtype="float64"),
+                "geometry": pd.Series(dtype=object),
+            }
+        )
 
+    # Linear sum assignment + match lookup
     row_ind, col_ind = linear_sum_assignment(matrix, maximize=True)
+    match_for_truth = dict(zip(row_ind, col_ind, strict=False))
+
+    # Score lookup
+    pred_scores = submission["score"].to_dict() if "score" in submission.columns else {}
+
+    # IoU matrix
+    with np.errstate(divide="ignore", invalid="ignore"):
+        iou_mat = np.divide(
+            intersections,
+            unions,
+            out=np.zeros_like(intersections, dtype=float),
+            where=unions > 0,
+        )
 
-    # Create IoU dataframe, match those predictions and ground truth, IoU = 0
-    # for all others, they will get filtered out
-    iou_df = []
-    for index, _row in ground_truth.iterrows():
-        if index in row_ind:
-            matched_id = col_ind[np.where(index == row_ind)[0][0]]
-            iou = _iou_(
-                submission[submission.prediction_id == matched_id],
-                ground_truth.loc[index],
-            )
-            score = submission[submission.prediction_id == matched_id].score.values[0]
+    # build rows for every truth element (unmatched => None, IoU 0)
+    records = []
+    for t_idx, truth_id in enumerate(truth_ids):
+        # If we matched this truth box
+        if t_idx in match_for_truth:
+            # Look up matching prediction and corresponding IoU and score
+            p_idx = match_for_truth[t_idx]
+            pred_id = pred_ids[p_idx]
+            iou = float(iou_mat[t_idx, p_idx])
+            score = pred_scores.get(pred_id, None)
         else:
-            iou = 0
-            matched_id = None
+            pred_id = None
+            iou = 0.0
             score = None
 
-        iou_df.append(
-            pd.DataFrame(
-                {
-                    "prediction_id": [matched_id],
-                    "truth_id": [index],
-                    "IoU": iou,
-                    "score": score,
-                }
-            )
+        records.append(
+            {
+                "prediction_id": pred_id,
+                "truth_id": truth_id,
+                "IoU": iou,
+                "score": score,
+            }
         )
 
-    iou_df = pd.concat(iou_df)
-    iou_df = iou_df.merge(ground_truth[["truth_id", "geometry"]])
-
+    # Output dataframe
+    iou_df = pd.DataFrame.from_records(records)
+    iou_df = iou_df.merge(
+        ground_truth.assign(truth_id=truth_ids)[["truth_id", "geometry"]],
+        on="truth_id",
+        how="left",
+    )
     return iou_df
diff --git a/src/deepforest/callbacks.py b/src/deepforest/callbacks.py
deleted file mode 100644
index 310dd5dd8..000000000
--- a/src/deepforest/callbacks.py
+++ /dev/null
@@ -1,86 +0,0 @@
-"""DeepForest callback for logging images during training.
-
-Callbacks must implement on_epoch_begin, on_epoch_end, on_fit_end,
-on_fit_begin methods and inject model and epoch kwargs.
-"""
-
-import glob
-
-import numpy as np
-import supervision as sv
-from pytorch_lightning import Callback
-
-from deepforest import visualize
-
-
-class images_callback(Callback):
-    """Log evaluation images during training.
-
-    Args:
-        savedir: Directory to save predicted images
-        n: Number of images to process
-        every_n_epochs: Run interval in epochs
-        select_random: Whether to select random images
-        color: Bounding box color as BGR tuple
-        thickness: Border line thickness in pixels
-    """
-
-    def __init__(
-        self, savedir, n=2, every_n_epochs=5, select_random=False, color=None, thickness=1
-    ):
-        self.savedir = savedir
-        self.n = n
-        self.color = color
-        self.thickness = thickness
-        self.select_random = select_random
-        self.every_n_epochs = every_n_epochs
-
-    def log_images(self, pl_module):
-        """Log images to the logger."""
-        df = pl_module.predictions
-
-        # Limit to n images, potentially randomly selected
-        if self.select_random:
-            selected_images = np.random.choice(df.image_path.unique(), self.n)
-        else:
-            selected_images = df.image_path.unique()[: self.n]
-        df = df[df.image_path.isin(selected_images)]
-
-        # Add root_dir to the dataframe
-        if "root_dir" not in df.columns:
-            df["root_dir"] = pl_module.config.validation.root_dir
-
-        # Ensure color is correctly assigned
-        if self.color is None:
-            num_classes = len(df["label"].unique())
-            results_color = sv.ColorPalette.from_matplotlib("viridis", num_classes)
-        else:
-            results_color = self.color
-
-        # Plot results
-        visualize.plot_results(
-            results=df,
-            savedir=self.savedir,
-            results_color=results_color,
-            thickness=self.thickness,
-        )
-
-        try:
-            saved_plots = glob.glob(f"{self.savedir}/*.png")
-            for x in saved_plots:
-                pl_module.logger.experiment.log_image(x)
-        except Exception as e:
-            print(
-                "Could not find comet logger in lightning module, "
-                f"skipping upload, images were saved to {self.savedir}, "
-                f"error was raised {e}"
-            )
-
-    def on_validation_end(self, trainer, pl_module):
-        """Run callback at validation end."""
-        if trainer.sanity_checking:
-            return
-
-        if trainer.current_epoch % self.every_n_epochs == 0:
-            print("Running image callback")
-            self.log_images(pl_module)
diff --git a/src/deepforest/callbacks/__init__.py b/src/deepforest/callbacks/__init__.py
new file mode 100644
index 000000000..a08da1553
--- /dev/null
+++ b/src/deepforest/callbacks/__init__.py
@@ -0,0 +1,11 @@
+"""DeepForest callbacks for training monitoring and logging.
+
+This module contains PyTorch Lightning callbacks for various training tasks:
+- ImagesCallback: Log evaluation images during training
+- EvaluationCallback: Accumulate validation predictions and save to disk
+"""
+
+from .evaluation import EvaluationCallback
+from .images import ImagesCallback, images_callback
+
+__all__ = ["ImagesCallback", "images_callback", "EvaluationCallback"]
diff --git a/src/deepforest/callbacks/evaluation.py b/src/deepforest/callbacks/evaluation.py
new file mode 100644
index 000000000..d4b2d88cf
--- /dev/null
+++ b/src/deepforest/callbacks/evaluation.py
@@ -0,0 +1,268 @@
+import gzip
+import json
+import os
+import shutil
+import tempfile
+import warnings
+from glob import glob
+from pathlib import Path
+
+import torch
+from pytorch_lightning import Callback, Trainer
+from pytorch_lightning.core import LightningModule
+
+
+class EvaluationCallback(Callback):
+    """Accumulate validation predictions per batch, write one shard per rank,
+    optionally merge shards on rank 0, and optionally run evaluation.
+
+    File names:
+        - Shards: predictions_epoch_{E}_rank{R}.csv[.gz]
+        - Merged: predictions_epoch_{E}.csv[.gz]
+        - Meta: predictions_epoch_{E}_metadata.json
+    """
+
+    def __init__(
+        self,
+        save_dir: str | None = None,
+        every_n_epochs: int = 5,
+        iou_threshold: float = 0.4,
+        run_evaluation: bool = False,
+        compress: bool = False,
+    ) -> None:
+        super().__init__()
+        self._user_save_dir = save_dir
+        self.compress = compress
+        self.every_n_epochs = every_n_epochs
+        self.iou_threshold = iou_threshold
+        self.run_evaluation = run_evaluation
+
+        self.save_dir: Path | None = None
+        self._is_temp = save_dir is None
+        self._rank_base: Path | None = None
+        self.csv_file = None
+        self.csv_path: Path | None = None
+        self.predictions_written = 0  # rows written by *this rank* this epoch
+
+    def _active_epoch(self, trainer: Trainer) -> bool:
+        e = trainer.current_epoch + 1
+        return not (
+            trainer.sanity_checking
+            or trainer.fast_dev_run
+            or self.every_n_epochs == -1
+            or (e % self.every_n_epochs != 0)
+        )
+
+    def _open_writer(self, path: Path):
+        if self.compress:
+            return gzip.open(path, "wt", encoding="utf-8")
+        return open(path, "w", encoding="utf-8")
+
+    def setup(
+        self, trainer: Trainer, pl_module: LightningModule, stage: str | None = None
+    ):
+        # Rank 0 creates/determines the save directory, then broadcasts to all ranks
+        # This ensures all ranks write shards to the same location
+        if trainer.is_global_zero:
+            if self._is_temp:
+                base = Path(tempfile.mkdtemp(prefix="preds_"))
+                self._rank_base = base
+                self.save_dir = base
+            else:
+                self.save_dir = Path(self._user_save_dir)  # type: ignore[arg-type]
+                self.save_dir.mkdir(parents=True, exist_ok=True)
+
+        # Broadcast the directory from rank 0 to all other ranks
+        if trainer.world_size > 1:
+            save_dir_str = str(self.save_dir) if trainer.is_global_zero else None
+            save_dir_str = trainer.strategy.broadcast(save_dir_str, src=0)
+            self.save_dir = Path(save_dir_str)
+
+            # Non-rank-0 processes ensure the directory exists
+            if not trainer.is_global_zero:
+                self.save_dir.mkdir(parents=True, exist_ok=True)
+
+    def on_validation_epoch_start(
+        self, trainer: Trainer, pl_module: LightningModule
+    ) -> None:
+        if self._active_epoch(trainer):
+            epoch = trainer.current_epoch + 1
+            rank = trainer.global_rank
+            suffix = ".csv.gz" if self.compress else ".csv"
+            self.csv_path = (
+                self.save_dir / f"predictions_epoch_{epoch}_rank{rank}{suffix}"
+            )
+            self.csv_file = self._open_writer(self.csv_path)
+        else:
+            self.csv_path = None
+            self.csv_file = None
+
+        self.predictions_written = 0
+
+        trainer.strategy.barrier()
+
+    def on_validation_batch_end(
+        self,
+        trainer: Trainer,
+        pl_module: LightningModule,
+        outputs,
+        batch,
+        batch_idx: int,
+        dataloader_idx: int = 0,
+    ) -> None:
+        if not self._active_epoch(trainer) or self.csv_file is None:
+            return
+        # expected: pl_module.last_preds is list[pd.DataFrame]
+        batch_preds = getattr(pl_module, "last_preds", None)
+        if not batch_preds:
+            return
+        for df in batch_preds:
+            if df is None or df.empty:
+                continue
+            df.to_csv(self.csv_file, index=False, header=(self.predictions_written == 0))
+            self.predictions_written += len(df)
+
+    def on_validation_epoch_end(
+        self, trainer: Trainer, pl_module: LightningModule
+    ) -> None:
+        strategy = trainer.strategy
+        world_size = strategy.world_size
+
+        if self.csv_file is not None:
+            self.csv_file.close()
+            self.csv_file = None
+
+        strategy.barrier()  # all ranks finished writing
+
+        # Collect each rank's save_dir and row count
+        if (
+            world_size > 1
+            and torch.distributed.is_available()
+            and torch.distributed.is_initialized()
+        ):
+            rank_dirs: list[str | None] = [None] * world_size
+            rank_counts: list[int] = [0] * world_size
+            torch.distributed.all_gather_object(rank_dirs, str(self.save_dir))
+            torch.distributed.all_gather_object(
+                rank_counts, int(self.predictions_written)
+            )
+        else:
+            rank_dirs = [str(self.save_dir)]
+            rank_counts = [int(self.predictions_written)]
+
+        if self._active_epoch(trainer) and trainer.is_global_zero:
+            self._reduce_and_evaluate(
+                trainer, pl_module, [Path(d) for d in rank_dirs if d], sum(rank_counts)
+            )
+
+        strategy.barrier()  # allow rank 0 to finish
+
+    def teardown(
+        self, trainer: Trainer, pl_module: LightningModule, stage: str | None = None
+    ):
+        if self._is_temp and self._rank_base is not None:
+            shutil.rmtree(self._rank_base, ignore_errors=True)
+
+    def _reduce_and_evaluate(
+        self,
+        trainer: Trainer,
+        pl_module: LightningModule,
+        rank_dirs: list[Path],
+        total_written: int,
+    ) -> None:
+        epoch = trainer.current_epoch + 1
+        suffix = ".csv.gz" if self.compress else ".csv"
+
+        # Deduplicate rank_dirs
+        unique_dirs = list(dict.fromkeys(rank_dirs))
+        if len(unique_dirs) < len(rank_dirs):
+            warnings.warn(
+                f"Detected {len(rank_dirs) - len(unique_dirs)} duplicate directories "
+                f"in rank_dirs. This may indicate a configuration issue.",
+                stacklevel=2,
+            )
+
+        # discover shards
+        shard_paths: list[Path] = []
+        for d in unique_dirs:
+            pattern = str(d / f"predictions_epoch_{epoch}_rank*.csv")
+            if self.compress:
+                pattern += ".gz"
+            shard_paths.extend(sorted(Path(p) for p in glob(pattern)))
+
+        # Validate shard count matches world size
+        world_size = trainer.strategy.world_size
+        if len(shard_paths) != world_size:
+            warnings.warn(
+                f"Expected {world_size} shard files but found {len(shard_paths)}. "
+                f"Shards: {[p.name for p in shard_paths]}",
+                stacklevel=2,
+            )
+
+        merged_path = (
+            (self.save_dir / f"predictions_epoch_{epoch}{suffix}")
+            if shard_paths
+            else None
+        )
+
+        # stream-merge shards into a single file without repeating headers
+        if merged_path is not None:
+            merged_path.parent.mkdir(parents=True, exist_ok=True)
+            open_out = gzip.open if self.compress else open
+            with open_out(merged_path, "wt", encoding="utf-8") as out_f:
+                wrote_header = False
+                for shard in shard_paths:
+                    open_in = (
+                        gzip.open
+                        if shard.suffix == ".gz" or shard.suffixes[-2:] == [".csv", ".gz"]
+                        else open
+                    )
+                    with open_in(shard, "rt", encoding="utf-8") as in_f:
+                        for i, line in enumerate(in_f):
+                            if i == 0 and wrote_header:
+                                continue
+                            out_f.write(line)
+                    wrote_header = True
+
+        # metadata
+        cfg = getattr(pl_module, "config", None)
+        val = getattr(cfg, "validation", None)
+        meta = {
+            "epoch": epoch,
+            "current_step": trainer.global_step,
+            "predictions_count": int(total_written),
+            "target_csv_file": getattr(val, "csv_file", None),
+            "target_root_dir": getattr(val, "root_dir", None),
+            "shards": [str(p) for p in shard_paths],
+            "merged_predictions": str(merged_path) if merged_path else None,
+            "world_size": trainer.strategy.world_size,
+        }
+        with open(
+            self.save_dir / f"predictions_epoch_{epoch}_metadata.json",
+            "w",
+            encoding="utf-8",
+        ) as f:
+            json.dump(meta, f, indent=2)
+
+        # optional shard cleanup
+        for p in shard_paths:
+            try:
+                os.remove(p)
+            except OSError:
+                pass
+
+        # optional evaluation
+        if self.run_evaluation:
+            if merged_path and total_written > 0:
+                try:
+                    pl_module.evaluate(
+                        predictions=str(merged_path),
+                        csv_file=meta["target_csv_file"],
+                        iou_threshold=self.iou_threshold,
+                    )
+                except Exception as e:
+                    warnings.warn(f"Evaluation failed: {e}", stacklevel=2)
+            else:
+                warnings.warn(
+                    "No predictions written to disk, skipping evaluate.", stacklevel=2
+                )
diff --git a/src/deepforest/callbacks/images.py b/src/deepforest/callbacks/images.py
new file mode 100644
index 000000000..ef1b57026
--- /dev/null
+++ b/src/deepforest/callbacks/images.py
@@ -0,0 +1,278 @@
+"""Image logging callback for training monitoring.
+
+Callbacks must implement on_epoch_begin, on_epoch_end, on_fit_end,
+on_fit_begin methods and inject model and epoch kwargs.
+"""
+
+import json
+import os
+import random
+import warnings
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+import supervision as sv
+import torch
+from PIL import Image
+from pytorch_lightning import Callback
+
+from deepforest import utilities, visualize
+from deepforest.datasets.training import BoxDataset
+
+
+class ImagesCallback(Callback):
+    """Log evaluation images during training.
+
+    Args:
+        save_dir: Directory to save predicted images
+        sample_batches: Number of validation batches to sample predictions from
+        images_per_batch: Number of images to log per sampled batch
+        dataset_samples: Number of dataset samples to log at training start
+        every_n_epochs: Run interval in epochs
+        select_random: Whether to select batches at random
+        color: Bounding box color as BGR tuple
+        thickness: Border line thickness in pixels
+    """
+
+    def __init__(
+        self,
+        save_dir,
+        sample_batches=2,
+        images_per_batch=1,
+        dataset_samples=5,
+        every_n_epochs=5,
+        select_random=False,
+        color=None,
+        thickness=2,
+    ):
+        self.savedir = save_dir
+        self.sample_batches = sample_batches
+        self.dataset_samples = dataset_samples
+        self.color = color
+        self.thickness = thickness
+        self.select_random = select_random
+        self.every_n_epochs = every_n_epochs
+        self.num_val_batches = 0
+        self.images_per_batch = images_per_batch
+
+    def on_train_start(self, trainer, pl_module):
+        """Log sample images from training and validation datasets at training
+        start."""
+
+        if trainer.fast_dev_run:
+            return
+
+        self.trainer = trainer
+        self.pl_module = pl_module
+
+        # Training samples
+        pl_module.print("Logging training dataset samples.")
+        train_ds = trainer.train_dataloader.dataset
+        self._log_dataset_sample(train_ds, split="train")
+
+        # Validation samples
+        if trainer.val_dataloaders:
+            pl_module.print("Logging validation dataset samples.")
+            val_ds = trainer.val_dataloaders.dataset
+            self._log_dataset_sample(val_ds, split="validation")
+            self.num_val_batches = len(trainer.val_dataloaders)
+
+    def on_validation_start(self, trainer, pl_module):
+        """Pick batch indices for plotting, or skip."""
+        self.batch_indices = set()
+
+        if trainer.sanity_checking or trainer.fast_dev_run:
+            return
+
+        if (trainer.current_epoch + 1) % self.every_n_epochs == 0:
+            indices = list(range(self.num_val_batches))
+
+            if self.select_random:
+                random.shuffle(indices)
+
+            self.batch_indices = set(indices[: self.sample_batches])
+
+    def on_validation_batch_end(
+        self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx=0
+    ):
+        """Determine whether to sample predictions from this batch."""
+
+        if trainer.global_rank != 0:
+            return
+
+        # NB: Dataloader idx is the i'th dataloader
+        if batch_idx in self.batch_indices:
+            # Last predictions (validation_step)
+            _, batch_targets, image_names = batch
+            batch_preds = [p for p in pl_module.last_preds if p is not None]
+
+            if len(batch_preds) == 0:
+                warnings.warn(
+                    "No valid predictions found when logging images.", stacklevel=2
+                )
+
+            # Sample at most self.images_per_batch
+            for idx in range(min(len(batch_preds), self.images_per_batch)):
+                targets = utilities.format_geometry(batch_targets[idx], scores=False)
+                preds = batch_preds[idx].copy()
+                image_name = image_names[idx]
+
+                if preds.image_path.unique()[0] != image_name:
+                    warnings.warn(
+                        "Image names and predictions are out of sync, skipping sample.",
+                        stacklevel=2,
+                    )
+                else:
+                    self._log_prediction_sample(
+                        trainer, pl_module, preds, targets, image_name
+                    )
+
+    def _log_prediction_sample(self, trainer, pl_module, preds, targets, image_name):
+        """Log one prediction sample with its ground truth."""
+        dataset = trainer.val_dataloaders.dataset
+
+        # Add root_dir to the dataframe
+        if "root_dir" not in preds.columns:
+            preds["root_dir"] = dataset.root_dir
+
+        # Ensure color is correctly assigned
+        if self.color is None:
+            num_classes = len(preds["label"].unique())
+            results_color = sv.ColorPalette.from_matplotlib("viridis", num_classes)
+        else:
+            results_color = self.color
+
+        out_dir = os.path.join(self.savedir, "predictions")
+        os.makedirs(out_dir, exist_ok=True)
+
+        basename = Path(image_name).stem + f"_{trainer.global_step}"
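+        # The global-step suffix keeps successive logs of the same validation
+        # image from overwriting one another across epochs.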
+        fig = visualize.plot_results(
+            basename=basename,
+            results=preds,
+            ground_truth=targets,
+            savedir=out_dir,
+            results_color=results_color,
+            thickness=self.thickness,
+            show=False,
+        )
+        plt.close(fig)
+
+        # Prediction metadata, if supported.
+        stats = (
+            preds["score"]
+            .agg(
+                mean_confidence="mean",
+                max_confidence="max",
+                min_confidence="min",
+                std_confidence="std",
+            )
+            .to_dict()
+        )
+
+        metadata = {"pred_count": len(preds), "gt_count": len(targets)}
+        metadata.update(stats)
+
+        with open(os.path.join(out_dir, basename + ".json"), "w") as fp:
+            json.dump(metadata, fp, indent=1)
+
+        self._log_to_all(
+            image=os.path.join(out_dir, basename + ".png"),
+            trainer=trainer,
+            tag="prediction sample",
+            metadata=metadata,
+        )
+
+    def _log_dataset_sample(self, dataset: BoxDataset, split: str):
+        """Log random samples from a DeepForest BoxDataset."""
+
+        if self.dataset_samples == 0:
+            return
+
+        out_dir = os.path.join(self.savedir, split + "_sample")
+        os.makedirs(out_dir, exist_ok=True)
+        n_samples = min(self.dataset_samples, len(dataset))
+        sample_indices = torch.randperm(len(dataset))[:n_samples]
+
+        sample_data = [dataset[idx] for idx in sample_indices]
+        sample_images = [data[0] for data in sample_data]
+        sample_targets = [data[1] for data in sample_data]
+        sample_paths = [data[2] for data in sample_data]
+
+        for image, target, path in zip(
+            sample_images, sample_targets, sample_paths, strict=False
+        ):
+            image_annotations = target.copy()
+            image_annotations = utilities.format_geometry(image_annotations, scores=False)
+            image_annotations.root_dir = dataset.root_dir
+            image_annotations["image_path"] = path
+
+            # Plot transformed image
+            basename = Path(path).stem
+            image = (255 * image.cpu().numpy().transpose((1, 2, 0))).astype(np.uint8)
+            fig = visualize.plot_annotations(
+                image=image,
+                annotations=image_annotations,
+                savedir=out_dir,
+                basename=basename,
+                thickness=self.thickness,
+                show=False,
+            )
+            plt.close(fig)
+
+            self._log_to_all(
+                image=os.path.join(out_dir, basename + ".png"),
+                trainer=self.trainer,
+                tag=f"{split} dataset sample",
+            )
+
+    def _log_to_all(self, image: str, trainer, tag, metadata: dict | None = None):
+        """Log to all connected loggers.
+
+        Since Comet will pick up images logged to Tensorboard by default, we
+        add a check to log images preferentially to Tensorboard if both
+        are enabled.
+ """ + try: + img = np.array(Image.open(image).convert("RGB")) + + loggers = [lg for lg in trainer.loggers if hasattr(lg, "experiment")] + + tb = next((lg for lg in loggers if hasattr(lg.experiment, "add_image")), None) + if tb is not None: + tb.experiment.add_image( + tag=f"{tag}/{os.path.basename(image)}", + img_tensor=img, + global_step=trainer.global_step, + dataformats="HWC", + ) + return + + comet = next( + (lg for lg in loggers if hasattr(lg.experiment, "log_image")), + None, + ) + if comet is not None: + meta = { + "image_name": os.path.basename(image), + "context": tag, + "step": trainer.global_step, + } + + if metadata: + meta.update(metadata) + + comet.experiment.log_image( + img, + name=tag, + step=trainer.global_step, + metadata=meta, + ) + + except Exception as e: + warnings.warn(f"Tried to log {image} exception raised: {e}", stacklevel=2) + + +class images_callback(ImagesCallback): + def __init__(self, savedir, **kwargs): + warnings.warn( + "Please use ImagesCallback instead.", DeprecationWarning, stacklevel=2 + ) + super().__init__(save_dir=savedir, **kwargs) diff --git a/src/deepforest/conf/conditionaldetr.yaml b/src/deepforest/conf/conditionaldetr.yaml new file mode 100644 index 000000000..72242e089 --- /dev/null +++ b/src/deepforest/conf/conditionaldetr.yaml @@ -0,0 +1,23 @@ +# Conditional DETR Base Configuration +defaults: + - config + - _self_ + +# Default model is pretrained on MS-COCO +model: + name: "microsoft/conditional-detr-resnet-50" + revision: 'main' + +architecture: 'ConditionalDetr' + +train: + epochs: 40 + lr: 2e-4 + lr_backbone: 2e-5 + optimizer: adamw + weight_decay: 1e-4 + scheduler: + type: cosine + params: + T_max: 40 + eta_min: 1e-7 diff --git a/src/deepforest/conf/config.yaml b/src/deepforest/conf/config.yaml index 473828a6e..055e3544e 100644 --- a/src/deepforest/conf/config.yaml +++ b/src/deepforest/conf/config.yaml @@ -6,9 +6,18 @@ workers: 0 devices: auto accelerator: auto batch_size: 1 +limit_batches: 1.0 +persistent_workers: false + +# Precision settings +# matmul_precision: 'highest' (default, full precision), 'high' (TF32 on Ampere+), or 'medium' (bfloat16) +matmul_precision: highest +# training_precision: null (default '32-true'), '16-mixed', 'bf16-mixed', '32-true', etc. +training_precision: null # Model Architecture architecture: 'retinanet' +backbone: 'resnet50' num_classes: 1 nms_thresh: 0.05 score_thresh: 0.1 @@ -30,11 +39,18 @@ rgb_dir: path_to_rgb: train: + # Sanity check annotations on dataset load + check_annotations: False + log_root: logs csv_file: root_dir: # Optimizer initial learning rate lr: 0.001 + # For some models it's helpful to set a lower + # backbone learning rate. Default is the same. + lr_backbone: 0.001 + weight_decay: 0.0 # Data augmentations for training # Augmentations must be a list of augmentation names, or a list @@ -46,7 +62,7 @@ train: augmentations: - HorizontalFlip: {p: 0.5} scheduler: - type: + type: reduceLROnPlateau params: # Common parameters T_max: 10 @@ -72,12 +88,17 @@ train: fast_dev_run: False # preload images to GPU memory for fast training. This depends on GPU size and number of images. 
   preload_images: False
+  freeze_backbone: False
+  optimizer: sgd
+  auxiliary_loss: false
 
 validation:
   csv_file:
   root_dir:
   preload_images: False
   size:
+  batch_size: 1
+  workers: 0
 
   # For retinanet you may prefer val_classification, but the default val_loss
   # should work with all models
diff --git a/src/deepforest/conf/deformabledetr.yaml b/src/deepforest/conf/deformabledetr.yaml
new file mode 100644
index 000000000..2f77c8e0a
--- /dev/null
+++ b/src/deepforest/conf/deformabledetr.yaml
@@ -0,0 +1,23 @@
+# Deformable DETR Base Configuration
+defaults:
+  - config
+  - _self_
+
+# Default model is pretrained on MS-COCO
+model:
+  name: "SenseTime/deformable-detr"
+  revision: 'main'
+
+architecture: 'DeformableDetr'
+
+train:
+  epochs: 40
+  lr: 2e-4
+  lr_backbone: 2e-5
+  optimizer: adamw
+  weight_decay: 1e-4
+  scheduler:
+    type: cosine
+    params:
+      T_max: 40
+      eta_min: 1e-7
diff --git a/src/deepforest/conf/detr.yaml b/src/deepforest/conf/detr.yaml
new file mode 100644
index 000000000..95e27bae3
--- /dev/null
+++ b/src/deepforest/conf/detr.yaml
@@ -0,0 +1,23 @@
+# DETR Base Configuration
+defaults:
+  - config
+  - _self_
+
+# Default model is pretrained on MS-COCO
+model:
+  name: "facebook/detr-resnet-50"
+  revision: 'main'
+
+architecture: 'Detr'
+
+train:
+  epochs: 40
+  lr: 2e-4
+  lr_backbone: 2e-5
+  optimizer: adamw
+  weight_decay: 1e-4
+  scheduler:
+    type: cosine
+    params:
+      T_max: 40
+      eta_min: 1e-7
diff --git a/src/deepforest/conf/dinov3.yaml b/src/deepforest/conf/dinov3.yaml
new file mode 100644
index 000000000..353731896
--- /dev/null
+++ b/src/deepforest/conf/dinov3.yaml
@@ -0,0 +1,20 @@
+# DINOv3 Base Configuration
+defaults:
+  - config
+  - _self_
+
+model:
+  name: "facebook/dinov3-vitl16-pretrain-sat493m"
+  revision: 'main'
+
+backbone: 'dinov3'
+
+train:
+  freeze_backbone: True
+  epochs: 75
+  lr: 0.01
+  scheduler:
+    type: cosine
+    params:
+      T_max: 75
+      eta_min: 0.0001
diff --git a/src/deepforest/conf/schema.py b/src/deepforest/conf/schema.py
index 963a3c6f6..811f2c0ed 100644
--- a/src/deepforest/conf/schema.py
+++ b/src/deepforest/conf/schema.py
@@ -62,12 +62,19 @@ class TrainConfig:
     csv_file: str | None = MISSING
     root_dir: str | None = MISSING
+    log_root: str = "logs"
     lr: float = 0.001
+    lr_backbone: float = 0.001
     scheduler: SchedulerConfig = field(default_factory=SchedulerConfig)
     epochs: int = 1
     fast_dev_run: bool = False
     preload_images: bool = False
     augmentations: list[str] | None = field(default_factory=lambda: ["HorizontalFlip"])
+    check_annotations: bool = False
+    freeze_backbone: bool = False
+    optimizer: str = "sgd"
+    weight_decay: float = 0.0
+    auxiliary_loss: bool = False
 
 
 @dataclass
@@ -85,6 +92,8 @@ class ValidationConfig:
     size: int | None = None
     iou_threshold: float = 0.4
     val_accuracy_interval: int = 20
+    batch_size: int = 1
+    workers: int = 0
     lr_plateau_target: str = "val_loss"
     augmentations: list[str] | None = field(default_factory=lambda: [])
 
@@ -115,8 +124,13 @@ class Config:
     devices: int | str = "auto"
     accelerator: str = "auto"
     batch_size: int = 1
+    limit_batches: float | int = 1.0
+    persistent_workers: bool = True
+    matmul_precision: str = "highest"
+    training_precision: str | int | None = None
 
     architecture: str = "retinanet"
+    backbone: str = "resnet50"
     num_classes: int = 1
     label_dict: dict[str, int] = field(default_factory=lambda: {"Tree": 0})
diff --git a/src/deepforest/datasets/prediction.py b/src/deepforest/datasets/prediction.py
index 7a425c501..28d477bcf 100644
--- a/src/deepforest/datasets/prediction.py
+++ b/src/deepforest/datasets/prediction.py
@@ -239,7 +239,36 @@ def __init__(self, csv_file: str, root_dir: str, size: int = None):
     def prepare_items(self):
         self.annotations = pd.read_csv(self.csv_file)
         self.image_names = self.annotations.image_path.unique()
-        self.image_paths = [os.path.join(self.root_dir, x) for x in self.image_names]
+        # Use basename to handle absolute paths from other systems
+        candidate_paths = [
+            os.path.join(self.root_dir, os.path.basename(x)) for x in self.image_names
+        ]
+
+        # Validate that images exist
+        self.image_paths = []
+        missing_images = []
+        for path in candidate_paths:
+            if os.path.exists(path):
+                self.image_paths.append(path)
+            else:
+                missing_images.append(path)
+
+        # Warn about missing images
+        if missing_images:
+            print(f"Warning: {len(missing_images)} images not found:")
+            for path in missing_images[:5]:  # Show first 5
+                print(f"  - {path}")
+            if len(missing_images) > 5:
+                print(f"  ... and {len(missing_images) - 5} more")
+
+        # Ensure we have at least some valid images
+        assert (
+            len(self.image_paths) > 0
+        ), f"No valid images found! Checked {len(candidate_paths)} paths in {self.root_dir}"
+
+        print(
+            f"Found {len(self.image_paths)} valid images out of {len(candidate_paths)} in CSV"
+        )
 
     def __len__(self):
         return len(self.image_paths)
diff --git a/src/deepforest/datasets/training.py b/src/deepforest/datasets/training.py
index e8e1fbcef..f3723369b 100644
--- a/src/deepforest/datasets/training.py
+++ b/src/deepforest/datasets/training.py
@@ -9,6 +9,7 @@
 import torch
 from PIL import Image
 from torch.utils.data import Dataset
+from tqdm.auto import tqdm
 
 from deepforest.augmentations import get_transform
 
@@ -41,6 +42,8 @@ def __init__(
         augmentations=None,
         label_dict=None,
         preload_images=False,
+        relative_paths=True,
+        check_annotations=False,
     ):
         """
         Args:
@@ -78,10 +81,18 @@
             self.transform = get_transform(augmentations=augmentations)
         else:
             self.transform = transforms
-        self.image_names = self.annotations.image_path.unique()
+
+        if relative_paths:
+            self.image_names = self.annotations.image_path.unique()
+        else:
+            self.image_names = [
+                os.path.basename(path) for path in self.annotations.image_path.unique()
+            ]
+
         self.label_dict = label_dict
         self.preload_images = preload_images
-        self._validate_labels()
+        if check_annotations:
+            self._validate_annotations()
 
         # Pin data to memory if desired
         if self.preload_images:
@@ -90,20 +101,62 @@
             for idx, _ in enumerate(self.image_names):
                 self.image_dict[idx] = self.load_image(idx)
 
-    def _validate_labels(self):
-        """Validate that all labels in annotations exist in label_dict.
-
-        Raises:
-            ValueError: If any label in annotations is missing from label_dict
-        """
-        csv_labels = self.annotations["label"].unique()
-        missing_labels = [label for label in csv_labels if label not in self.label_dict]
+    def _validate_annotations(self):
+        errors = []
+        missing_labels = set()
+        img_sizes = {}  # rel_path -> (w,h)
+        has_geom = "geometry" in self.annotations.columns
+        labels = set(self.label_dict)
+
+        for row in tqdm(self.annotations.itertuples(index=True)):
+            rel_path = row.image_path
+            size = img_sizes.get(rel_path)
+            if size is None:
+                img_path = os.path.join(self.root_dir, rel_path)
+                try:
+                    with Image.open(img_path) as img:
+                        size = img.size
+                except Exception as e:
+                    errors.append(f"Failed to open image {img_path}: {e}")
+                    img_sizes[rel_path] = None
+                    continue
+                img_sizes[rel_path] = size
+            if size is None:
+                continue
+            width, height = size
+
+            if row.label not in labels:
+                missing_labels.add(row.label)
+
+            try:
+                if has_geom:
+                    xmin, ymin, xmax, ymax = shapely.wkt.loads(row.geometry).bounds
+                else:
+                    xmin, ymin, xmax, ymax = row.xmin, row.ymin, row.xmax, row.ymax
+            except Exception as e:
+                errors.append(f"Invalid box format at index {row.Index}: {e}")
+                continue
+
+            oob = []
+            if xmin < 0:
+                oob.append(f"xmin ({xmin}) < 0")
+            if xmax > width:
+                oob.append(f"xmax ({xmax}) > image width ({width})")
+            if ymin < 0:
+                oob.append(f"ymin ({ymin}) < 0")
+            if ymax > height:
+                oob.append(f"ymax ({ymax}) > image height ({height})")
+            if oob:
+                errors.append(
+                    f"Box ({xmin}, {ymin}, {xmax}, {ymax}) exceeds ({width}, {height}). Issues: {', '.join(oob)}."
+                )
+            if xmin == xmax or ymin == ymax:
+                errors.append(f"Zero area bbox ({xmin}, {ymin}, {xmax}, {ymax}).")
 
         if missing_labels:
-            raise ValueError(
-                f"Labels {missing_labels} are missing from label_dict. "
-                f"Please ensure all labels in the annotations exist as keys in label_dict."
-            )
+            errors.append(f"Labels {sorted(missing_labels)} are missing from label_dict")
+        if errors:
+            raise ValueError("\n".join(errors))
 
     def __len__(self):
         return len(self.image_names)
@@ -122,17 +175,18 @@ def load_image(self, idx):
         image = image.astype("float32")
         return image
 
-    def __getitem__(self, idx):
-        # Read image if not in memory
-        if self.preload_images:
-            image = self.image_dict[idx]
-        else:
-            image = self.load_image(idx)
+    def annotations_for_path(self, image_path, return_tensor=False):
+        """Construct target dictionary for a given image path, optionally
+        convert to tensor.
+
+        Args:
+            image_path (str): Path to image, expected to be in dataframe
+            return_tensor (bool): If true, convert fields from numpy to tensor
 
-        # select annotations
-        image_annotations = self.annotations[
-            self.annotations.image_path == self.image_names[idx]
-        ]
+        Returns:
+            target dictionary with boxes and labels entries
+        """
+        image_annotations = self.annotations[self.annotations.image_path == image_path]
 
         targets = {}
 
         if "geometry" in image_annotations.columns:
@@ -149,6 +203,21 @@
             lambda x: self.label_dict[x]
         ).values.astype(np.int64)
 
+        if return_tensor:
+            for k, v in targets.items():
+                targets[k] = torch.from_numpy(v)
+
+        return targets
+
+    def __getitem__(self, idx):
+        # Read image if not in memory
+        if self.preload_images:
+            image = self.image_dict[idx]
+        else:
+            image = self.load_image(idx)
+
+        targets = self.annotations_for_path(self.image_names[idx])
+
         # If image has no annotations, don't augment
         if np.sum(targets["boxes"]) == 0:
             boxes = torch.zeros((0, 4), dtype=torch.float32)
@@ -161,12 +230,18 @@
             return image, targets, self.image_names[idx]
 
         # Apply augmentations
-        augmented = self.transform(
-            image=image,
-            bboxes=targets["boxes"],
-            category_ids=targets["labels"].astype(np.int64),
-        )
-        image = augmented["image"]
+
+        try:
+            augmented = self.transform(
+                image=image,
+                bboxes=targets["boxes"],
+                category_ids=targets["labels"].astype(np.int64),
+            )
+            image = augmented["image"]
+        except Exception as e:
+            print(f"Failed to process image: {self.image_names[idx]}")
+            print(targets)
+            raise e
 
         # Convert boxes to tensor
         boxes = np.array(augmented["bboxes"])
diff --git a/src/deepforest/evaluate.py b/src/deepforest/evaluate.py
index f85a93e22..728dc92f6 100644
--- a/src/deepforest/evaluate.py
+++ b/src/deepforest/evaluate.py
@@ -1,15 +1,19 @@
 """Evaluation module."""
 
+import os
 import warnings
 
 import geopandas as gpd
 import numpy as np
 import pandas as pd
 import shapely
+from tqdm import tqdm
 
-from deepforest import IoU
+from deepforest import IoU, utilities
 from deepforest.utilities import determine_geometry_type
 
+warnings.simplefilter(action="ignore", category=FutureWarning)
+
 
 def evaluate_image_boxes(predictions, ground_df):
     """Compute intersection-over-union matching among prediction and ground
@@ -29,11 +33,13 @@
     # match
     result = IoU.compute_IoU(ground_df, predictions)
 
-    # add the label classes
-    result["predicted_label"] = result.prediction_id.apply(
-        lambda x: predictions.label.loc[x] if pd.notnull(x) else x
-    )
-    result["true_label"] = result.truth_id.apply(lambda x: ground_df.label.loc[x])
+    # add the label classes using dictionary lookups for performance
+    pred_label_dict = predictions.label.to_dict()
+    ground_label_dict = ground_df.label.to_dict()
+
+    # Use vectorized operations for label mapping
+    result["predicted_label"] = result.prediction_id.map(pred_label_dict)
+    result["true_label"] = result.truth_id.map(ground_label_dict)
 
     return result
 
@@ -105,33 +111,10 @@ def __evaluate_wrapper__(predictions, ground_df, iou_threshold, numeric_to_label
         return results
 
     # Convert pandas to geopandas if needed
-    if not isinstance(predictions, gpd.GeoDataFrame):
-        warnings.warn(
-            "Converting predictions to GeoDataFrame using geometry column", stacklevel=2
-        )
-        # Check if we have bounding box columns and need to create geometry
-        if "geometry" not in predictions.columns and all(
-            col in predictions.columns for col in ["xmin", "ymin", "xmax", "ymax"]
-        ):
-            # Create geometry from bounding box columns
-            predictions = predictions.copy()
-            predictions["geometry"] = predictions.apply(
-                lambda x: shapely.geometry.box(x.xmin, x.ymin, x.xmax, x.ymax), axis=1
-            )
-            predictions = gpd.GeoDataFrame(predictions, geometry="geometry")
+    predictions = utilities.to_gdf(predictions)
 
     # Also ensure ground_df is a GeoDataFrame
-    if not isinstance(ground_df, gpd.GeoDataFrame):
-        # Check if we have bounding box columns and need to create geometry
-        if "geometry" not in ground_df.columns and all(
-            col in ground_df.columns for col in ["xmin", "ymin", "xmax", "ymax"]
-        ):
-            # Create geometry from bounding box columns
-            ground_df = ground_df.copy()
-            ground_df["geometry"] = ground_df.apply(
-                lambda x: shapely.geometry.box(x.xmin, x.ymin, x.xmax, x.ymax), axis=1
-            )
-            ground_df = gpd.GeoDataFrame(ground_df, geometry="geometry")
+    ground_df = utilities.to_gdf(ground_df)
 
     prediction_geometry = determine_geometry_type(predictions)
     if prediction_geometry == "point":
@@ -140,20 +123,22 @@
         results = evaluate_boxes(
             predictions=predictions, ground_df=ground_df, iou_threshold=iou_threshold
        )
+
     else:
         raise NotImplementedError(f"Geometry type {prediction_geometry} not implemented")
 
-    # replace classes if not NUll
+    # replace classes if not null, using efficient map operations
     if results["results"] is not None:
-        results["results"]["predicted_label"] = results["results"][
-            "predicted_label"
-        ].apply(lambda x: numeric_to_label_dict[x] if not pd.isnull(x) else x)
-        results["results"]["true_label"] = results["results"]["true_label"].apply(
-            lambda x: numeric_to_label_dict[x]
+        # Use map with dictionary for faster lookups
+        results["results"]["predicted_label"] = results["results"]["predicted_label"].map(
+            lambda x: numeric_to_label_dict.get(x, x) if pd.notnull(x) else x
+        )
+        results["results"]["true_label"] = results["results"]["true_label"].map(
+            numeric_to_label_dict
        )
 
        results["predictions"] = predictions
-        results["predictions"]["label"] = results["predictions"]["label"].apply(
-            lambda x: numeric_to_label_dict[x]
+        results["predictions"]["label"] = results["predictions"]["label"].map(
+            numeric_to_label_dict
        )
 
     return results
@@ -187,73 +172,45 @@
     }
 
     # Convert pandas to geopandas if needed
-    if not isinstance(predictions, gpd.GeoDataFrame):
-        # Check if we have bounding box columns and need to create geometry
-        if "geometry" not in predictions.columns and all(
-            col in predictions.columns for col in ["xmin", "ymin", "xmax", "ymax"]
-        ):
-            # Create geometry from bounding box columns
-            predictions = predictions.copy()
-            predictions["geometry"] = predictions.apply(
-                lambda x: shapely.geometry.box(x.xmin, x.ymin, x.xmax, x.ymax), axis=1
-            )
-            predictions = gpd.GeoDataFrame(predictions, geometry="geometry")
-
-    if not isinstance(ground_df, gpd.GeoDataFrame):
-        # Check if we have bounding box columns and need to create geometry
-        if "geometry" not in ground_df.columns and all(
-            col in ground_df.columns for col in ["xmin", "ymin", "xmax", "ymax"]
-        ):
-            # Create geometry from bounding box columns
-            ground_df = ground_df.copy()
-            ground_df["geometry"] = ground_df.apply(
-                lambda x: shapely.geometry.box(x.xmin, x.ymin, x.xmax, x.ymax), axis=1
-            )
-            ground_df = gpd.GeoDataFrame(ground_df, geometry="geometry")
+    predictions = utilities.to_gdf(predictions)
+    ground_df = utilities.to_gdf(ground_df)
+
+    # Pre-group predictions by image for efficient access
+    predictions_by_image = {
+        name: group.reset_index(drop=True)
+        for name, group in predictions.groupby("image_path")
+    }
 
     # Run evaluation on all plots
     results = []
     box_recalls = []
     box_precisions = []
-    for image_path, group in ground_df.groupby("image_path"):
-        # clean indices
-        image_predictions = predictions[
-            predictions["image_path"] == image_path
-        ].reset_index(drop=True)
 
-        # If empty, add to list without computing IoU
-        if image_predictions.empty:
-            result = pd.DataFrame(
-                {
-                    "truth_id": group.index.values,
-                    "prediction_id": None,
-                    "IoU": 0,
-                    "predicted_label": None,
-                    "score": None,
-                    "match": False,
-                    "true_label": group.label,
-                }
-            )
-            # An empty prediction set has recall of 0, precision of NA.
-            box_recalls.append(0)
-            results.append(result)
-            continue
-        else:
-            group = group.reset_index(drop=True)
-            result = evaluate_image_boxes(predictions=image_predictions, ground_df=group)
+    groups = ground_df.groupby("image_path")
+    pbar = tqdm(total=len(groups))
 
-        result["image_path"] = image_path
-        result["match"] = result.IoU > iou_threshold
-        # Convert None to False for boolean consistency
-        result["match"] = result["match"].fillna(False)
-        true_positive = sum(result["match"])
-        recall = true_positive / result.shape[0]
-        precision = true_positive / image_predictions.shape[0]
+    for image_path, image_gt in groups:
+        # Get pre-grouped predictions for this image
+        image_predictions = predictions_by_image.get(image_path, pd.DataFrame())
+        if not isinstance(image_predictions, pd.DataFrame) or image_predictions.empty:
+            image_predictions = pd.DataFrame()
+
+        name = os.path.basename(image_path)
+        pbar.set_description(f"{name[:20]}, {len(image_predictions)} preds")
+
+        recall, precision, result = _box_recall_image(
+            image_predictions, image_gt, iou_threshold=iou_threshold
+        )
+
+        if precision is not None:
+            box_precisions.append(precision)
 
         box_recalls.append(recall)
-        box_precisions.append(precision)
         results.append(result)
+        pbar.update(1)
+
+    pbar.close()
 
     results = pd.concat(results)
     box_precision = np.mean(box_precisions)
     box_recall = np.mean(box_recalls)
@@ -272,6 +229,41 @@
     }
 
 
+def _box_recall_image(predictions, ground_truth, iou_threshold):
+    # clean indices
+    image_preds = predictions.reset_index(drop=True)
+    image_gt = ground_truth.reset_index(drop=True)
+
+    # If empty, add to list without computing IoU
+    if image_preds.empty:
+        result = pd.DataFrame(
+            {
+                "truth_id": image_gt.index.values,
+                "prediction_id": None,
+                "IoU": 0,
+                "predicted_label": None,
+                "score": None,
+                "match": False,
+                "true_label": image_gt.label,
+            }
+        )
+        # An empty prediction set has recall of 0, precision of NA.
+        recall = 0
+        precision = None
+    else:
+        result = evaluate_image_boxes(predictions=image_preds, ground_df=image_gt)
+
+        result["image_path"] = image_preds["image_path"].iloc[0]
+        result["match"] = result.IoU > iou_threshold
+        # Convert None to False for boolean consistency
+        result["match"] = result["match"].fillna(False)
+        true_positive = sum(result["match"])
+        recall = true_positive / result.shape[0]
+        precision = true_positive / image_preds.shape[0]
+
+    return recall, precision, result
+
+
 def _point_recall_image_(predictions, ground_df):
     """Compute intersection-over-union matching among prediction and ground
     truth boxes for one image.
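As context for the IoU.py and evaluate.py changes above, here is a minimal, self-contained sketch of the vectorized matching strategy they rely on, using toy geometries with shapely 2.x and scipy (not the DeepForest API):

```python
import numpy as np
import shapely
from scipy.optimize import linear_sum_assignment
from shapely import STRtree, box

# Toy data: two ground-truth boxes and two overlapping predictions.
truth = np.array([box(0, 0, 10, 10), box(20, 20, 30, 30)], dtype=object)
preds = np.array([box(1, 1, 11, 11), box(19, 19, 29, 29)], dtype=object)

# Query candidate pairs with a spatial index instead of all-pairs loops.
tree = STRtree(truth)
p_idx, t_idx = tree.query(preds, predicate="intersects")

# Fill dense (n_truth, n_pred) matrices only at intersecting pairs.
inter = np.zeros((truth.size, preds.size))
union = np.zeros_like(inter)
inter[t_idx, p_idx] = shapely.area(shapely.intersection(truth[t_idx], preds[p_idx]))
union[t_idx, p_idx] = shapely.area(shapely.union(truth[t_idx], preds[p_idx]))

# One-to-one matching that maximizes total intersection area.
t_match, p_match = linear_sum_assignment(inter, maximize=True)
print(inter[t_match, p_match] / union[t_match, p_match])  # per-match IoU
```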
diff --git a/src/deepforest/main.py b/src/deepforest/main.py index 3583b4d8b..445463dd4 100644 --- a/src/deepforest/main.py +++ b/src/deepforest/main.py @@ -1,7 +1,7 @@ # entry point for deepforest model import importlib +import logging import os -import tempfile import warnings import geopandas as gpd @@ -13,14 +13,36 @@ from omegaconf import DictConfig from PIL import Image from pytorch_lightning.callbacks import LearningRateMonitor -from torch import optim +from pytorch_lightning.utilities import rank_zero_only from torchmetrics.classification import BinaryAccuracy from torchmetrics.detection import IntersectionOverUnion, MeanAveragePrecision from deepforest import evaluate as evaluate_iou -from deepforest import predict, utilities, visualize +from deepforest import predict, utilities from deepforest.datasets import prediction, training +warnings.simplefilter(action="ignore", category=FutureWarning) + + +def setup_logging(level=logging.INFO): + fmt = "%(asctime)s | %(levelname)s | %(name)s | %(message)s" + datefmt = "%Y-%m-%d %H:%M:%S" + h = logging.StreamHandler() + h.setFormatter(logging.Formatter(fmt=fmt, datefmt=datefmt)) + lg = logging.getLogger("lightning.pytorch") + lg.handlers.clear() + lg.propagate = False + lg.addHandler(h) + lg.setLevel(level) + + +setup_logging() + + +@rank_zero_only +def log_info(msg): + logging.getLogger("lightning.pytorch").info(msg) + class deepforest(pl.LightningModule): """DeepForest model for tree crown detection in RGB images. @@ -90,7 +112,12 @@ def __init__( self.iou_metric = IntersectionOverUnion( class_metrics=True, iou_threshold=self.config.validation.iou_threshold ) - self.mAP_metric = MeanAveragePrecision() + + # Disable warning for decluttering + self.mAP_metric = MeanAveragePrecision( + backend="faster_coco_eval", + ) + self.mAP_metric.warn_on_many_detections = False # Empty frame accuracy self.empty_frame_accuracy = BinaryAccuracy() @@ -259,7 +286,7 @@ def create_trainer(self, logger=None, callbacks=None, **kwargs): if logger is not None: lr_monitor = LearningRateMonitor(logging_interval="epoch") callbacks.append(lr_monitor) - limit_val_batches = 1.0 + limit_val_batches = self.config.limit_batches num_sanity_val_steps = 2 else: # Disable validation, don't use trainer defaults @@ -273,6 +300,9 @@ def create_trainer(self, logger=None, callbacks=None, **kwargs): else: enable_checkpointing = False + if self.config.accelerator == "cuda" and not torch.cuda.is_available(): + self.config.accelerator = "auto" + trainer_args = { "logger": logger, "max_epochs": self.config.train.epochs, @@ -281,6 +311,7 @@ def create_trainer(self, logger=None, callbacks=None, **kwargs): "accelerator": self.config.accelerator, "fast_dev_run": self.config.train.fast_dev_run, "callbacks": callbacks, + "limit_train_batches": self.config.limit_batches, "limit_val_batches": limit_val_batches, "num_sanity_val_steps": num_sanity_val_steps, } @@ -297,94 +328,6 @@ def on_fit_start(self): "calling deepforest.create_trainer()'" ) - def on_train_start(self): - """Log sample images from training and validation datasets at training - start.""" - - if self.trainer.fast_dev_run: - return - - # Get training dataset - train_ds = self.train_dataloader().dataset - - # Sample up to 5 indices from training dataset - n_samples = min(5, len(train_ds)) - sample_indices = torch.randperm(len(train_ds))[:n_samples] - - # Create temporary directory for images - tmpdir = tempfile.mkdtemp() - - # Get images, targets and paths for sampled indices - sample_data = [train_ds[idx] for idx in 
sample_indices] - sample_images = [data[0] for data in sample_data] - sample_targets = [data[1] for data in sample_data] - sample_paths = [data[2] for data in sample_data] - - for image, target, path in zip( - sample_images, sample_targets, sample_paths, strict=False - ): - # Get annotations for this image - image_annotations = target.copy() - image_annotations = utilities.format_geometry(image_annotations, scores=False) - image_annotations.root_dir = self.config.train.root_dir - image_annotations["image_path"] = path - - # Plot and save - save_path = os.path.join(tmpdir, f"train_{os.path.basename(path)}") - visualize.plot_annotations( - image_annotations, savedir=tmpdir, image=image.numpy(), basename=path - ) - - # Log to available loggers - for logger in self.trainer.loggers: - if hasattr(logger.experiment, "log_image"): - logger.experiment.log_image( - save_path, - metadata={ - "name": path, - "context": "detection_train", - "step": self.global_step, - }, - ) - - # Also log validation images if available - if self.config.validation.csv_file is not None: - val_ds = self.val_dataloader().dataset - - n_samples = min(5, len(val_ds)) - sample_indices = torch.randperm(len(val_ds))[:n_samples] - - sample_data = [val_ds[idx] for idx in sample_indices] - sample_images = [data[0] for data in sample_data] - sample_targets = [data[1] for data in sample_data] - sample_paths = [data[2] for data in sample_data] - - for image, target, path in zip( - sample_images, sample_targets, sample_paths, strict=False - ): - image_annotations = target.copy() - image_annotations = utilities.format_geometry( - image_annotations, scores=False - ) - image_annotations.root_dir = self.config.validation.root_dir - image_annotations["image_path"] = path - - save_path = os.path.join(tmpdir, f"val_{os.path.basename(path)}") - visualize.plot_annotations( - image_annotations, savedir=tmpdir, image=image.numpy(), basename=path - ) - - for logger in self.trainer.loggers: - if hasattr(logger.experiment, "log_image"): - logger.experiment.log_image( - save_path, - metadata={ - "name": path, - "context": "detection_val", - "step": self.global_step, - }, - ) - def on_save_checkpoint(self, checkpoint): checkpoint["label_dict"] = self.label_dict checkpoint["numeric_to_label_dict"] = self.numeric_to_label_dict @@ -418,6 +361,7 @@ def load_dataset( augmentations=None, preload_images=False, batch_size=1, + workers=0, ): """Create a dataset for inference or training. 
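+
+        The workers argument sets the number of DataLoader worker processes
+        for this dataset's loader, replacing the previous global
+        config.workers lookup.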
        Csv file format is .csv file with the columns "image_path", "xmin","ymin","xmax","ymax" for the
@@ -454,6 +398,7 @@ def load_dataset(
             label_dict=self.label_dict,
             augmentations=augmentations,
             preload_images=preload_images,
+            check_annotations=self.config.train.check_annotations,
         )
         if len(ds) == 0:
             raise ValueError(
@@ -465,7 +410,8 @@ def load_dataset(
             batch_size=batch_size,
             shuffle=shuffle,
             collate_fn=ds.collate_fn,
-            num_workers=self.config.workers,
+            num_workers=workers,
+            persistent_workers=self.config.persistent_workers,
         )

         return data_loader
@@ -487,6 +433,7 @@ def train_dataloader(self):
             shuffle=True,
             transforms=self.transforms,
             batch_size=self.config.batch_size,
+            workers=self.config.workers,
         )

         return loader
@@ -511,7 +458,8 @@ def val_dataloader(self):
             augmentations=self.config.validation.augmentations,
             shuffle=False,
             preload_images=self.config.validation.preload_images,
-            batch_size=self.config.batch_size,
+            batch_size=self.config.validation.batch_size,
+            workers=self.config.validation.workers,
         )

         return loader
@@ -845,14 +793,25 @@ def validation_step(self, batch, batch_idx):
         self.mAP_metric.update(filtered_preds, filtered_targets)

         # Log the predictions if you want to use them for evaluation logs
+        self.last_preds = []
         for i, result in enumerate(preds):
             formatted_result = utilities.format_geometry(result)
             if formatted_result is not None:
                 formatted_result["image_path"] = image_names[i]
-                self.predictions.append(formatted_result)
+                self.last_preds.append(formatted_result)
+
+        # Force cleanup
+        del preds
+        del filtered_preds
+        del images
+        del targets
+        del filtered_targets

         return losses

+    def on_validation_batch_end(self, outputs, batch, batch_idx, dataloader_idx=0):
+        self.predictions.extend([p for p in self.last_preds if p is not None])
+
     def on_validation_epoch_start(self):
         self.predictions = []

@@ -929,8 +888,8 @@ def log_epoch_metrics(self):
                 self.log_dict(output)
             except Exception:
                 pass
-
         self.iou_metric.reset()
+        log_info("Logged IoU")

         output = self.mAP_metric.compute()
         # Remove classes from output dict
@@ -940,8 +899,10 @@ def log_epoch_metrics(self):
         except MisconfigurationException:
             pass
         self.mAP_metric.reset()
+        log_info("Logged mAP")

         # Log empty frame accuracy if it has been updated
+        log_info("Computing Empty Frame Accuracy")
         if self.empty_frame_accuracy._update_called:
             empty_accuracy = self.empty_frame_accuracy.compute()

@@ -952,29 +913,19 @@ def log_epoch_metrics(self):
             pass

     def on_validation_epoch_end(self):
-        """Compute metrics and predictions at the end of the validation
-        epoch."""
+        """Compute metrics at the end of the validation epoch."""
         if self.trainer.sanity_checking:  # optional skip
             return

-        if self.current_epoch % self.config.validation.val_accuracy_interval == 0:
-            if len(self.predictions) > 0:
-                self.predictions = pd.concat(self.predictions)
-            else:
-                self.predictions = pd.DataFrame()
-
-            results = self.evaluate(
-                self.config.validation.csv_file,
-                root_dir=self.config.validation.root_dir,
-                size=self.config.validation.size,
-                predictions=self.predictions,
-            )
+        # Log epoch metrics (calculated for every val loop)
+        log_info("Calculating torchmetrics")
+        self.log_epoch_metrics()
+        log_info(f"Logged epoch {self.current_epoch} metrics")

-        # Log epoch metrics
-        self.log_epoch_metrics()
-        self.__evaluation_logs__(results)
-
-        return results
+        if len(self.predictions) > 0:
+            return pd.concat(self.predictions)
+        else:
+            return pd.DataFrame()

     def predict_step(self, batch, batch_idx):
         """Predict a batch of images with the deepforest model. If batch is a
@@ -1047,9 +998,40 @@ def predict_batch(self, images, preprocess_fn=None):
         return results

     def configure_optimizers(self):
-        optimizer = optim.SGD(
-            self.model.parameters(), lr=self.config.train.lr, momentum=0.9
-        )
+        # Two parameter groups: head parameters use config.train.lr, while
+        # backbone parameters train at the (typically lower) lr_backbone.
+        param_dicts = [
+            {
+                "params": [
+                    p
+                    for n, p in self.model.named_parameters()
+                    if "backbone" not in n and p.requires_grad
+                ],
+                "name": "head",
+            },
+            {
+                "params": [
+                    p
+                    for n, p in self.model.named_parameters()
+                    if "backbone" in n and p.requires_grad
+                ],
+                "lr": self.config.train.lr_backbone,
+                "name": "backbone",
+            },
+        ]
+
+        optimizer_name = self.config.train.optimizer
+
+        if optimizer_name.lower() == "sgd":
+            optimizer = torch.optim.SGD(
+                param_dicts,
+                lr=self.config.train.lr,
+                weight_decay=self.config.train.weight_decay,
+            )
+        elif optimizer_name.lower() == "adamw":
+            optimizer = torch.optim.AdamW(
+                param_dicts,
+                lr=self.config.train.lr,
+                weight_decay=self.config.train.weight_decay,
+            )
+        else:
+            raise ValueError(
+                f"Unknown optimizer type in config, got: {optimizer_name}"
+            )

         scheduler_config = self.config.train.scheduler
         scheduler_type = scheduler_config.type
@@ -1059,7 +1041,19 @@ def lr_lambda(epoch):
             return eval(params.lr_lambda)

-        if scheduler_type == "cosine":
+        if scheduler_type == "reduceLROnPlateau":
+            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+                optimizer,
+                mode=params["mode"],
+                factor=params["factor"],
+                patience=params["patience"],
+                threshold=params["threshold"],
+                threshold_mode=params["threshold_mode"],
+                cooldown=params["cooldown"],
+                min_lr=params["min_lr"],
+                eps=params["eps"],
+            )
+        elif scheduler_type == "cosine":
             scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                 optimizer, T_max=params.T_max, eta_min=params.eta_min
             )
@@ -1086,18 +1080,9 @@ def lr_lambda(epoch):
             scheduler = torch.optim.lr_scheduler.ExponentialLR(
                 optimizer, gamma=params.gamma
             )
         else:
-            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
-                optimizer,
-                mode=params["mode"],
-                factor=params["factor"],
-                patience=params["patience"],
-                threshold=params["threshold"],
-                threshold_mode=params["threshold_mode"],
-                cooldown=params["cooldown"],
-                min_lr=params["min_lr"],
-                eps=params["eps"],
+            raise ValueError(
+                f"Unknown learning rate scheduler type in config, got: {scheduler_type}"
             )

         # Monitor learning rate if val data is used
@@ -1132,6 +1117,7 @@ def evaluate(
         Returns:
             dict: Results dictionary containing precision, recall and other metrics
         """
+        self.model.eval()
         ground_df = utilities.read_file(csv_file)
         ground_df["label"] = ground_df.label.apply(lambda x: self.label_dict[x])

@@ -1144,6 +1130,8 @@ def evaluate(
             predictions = self.predict_file(
                 csv_file, root_dir, size=size, batch_size=batch_size
             )
+        else:
+            predictions = utilities.read_file(predictions)

         if iou_threshold is None:
             iou_threshold = self.config.validation.iou_threshold
diff --git a/src/deepforest/models/ConditionalDetr.py b/src/deepforest/models/ConditionalDetr.py
new file mode 100644
index 000000000..9594a134e
--- /dev/null
+++ b/src/deepforest/models/ConditionalDetr.py
@@ -0,0 +1,150 @@
+import warnings
+from pathlib import Path
+
+import torch
+from torch import nn
+from transformers import (
+    ConditionalDetrForObjectDetection,
ConditionalDetrImageProcessor, + logging, +) + +from deepforest.model import BaseModel +from deepforest.models import detr_utils + +# Suppress huge amounts of unnecessary warnings from transformers. +logging.set_verbosity_error() + + +class ConditionalDetrWrapper(nn.Module): + """This class wraps a transformers ConditionalDetrForObjectDetection model + so that input pre- and post-processing happens transparently.""" + + def __init__(self, config, name, revision, use_nms=False, **hf_args): + """Initialize a ConditionalDetrForObjectDetection model. + + We assume that the provided name applies to both model and + processor. By default this function creates a model with MS-COCO + initialized weights, but can be overridden if needed. + """ + super().__init__() + self.config = config + self.use_nms = use_nms + + # This suppresses a bunch of messages which are specific to DETR, + # but do not impact model function. + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + + # If the user passed in a different number of classes to the model, + # then the model will be modified on load. So we ignore + # mismatched sizes here. + self.net = ConditionalDetrForObjectDetection.from_pretrained( + name, + revision=revision, + num_labels=self.config.num_classes, + ignore_mismatched_sizes=True, + auxiliary_loss=self.config.train.auxiliary_loss, + **hf_args, + ) + self.processor = ConditionalDetrImageProcessor.from_pretrained( + name, + do_resize=False, + do_rescale=False, + do_normalize=True, + revision=revision, + **hf_args, + ) + + # If user-provided label_dict doesn't match the model's id2label: + if self.net.config.label2id != self.config.label_dict: + warnings.warn( + "Your supplied label dict differs from the model." + "This is expected if you plan to fine-tune this model on your own data.", + stacklevel=2, + ) + self.net.config.label2id = self.config.label_dict + self.net.config.id2label = { + v: k for k, v in self.config.label_dict.items() + } + + # For consistency with other DeepForest components + self.label_dict = self.net.config.label2id + self.num_classes = self.net.config.num_labels + + + def forward(self, images, targets=None, prepare_targets=True): + """ConditionalDetrForObjectDetection forward pass. If targets are + provided the function returns a loss dictionary, otherwise it returns + processed predictions. For details, see the transformers documentation + for "post_process_object_detection". + + Returns: + predictions: list of dictionaries with "score", "boxes" and "labels", or + a loss dict for training. + """ + if targets and prepare_targets: + targets = detr_utils.prepare_targets(targets) + + encoded_inputs = self.processor.preprocess( + images=images, annotations=targets, return_tensors="pt", do_rescale=False + ) + + # Tensor "movement" is not automatic here, this + # could be refactored to the dataloader (collate_fn) + # later. 
+ for k, v in encoded_inputs.items(): + if isinstance(v, torch.Tensor): + encoded_inputs[k] = v.to(self.net.device) + + if isinstance(encoded_inputs.get("labels"), list): + [target.to(self.net.device) for target in encoded_inputs["labels"]] + + preds = self.net(**encoded_inputs) + + if targets is None or not self.training: + results = detr_utils.handle_padding_and_postprocess( + self.processor, preds, encoded_inputs, images, self.config, + num_queries=self.net.config.num_queries + ) + + if self.use_nms: + results = detr_utils.apply_nms(results, iou_thresh=self.config.nms_thresh) + + return results + else: + # Drop this as it's incorrect for ConditionalDETR + preds.loss_dict.pop('cardinality_error', None) + return preds.loss_dict + + +class Model(BaseModel): + def __init__(self, config, **kwargs): + """ + Args: + """ + super().__init__(config) + + def create_model( + self, + pretrained: str | Path | None = "microsoft/conditional-detr-resnet-50", + *, + revision: str | None = "main", + map_location: str | torch.device | None = None, + **hf_args, + ) -> ConditionalDetrWrapper: + """Create a Conditional DETR model from pretrained weights. + + The number of classes set via config and will override the + downloaded checkpoint. The default weights will load a model + trained on MS-COCO that should fine-tune well on other tasks. + """ + + # Take class mapping from config if the user plans to pretrain, + # otherwise it should be defined by the hub model. + if pretrained is None: + hf_args.setdefault("id2label", self.config.numeric_to_label_dict) + + return ConditionalDetrWrapper( + self.config, name=pretrained, revision=revision, **hf_args + ).to(map_location) diff --git a/src/deepforest/models/DeformableDetr.py b/src/deepforest/models/DeformableDetr.py index a98dbdc24..d482320f1 100644 --- a/src/deepforest/models/DeformableDetr.py +++ b/src/deepforest/models/DeformableDetr.py @@ -3,7 +3,6 @@ import torch from torch import nn -from torchvision.ops import nms from transformers import ( DeformableDetrForObjectDetection, DeformableDetrImageProcessor, @@ -11,6 +10,7 @@ ) from deepforest.model import BaseModel +from deepforest.models import detr_utils # Suppress huge amounts of unnecessary warnings from transformers. 
logging.set_verbosity_error() @@ -44,10 +44,16 @@ def __init__(self, config, name, revision, use_nms=False, **hf_args): revision=revision, num_labels=self.config.num_classes, ignore_mismatched_sizes=True, + auxiliary_loss=self.config.train.auxiliary_loss, **hf_args, ) self.processor = DeformableDetrImageProcessor.from_pretrained( - name, revision=revision, **hf_args + name, + do_resize=False, + do_rescale=False, + do_normalize=True, + revision=revision, + **hf_args, ) # If user-provided label_dict doesn't match the model's id2label: @@ -66,76 +72,6 @@ def __init__(self, config, name, revision, use_nms=False, **hf_args): self.label_dict = self.net.config.label2id self.num_classes = self.net.config.num_labels - def _prepare_targets(self, targets): - """This is an internal function which translates BoxDataset targets - into MS-COCO format, for use with transformers-like models.""" - if not isinstance(targets, list): - targets = [targets] - - coco_targets = [] - - for target in targets: - annotations_for_target = [] - for i, (label, box) in enumerate( - zip(target["labels"], target["boxes"], strict=False) - ): - if isinstance(box, torch.Tensor): - box = box.tolist() - - if isinstance(label, torch.Tensor): - label = label.item() - - annotations_for_target.append( - { - "id": i, - "image_id": i, - "category_id": label, - "bbox": box, - "area": (box[3] - box[1]) * (box[2] - box[0]), - "iscrowd": 0, - } - ) - - coco_targets.append({"image_id": 0, "annotations": annotations_for_target}) - - return coco_targets - - def _apply_nms(self, predictions, iou_thresh): - """Apply class-wise NMS to a list of predictions.""" - filtered = [] - for pred in predictions: - boxes = pred["boxes"] - scores = pred["scores"] - labels = pred["labels"] - - keep = [] - for cls in labels.unique(): - cls_mask = labels == cls - cls_boxes = boxes[cls_mask] - cls_scores = scores[cls_mask] - cls_keep = nms(cls_boxes, cls_scores, iou_thresh) - cls_indices = torch.nonzero(cls_mask).squeeze(1)[cls_keep] - keep.append(cls_indices) - - if keep: - keep = torch.cat(keep) - filtered.append( - { - "boxes": boxes[keep], - "scores": scores[keep], - "labels": labels[keep], - } - ) - else: - filtered.append( - { - "boxes": boxes, - "scores": scores, - "labels": labels, - } - ) - - return filtered def forward(self, images, targets=None, prepare_targets=True): """DeformableDetrForObjectDetection forward pass. If targets are @@ -148,7 +84,7 @@ def forward(self, images, targets=None, prepare_targets=True): a loss dict for training. 
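+
+        Note: for mixed-size batches the processor pads images to a common
+        canvas, so boxes are post-processed against the padded size and then
+        clipped back to each image's original extent.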
""" if targets and prepare_targets: - targets = self._prepare_targets(targets) + targets = detr_utils.prepare_targets(targets) encoded_inputs = self.processor.preprocess( images=images, annotations=targets, return_tensors="pt", do_rescale=False @@ -166,15 +102,29 @@ def forward(self, images, targets=None, prepare_targets=True): preds = self.net(**encoded_inputs) + # Prediction + Validation if targets is None or not self.training: + # Handle padding for mixed-size batches + original_sizes = [i.shape[-2:] for i in images] if isinstance(images, list) else [images.shape[-2:]] + batch_size = encoded_inputs['pixel_values'].shape[0] + encoded_h, encoded_w = encoded_inputs['pixel_values'].shape[-2:] + target_sizes_padded = [(encoded_h, encoded_w)] * batch_size + results = self.processor.post_process_object_detection( preds, threshold=self.config.score_thresh, - target_sizes=[i.shape[-2:] for i in images] - if isinstance(images, list) - else [images.shape[-2:]], + target_sizes=target_sizes_padded, + top_k=self.net.config.num_queries, ) + # Clip boxes in padding area + if isinstance(images, list): + for i, (result, orig_size) in enumerate(zip(results, original_sizes)): + orig_h, orig_w = orig_size + if result['boxes'].shape[0] > 0: + result['boxes'][:, [0, 2]] = torch.clamp(result['boxes'][:, [0, 2]], min=0, max=orig_w) + result['boxes'][:, [1, 3]] = torch.clamp(result['boxes'][:, [1, 3]], min=0, max=orig_h) + # DETR is specifically designed to be NMS-free, however we've seen cases # where it still predicts duplicate boxes if self.use_nms: @@ -182,6 +132,8 @@ def forward(self, images, targets=None, prepare_targets=True): return results else: + # Drop cardinality error as it's incorrect for DeformableDETR + preds.loss_dict.pop('cardinality_error', None) return preds.loss_dict diff --git a/src/deepforest/models/Detr.py b/src/deepforest/models/Detr.py new file mode 100644 index 000000000..2010f25a5 --- /dev/null +++ b/src/deepforest/models/Detr.py @@ -0,0 +1,148 @@ +import warnings +from pathlib import Path + +import torch +from torch import nn +from transformers import ( + DetrForObjectDetection, + DetrImageProcessor, + logging, +) + +from deepforest.model import BaseModel +from deepforest.models import detr_utils + +# Suppress huge amounts of unnecessary warnings from transformers. +logging.set_verbosity_error() + + +class DetrWrapper(nn.Module): + """This class wraps a transformers DetrForObjectDetection model so that + input pre- and post-processing happens transparently.""" + + def __init__(self, config, name, revision, use_nms=False, **hf_args): + """Initialize a DetrForObjectDetection model. + + We assume that the provided name applies to both model and + processor. By default this function creates a model with MS-COCO + initialized weights, but can be overridden if needed. + """ + super().__init__() + self.config = config + self.use_nms = use_nms + + # This suppresses a bunch of messages which are specific to DETR, + # but do not impact model function. + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + + # If the user passed in a different number of classes to the model, + # then the model will be modified on load. So we ignore + # mismatched sizes here. 
+ self.net = DetrForObjectDetection.from_pretrained( + name, + revision=revision, + num_labels=self.config.num_classes, + num_queries=300, + ignore_mismatched_sizes=True, + auxiliary_loss=self.config.train.auxiliary_loss, + **hf_args, + ) + self.processor = DetrImageProcessor.from_pretrained( + name, + do_resize=False, + do_rescale=False, + do_normalize=True, + revision=revision, + **hf_args, + ) + + # If user-provided label_dict doesn't match the model's id2label: + if self.net.config.label2id != self.config.label_dict: + warnings.warn( + "Your supplied label dict differs from the model." + "This is expected if you plan to fine-tune this model on your own data.", + stacklevel=2, + ) + self.net.config.label2id = self.config.label_dict + self.net.config.id2label = { + v: k for k, v in self.config.label_dict.items() + } + + # For consistency with other DeepForest components + self.label_dict = self.net.config.label2id + self.num_classes = self.net.config.num_labels + + + def forward(self, images, targets=None, prepare_targets=True): + """DetrForObjectDetection forward pass. If targets are provided the + function returns a loss dictionary, otherwise it returns processed + predictions. For details, see the transformers documentation for + "post_process_object_detection". + + Returns: + predictions: list of dictionaries with "score", "boxes" and "labels", or + a loss dict for training. + """ + if targets and prepare_targets: + targets = detr_utils.prepare_targets(targets) + + encoded_inputs = self.processor.preprocess( + images=images, annotations=targets, return_tensors="pt", do_rescale=False + ) + + # Tensor "movement" is not automatic here, this + # could be refactored to the dataloader (collate_fn) + # later. + for k, v in encoded_inputs.items(): + if isinstance(v, torch.Tensor): + encoded_inputs[k] = v.to(self.net.device) + + if isinstance(encoded_inputs.get("labels"), list): + [target.to(self.net.device) for target in encoded_inputs["labels"]] + + preds = self.net(**encoded_inputs) + + if targets is None or not self.training: + results = detr_utils.handle_padding_and_postprocess( + self.processor, preds, encoded_inputs, images, self.config + ) + + if self.use_nms: + results = detr_utils.apply_nms(results, iou_thresh=self.config.nms_thresh) + + return results + else: + return preds.loss_dict + + +class Model(BaseModel): + def __init__(self, config, **kwargs): + """ + Args: + """ + super().__init__(config) + + def create_model( + self, + pretrained: str | Path | None = "facebook/detr-resnet-50", + *, + revision: str | None = "main", + map_location: str | torch.device | None = None, + **hf_args, + ) -> DetrWrapper: + """Create a DETR model from pretrained weights. + + The number of classes set via config and will override the + downloaded checkpoint. The default weights will load a model + trained on MS-COCO that should fine-tune well on other tasks. + """ + + # Take class mapping from config if the user plans to pretrain, + # otherwise it should be defined by the hub model. 
+ if pretrained is None: + hf_args.setdefault("id2label", self.config.numeric_to_label_dict) + + return DetrWrapper(self.config, name=pretrained, revision=revision, **hf_args).to( + map_location + ) diff --git a/src/deepforest/models/detr_utils.py b/src/deepforest/models/detr_utils.py new file mode 100644 index 000000000..4bcb09e25 --- /dev/null +++ b/src/deepforest/models/detr_utils.py @@ -0,0 +1,139 @@ +"""Shared utility functions for DETR-based models.""" + +import torch +from torchvision.ops import nms + + +def prepare_targets(targets): + """Translate BoxDataset targets into MS-COCO format for transformers models. + + Args: + targets: List of target dictionaries with 'labels' and 'boxes' keys + + Returns: + List of COCO-formatted target dictionaries + """ + if not isinstance(targets, list): + targets = [targets] + + coco_targets = [] + + for target in targets: + annotations_for_target = [] + for i, (label, box) in enumerate( + zip(target["labels"], target["boxes"], strict=False) + ): + if isinstance(box, torch.Tensor): + box = box.tolist() + + if isinstance(label, torch.Tensor): + label = label.item() + + # Convert from [xmin, ymin, xmax, ymax] to COCO format [x, y, width, height] + xmin, ymin, xmax, ymax = box + coco_bbox = [xmin, ymin, xmax - xmin, ymax - ymin] + area = (xmax - xmin) * (ymax - ymin) + + annotations_for_target.append( + { + "id": i, + "image_id": i, + "category_id": label, + "bbox": coco_bbox, + "area": area, + "iscrowd": 0, + } + ) + + coco_targets.append({"image_id": 0, "annotations": annotations_for_target}) + + return coco_targets + + +def apply_nms(predictions, iou_thresh): + """Apply class-wise NMS to a list of predictions. + + Args: + predictions: List of prediction dictionaries with 'boxes', 'scores', 'labels' + iou_thresh: IoU threshold for NMS + + Returns: + List of filtered prediction dictionaries + """ + filtered = [] + for pred in predictions: + boxes = pred["boxes"] + scores = pred["scores"] + labels = pred["labels"] + + keep = [] + for cls in labels.unique(): + cls_mask = labels == cls + cls_boxes = boxes[cls_mask] + cls_scores = scores[cls_mask] + cls_keep = nms(cls_boxes, cls_scores, iou_thresh) + cls_indices = torch.nonzero(cls_mask).squeeze(1)[cls_keep] + keep.append(cls_indices) + + if keep: + keep = torch.cat(keep) + filtered.append( + { + "boxes": boxes[keep], + "scores": scores[keep], + "labels": labels[keep], + } + ) + else: + filtered.append( + { + "boxes": boxes, + "scores": scores, + "labels": labels, + } + ) + + return filtered + + +def handle_padding_and_postprocess(processor, preds, encoded_inputs, images, config, num_queries=None): + """Handle padded batches and post-process predictions with proper coordinate scaling. + + When images of different sizes are batched, the processor pads them to a uniform size. + Predictions are in normalized [0, 1] coordinates relative to the padded image space, + so they must be scaled by padded dimensions and then clipped to original bounds. 
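+
+    (Passing each image's original size straight to
+    post_process_object_detection would rescale boxes to the wrong canvas,
+    because the model's normalized coordinates refer to the padded grid.)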
+ + Args: + processor: HuggingFace image processor with post_process_object_detection method + preds: Raw model predictions + encoded_inputs: Dictionary containing 'pixel_values' tensor + images: List of original image tensors + config: Model configuration with score_thresh + num_queries: Optional top_k parameter for DeformableDetr/ConditionalDetr + + Returns: + List of post-processed prediction dictionaries + """ + original_sizes = [i.shape[-2:] for i in images] if isinstance(images, list) else [images.shape[-2:]] + batch_size = encoded_inputs['pixel_values'].shape[0] + encoded_h, encoded_w = encoded_inputs['pixel_values'].shape[-2:] + target_sizes_padded = [(encoded_h, encoded_w)] * batch_size + + kwargs = { + "threshold": config.score_thresh, + "target_sizes": target_sizes_padded, + } + + if num_queries is not None: + kwargs["top_k"] = num_queries + + results = processor.post_process_object_detection(preds, **kwargs) + + if isinstance(images, list): + for i, (result, orig_size) in enumerate(zip(results, original_sizes)): + orig_h, orig_w = orig_size + if result['boxes'].shape[0] > 0: + result['boxes'][:, [0, 2]] = torch.clamp(result['boxes'][:, [0, 2]], min=0, max=orig_w) + result['boxes'][:, [1, 3]] = torch.clamp(result['boxes'][:, [1, 3]], min=0, max=orig_h) + + return results diff --git a/src/deepforest/models/dinov3.py b/src/deepforest/models/dinov3.py new file mode 100644 index 000000000..da4419a94 --- /dev/null +++ b/src/deepforest/models/dinov3.py @@ -0,0 +1,165 @@ +from collections import OrderedDict + +import torch +from torch import nn +from torch.nn import functional as F +from transformers import AutoImageProcessor, AutoModel + + +class Dinov3Model(nn.Module): + def __init__( + self, + repo_id="facebook/dinov3-vitl16-pretrain-sat493m", + frozen=True, + use_conv_pyramid=True, + fpn_out_channels=256, + ): + super().__init__() + + self.model = AutoModel.from_pretrained(repo_id) + self.processor = AutoImageProcessor.from_pretrained(repo_id) + self.image_mean = torch.Tensor(self.processor.image_mean) + self.image_std = torch.Tensor(self.processor.image_std) + self.frozen = frozen + self.use_conv_pyramid = use_conv_pyramid + self.fpn_out_channels = fpn_out_channels + + if self.frozen: + # Freeze DINO model parameters + for param in self.model.parameters(): + param.requires_grad = False + + # Use final layer + self.feature_layer = -1 + + # Infer hidden size from model config + self.hidden_size = self.model.config.hidden_size + + if self.use_conv_pyramid: + # ViTDet-style simple feature pyramid with conv/deconv layers + # Following Detectron2 scale_factors=(4.0, 2.0, 1.0, 0.5) + # All operations applied in parallel to base features + self.fpn_4x = nn.Conv2d( + self.hidden_size, + self.fpn_out_channels, + kernel_size=3, + stride=4, + padding=1, + ) # 1/64 scale (4x downsample) + self.fpn_2x = nn.Conv2d( + self.hidden_size, + self.fpn_out_channels, + kernel_size=3, + stride=2, + padding=1, + ) # 1/32 scale (2x downsample) + self.fpn_1x = nn.Conv2d( + self.hidden_size, + self.fpn_out_channels, + kernel_size=3, + stride=1, + padding=1, + ) # 1/16 scale (base) + self.fpn_0_5x = nn.ConvTranspose2d( + self.hidden_size, + self.fpn_out_channels, + kernel_size=4, + stride=2, + padding=1, + ) # 1/8 scale (2x upsample) + + # Initialize conv layers + for module in [self.fpn_4x, self.fpn_2x, self.fpn_1x, self.fpn_0_5x]: + nn.init.kaiming_normal_( + module.weight, mode="fan_out", nonlinearity="relu" + ) + if module.bias is not None: + nn.init.constant_(module.bias, 0) + + self.out_channels = 
self.fpn_out_channels + else: + # Multi-scale pooling to create pyramid, + # uses average pooling in the forward pass. + self.scales = [1, 2, 4, 8] + self.out_channels = self.hidden_size + + def _reshape_to_spatial(self, x, h, w): + """Reshape token sequence back to spatial format.""" + batch_size = x.shape[0] + + # Skip CLS token (first) and register tokens, keep only patch tokens + num_register_tokens = self.model.config.num_register_tokens + patch_tokens = x[:, 1 + num_register_tokens :, :] + + patch_tokens = patch_tokens.contiguous().view(batch_size, h, w, self.hidden_size) + patch_tokens = patch_tokens.permute(0, 3, 1, 2) # BHWC -> BCHW + return patch_tokens + + def forward(self, x: torch.Tensor) -> OrderedDict: + # Note that 'x' is normalized by RetinaNet, so be careful. + batch_size, _, height, width = x.shape + + # Calculate patch grid size (assuming patch size 16) + patch_size = self.model.config.patch_size + h_patches = height // patch_size + w_patches = width // patch_size + + # Here, we expect x to be normalized. + encoded_inputs = {"pixel_values": x} + + # This is the actual "forward pass", we extract the hidden states. + hidden_states = self.model( + **encoded_inputs, output_hidden_states=True + ).hidden_states + + # Get features from final layer + layer_features = hidden_states[self.feature_layer] + + # Reshape to spatial format + base_features = self._reshape_to_spatial(layer_features, h_patches, w_patches) + + if self.use_conv_pyramid: + # Create multi-scale features using ViTDet-style simple feature pyramid + # Apply all conv/deconv operations in parallel to the base features + feat_4x = self.fpn_4x(base_features) # 4x downsample for 1/64 scale + feat_2x = self.fpn_2x(base_features) # 2x downsample for 1/32 scale + feat_1x = self.fpn_1x(base_features) # Base resolution 1/16 scale + feat_0_5x = self.fpn_0_5x(base_features) # 2x upsample for 1/8 scale + + features = { + "feat_0": feat_0_5x, # 1/8 scale (highest resolution - for small objects) + "feat_1": feat_1x, # 1/16 scale (medium-high resolution) + "feat_2": feat_2x, # 1/32 scale (medium-low resolution) + "feat_3": feat_4x, # 1/64 scale (lowest resolution - for large objects) + } + else: + # Create multi-scale features using pooling (original approach) + features = {} + for i, scale in enumerate(self.scales): + if scale == 1: + # Original resolution + features[f"feat_{i}"] = base_features + else: + # Downsample using average pooling + pooled_features = F.avg_pool2d( + base_features, kernel_size=scale, stride=scale + ) + features[f"feat_{i}"] = pooled_features + + # Return as OrderedDict to match torchvision backbone interface + + return OrderedDict(features) + + def normalize(self, X: torch.Tensor) -> torch.Tensor: + """Normalize input tensor X [B, C, H, W] using per-channel mean and + std. 
+
+        Args:
+            X: input tensor [B, C, H, W]
+        Returns:
+            Normalized tensor [B, C, H, W]
+        """
+        # reshape mean and std to [C, 1, 1] so they broadcast across H, W
+        mean = self.image_mean.view(-1, 1, 1)
+        std = self.image_std.view(-1, 1, 1)
+        return (X - mean) / std
diff --git a/src/deepforest/models/retinanet.py b/src/deepforest/models/retinanet.py
index a9e0ebe19..c2172913d 100644
--- a/src/deepforest/models/retinanet.py
+++ b/src/deepforest/models/retinanet.py
@@ -1,12 +1,19 @@
+import os
 import warnings
 from pathlib import Path

 import torch
 import torchvision
 from huggingface_hub import PyTorchModelHubMixin
-from torchvision.models.detection.retinanet import AnchorGenerator, RetinaNet
+from torchvision.models.detection.retinanet import (
+    AnchorGenerator,
+    ResNet50_Weights,
+    RetinaNet,
+    RetinaNet_ResNet50_FPN_Weights,
+)

 from deepforest.model import BaseModel
+from deepforest.models.dinov3 import Dinov3Model


 class RetinaNetHub(RetinaNet, PyTorchModelHubMixin):
@@ -14,22 +21,65 @@ class RetinaNetHub(RetinaNet, PyTorchModelHubMixin):

     def __init__(
         self,
+        weights: str | None = None,
         backbone_weights: str | None = None,
+        backbone="resnet50",
         num_classes: int = 1,
         nms_thresh: float = 0.05,
         score_thresh: float = 0.5,
         label_dict: dict = None,
+        use_conv_pyramid: bool = True,
+        fpn_out_channels: int = 256,
+        freeze_backbone: bool = False,
         **kwargs,
     ):
-        backbone = torchvision.models.detection.retinanet_resnet50_fpn(
-            weights=backbone_weights
-        ).backbone
+        if backbone == "dinov3":
+            # Only pass repo_id if weights is not None to avoid overriding the default
+            dinov3_kwargs = {
+                "use_conv_pyramid": use_conv_pyramid,
+                "fpn_out_channels": fpn_out_channels,
+                "frozen": freeze_backbone,
+            }
+            if weights is not None:
+                dinov3_kwargs["repo_id"] = weights
+
+            backbone = Dinov3Model(**dinov3_kwargs)
+            anchor_sizes = tuple(
+                (x, int(x * 2 ** (1.0 / 3)), int(x * 2 ** (2.0 / 3)))
+                for x in [32, 64, 128, 256]
+            )
+            aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
+            anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
+
+            # Can vary with model, e.g. sat does not use ImageNet
+            image_mean = backbone.image_mean
+            image_std = backbone.image_std
+        elif backbone == "resnet50":
+            backbone = torchvision.models.detection.retinanet_resnet50_fpn(
+                weights=weights, weights_backbone=backbone_weights
+            ).backbone
+            anchor_generator = None  # Use default
+
+            if freeze_backbone:
+                for param in backbone.parameters():
+                    param.requires_grad = False
+
+            # Explicitly use ImageNet
+            image_mean = torch.tensor([0.485, 0.456, 0.406])
+            image_std = torch.tensor([0.229, 0.224, 0.225])
+        else:
+            raise NotImplementedError(
+                f"Backbone {backbone} is unknown, or not supported. Use 'dinov3' or 'resnet50'"
+            )

         super().__init__(
             backbone=backbone,
             num_classes=num_classes,
+            anchor_generator=anchor_generator,
             score_thresh=score_thresh,
             nms_thresh=nms_thresh,
+            image_mean=image_mean,
+            image_std=image_std,
             **kwargs,
         )
@@ -41,8 +91,14 @@ def __init__(

         self.num_classes = num_classes
         self.label_dict = label_dict
+        self.use_conv_pyramid = use_conv_pyramid
+        self.fpn_out_channels = fpn_out_channels
         self.kwargs = kwargs

+        # Store normalization parameters for denormalization
+        self.image_mean = image_mean
+        self.image_std = image_std
+
         self.update_config()

     @classmethod
@@ -54,8 +110,40 @@ def from_pretrained(

         If the target num_classes differs from the model's num_classes,
         then the model heads are reinitialized to compensate.
+
+        Also handles PyTorch Lightning .ckpt files directly.
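+
+        Example (paths and repo id are illustrative):
+
+            # From the HuggingFace Hub
+            model = RetinaNetHub.from_pretrained("weecology/deepforest-tree")
+
+            # From a local Lightning checkpoint; backbone must match the checkpoint
+            model = RetinaNetHub.from_pretrained("logs/last.ckpt", backbone="resnet50")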
""" - model = super().from_pretrained(pretrained_model_name_or_path, **kwargs) + + # Handle PyTorch Lightning checkpoint files (.ckpt) + # TODO: Use from_pretrained and generate HF compatible config from ckpt? + if ( + isinstance(pretrained_model_name_or_path, (str, Path)) + and str(pretrained_model_name_or_path).endswith(".ckpt") + and os.path.exists(pretrained_model_name_or_path) + ): + # Always load to CPU first to avoid device mapping issues later + checkpoint = torch.load( + pretrained_model_name_or_path, map_location="cpu", weights_only=False + ) + + config = checkpoint["hyper_parameters"]["config"] + + if config["architecture"] != "retinanet": + raise ValueError( + f"Checkpoint architecture is {config['architecture']}, should be retinanet." + ) + elif config["backbone"] != kwargs.get("backbone"): + raise ValueError( + f"You are trying to instantiate a RetinaNet with a {kwargs.get('backbone')}, backbone, but your checkpoint contains {config['backbone']}. Check your config is correct." + ) + + # Instantiate model with config as provided and load in the state + model = cls(**kwargs) + model.load_state_dict(checkpoint["state_dict"]) + + else: + # Otherwise use HuggingFace Hub path, which requires weights + config + model = super().from_pretrained(pretrained_model_name_or_path, **kwargs) # Override class info if specified if num_classes is not None and label_dict is not None: @@ -166,7 +254,7 @@ def create_anchor_generator( def create_model( self, - pretrained: str | Path | None = None, + pretrained: str | Path = "resnet50-mscoco", *, revision: str | None = None, map_location: str | torch.device | None = None, @@ -174,7 +262,7 @@ def create_model( ) -> RetinaNetHub: """Create a retinanet model Args: - pretrained (str | Path | None): If supplied, specifies repository ID for weight download, otherwise use default COCO weights + pretrained (str | Path): Specifies repository ID for weight download or predefined model type. Defaults to "resnet50-mscoco" revision (str | None): Repository revision map_location (str | torch.device | None): Device to load weights onto **hf_args: Any other arguments to load_pretrained @@ -182,22 +270,61 @@ def create_model( model: a pytorch nn module """ - if pretrained is None: + if pretrained == "resnet50-imagenet": + if revision is not None: + warnings.warn( + "Ignoring revision and using an un-initialized RetinaNet head, ImageNet backbone.", + stacklevel=2, + ) + model = RetinaNetHub( + weights=None, + backbone_weights=ResNet50_Weights.IMAGENET1K_V2, + num_classes=self.config.num_classes, + nms_thresh=self.config.nms_thresh, + score_thresh=self.config.score_thresh, + label_dict=self.config.label_dict, + freeze_backbone=self.config.train.freeze_backbone, + ) + elif pretrained == "resnet50-mscoco": + if revision is not None: + warnings.warn( + "Ignoring revision and fine-tuning from ResNet50 MS-COCO checkpoint.", + stacklevel=2, + ) + model = RetinaNetHub( + weights=RetinaNet_ResNet50_FPN_Weights.COCO_V1, + num_classes=self.config.num_classes, + nms_thresh=self.config.nms_thresh, + score_thresh=self.config.score_thresh, + label_dict=self.config.label_dict, + freeze_backbone=self.config.train.freeze_backbone, + ) + elif pretrained is None: + warnings.warn( + "Using a randomly initialized model. 
You probably don't want to do this unless you have a very large dataset to pretrain on..", + stacklevel=2, + ) model = RetinaNetHub( - backbone_weights="COCO_V1", + weights=None, + backbone_weights=None, + backbone=self.config.backbone, num_classes=self.config.num_classes, nms_thresh=self.config.nms_thresh, score_thresh=self.config.score_thresh, label_dict=self.config.label_dict, + freeze_backbone=self.config.train.freeze_backbone, ) + # Deepforest/tree, fine-tune from user, etc. else: model = RetinaNetHub.from_pretrained( pretrained, revision=revision, + backbone=self.config.backbone, num_classes=self.config.num_classes, label_dict=self.config.label_dict, nms_thresh=self.config.nms_thresh, score_thresh=self.config.score_thresh, + freeze_backbone=self.config.train.freeze_backbone, **hf_args, ) diff --git a/src/deepforest/scripts/cli.py b/src/deepforest/scripts/cli.py index 1a136e3ff..7cd91a41c 100644 --- a/src/deepforest/scripts/cli.py +++ b/src/deepforest/scripts/cli.py @@ -1,73 +1,537 @@ import argparse +import datetime +import glob import os +import sys +import traceback +import warnings +from pathlib import Path +import torch from hydra import compose, initialize, initialize_config_dir from omegaconf import DictConfig, OmegaConf +from pytorch_lightning.callbacks import DeviceStatsMonitor, ModelCheckpoint +from pytorch_lightning.loggers import CSVLogger, TensorBoardLogger +from deepforest.callbacks import EvaluationCallback, ImagesCallback from deepforest.conf.schema import Config as StructuredConfig from deepforest.main import deepforest from deepforest.visualize import plot_results -def train(config: DictConfig) -> None: +def train( + config: DictConfig, + checkpoint: bool = True, + comet: bool = False, + tensorboard: bool = False, + trace: bool = False, + compress: bool = False, + resume: str | None = None, +) -> bool: + """Train a DeepForest model with configurable logging and experiment + tracking. + + This training function sets up PyTorch Lightning trainer with various logging + options including CSV, TensorBoard, and Comet ML. When Comet logging is enabled, + the experiment ID is automatically captured and stored in the model's + hyperparameters for later use. + + Args: + config (DictConfig): Hydra configuration containing model and training parameters + checkpoint (bool, optional): Whether to enable model checkpointing. Defaults to True. + comet (bool, optional): Whether to enable Comet ML logging. Requires comet-ml + package and proper environment variables (COMET_API_KEY, COMET_WORKSPACE). + Defaults to False. + tensorboard (bool, optional): Whether to enable TensorBoard logging in addition + to CSV logging. Defaults to False. + trace (bool, optional): Whether to enable PyTorch memory profiling for debugging. + Only works when CUDA is available. Defaults to False. + compress (bool, optional): Whether to compress prediction CSV files using gzip for + better storage efficiency. Defaults to False. + + Returns: + bool: True if training completed successfully, False if training failed + + Note: + When Comet logging is enabled, the experiment ID (key) is automatically added + to the model's hyperparameters as 'experiment_id' for later re-logging to + the same experiment. 
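+
+    The resume argument, when provided, is forwarded to trainer.fit as
+    ckpt_path so training continues from a saved Lightning checkpoint.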
+ """ + + if trace: + if not torch.cuda.is_available(): + warnings.warn("Cuda is not available, skipping trace.", stacklevel=2) + else: + torch.cuda.memory._record_memory_history() + + # Set matmul precision + torch.set_float32_matmul_precision(config.matmul_precision) + m = deepforest(config=config) - m.trainer.fit(m) + + callbacks = [] + loggers = [] + log_root = Path(config.train.log_root) + + # Use defaults from Lightning unless overriden by Comet + experiment_name = None + experiment_id = None + # Store as %YYYY%mm%ddT%HH:%MM:%SS + version = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + + # Comet setup requires an external dependency + if comet and not m.config.train.fast_dev_run: + try: + from pytorch_lightning.loggers import CometLogger + + comet_logger = CometLogger( + api_key=os.environ.get("COMET_API_KEY"), + workspace=os.environ.get("COMET_WORKSPACE"), + project=os.environ.get("COMET_PROJECT", default="DeepForest"), + offline_directory=config.train.log_root, + ) + + experiment_name = comet_logger.experiment.get_name() + # Store experiment ID (key) for later re-logging to comet + experiment_id = comet_logger.experiment.get_key() + version = "" + loggers.append(comet_logger) + except ImportError: + warnings.warn( + "Failed to import Comet, check if comet-ml is installed", stacklevel=2 + ) + except Exception as e: + warnings.warn(f"Failed to set up comet logger. {e}", stacklevel=2) + else: + callbacks.append(DeviceStatsMonitor()) + + # By default, create a CSV logger and monitor stats + csv_logger = CSVLogger(save_dir=log_root, name=experiment_name, version=version) + loggers.append(csv_logger) + + if tensorboard: + tensorboard_logger = TensorBoardLogger( + save_dir=log_root, + sub_dir="tensorboard", + name=experiment_name, + version=version, + ) + loggers.append(tensorboard_logger) + + callbacks.append( + ImagesCallback( + save_dir=Path(csv_logger.log_dir) / "images", + every_n_epochs=config.validation.val_accuracy_interval, + select_random=True, + ) + ) + + evaluation_path = Path(csv_logger.log_dir) / "predictions" + callbacks.append( + EvaluationCallback( + save_dir=evaluation_path, + compress=compress, + every_n_epochs=config.validation.val_accuracy_interval, + run_evaluation=True, + ) + ) + + # Setup checkpoint to store in log directory + if checkpoint: + checkpoint_callback = ModelCheckpoint( + dirpath=Path(csv_logger.log_dir) / "checkpoints", + filename=f"{config.architecture}-{{epoch:02d}}-{{map_50:.2f}}", + monitor="val_loss", + mode="min", + save_top_k=1, + save_last=True, + ) + # Using equals causes a lot of strife with Hydra, so use colon instead. + checkpoint_callback.CHECKPOINT_EQUALS_CHAR = ":" + callbacks.append(checkpoint_callback) + + m.create_trainer( + logger=loggers, + callbacks=callbacks, + gradient_clip_val=0.5, + accelerator=config.accelerator, + precision=config.training_precision, + strategy="ddp_find_unused_parameters_true" + if torch.cuda.is_available() and "dino" in config.model.name + else "auto", + ) + + # Add experiment ID to hyperparameters if available + if experiment_id is not None: + # Update the saved hyperparameters to include experiment ID + current_hparams = m.hparams.copy() + current_hparams["experiment_id"] = experiment_id + m.save_hyperparameters(current_hparams) + + train_success = False + try: + m.trainer.fit(m, ckpt_path=resume) + train_success = True + except Exception as e: + warnings.warn( + f"Training failed with exception {e}. 
Will attempt to upload any existing checkpoints if enabled.", + stacklevel=2, + ) + warnings.warn(traceback.format_exc(), stacklevel=2) + + if trace and torch.cuda.is_available(): + torch.cuda.memory._dump_snapshot( + filename=Path(csv_logger.log_dir) / "dump_snapshot.pickle" + ) + + # Upload predictions + if comet: + for logger in m.trainer.loggers: + m.print("Uploading predictions") + + if hasattr(logger.experiment, "log_artifact"): + logger.experiment.log_asset_folder(evaluation_path, log_file_name=True) + + if checkpoint: + for logger in m.trainer.loggers: + if hasattr(logger.experiment, "log_model"): + for checkpoint in glob.glob( + os.path.join((checkpoint_callback.dirpath), "*.ckpt") + ): + m.print(f"Uploading checkpoint {checkpoint}") + logger.experiment.log_model( + name=os.path.basename(checkpoint), file_or_folder=str(checkpoint) + ) + + return train_success def predict( config: DictConfig, - input_path: str, + input_path: str | None = None, output_path: str | None = None, plot: bool | None = False, + root_dir: str | None = None, ) -> None: - """Run prediction for the given image, optionally saving the results to the - provided path and optionally visualizing the results. + """Run prediction for the given image or CSV file, optionally saving the + results to the provided path and optionally visualizing the results. Args: config (DictConfig): Hydra configuration. - input_path (str): Path to the input image. + input_path (Optional[str]): Path to the input image or CSV file. If None, uses config.validation.csv_file. output_path (Optional[str]): Path to save the prediction results. plot (Optional[bool]): Whether to plot the results. + root_dir (Optional[str]): Root directory containing images when input_path is a CSV file. Returns: None """ + # Set matmul precision + torch.set_float32_matmul_precision(config.matmul_precision) + m = deepforest(config=config) - res = m.predict_tile( - path=input_path, - patch_size=config.patch_size, - patch_overlap=config.patch_overlap, - iou_threshold=config.nms_thresh, - ) + + # Use validation CSV from config if not provided + if input_path is None: + if config.validation.csv_file is None: + raise ValueError( + "No input file provided and config.validation.csv_file is not set" + ) + input_path = config.validation.csv_file + print(f"Using validation CSV from config: {input_path}") + + # Use validation root_dir from config if not provided and input is CSV + if input_path.endswith(".csv") and root_dir is None: + root_dir = config.validation.root_dir + if root_dir is not None: + print(f"Using root directory from config: {root_dir}") + + if input_path.endswith(".csv"): + # CSV batch prediction + res = m.predict_file( + csv_file=input_path, + root_dir=root_dir, + batch_size=config.batch_size, + ) + else: + # Single image prediction + res = m.predict_tile( + path=input_path, + patch_size=config.patch_size, + patch_overlap=config.patch_overlap, + iou_threshold=config.nms_thresh, + ) if output_path is not None: - os.makedirs(os.path.dirname(output_path), exist_ok=True) + if os.path.dirname(output_path): + os.makedirs(os.path.dirname(output_path), exist_ok=True) res.to_csv(output_path, index=False) if plot: plot_results(res) +def evaluate( + config: DictConfig, + csv_file: str | None = None, + root_dir: str | None = None, + predictions_csv: str | None = None, + iou_threshold: float | None = None, + batch_size: int | None = None, + size: int | None = None, + experiment_id: str | None = None, + output_path: str | None = None, + save_predictions: str | None = 
None, +) -> None: + """Run evaluation on ground truth annotations, optionally logging to Comet. + + This function evaluates model predictions against ground truth annotations. + There are two workflows: + + 1. Provide existing predictions via --predictions-csv: + deepforest evaluate ground_truth.csv --predictions-csv predictions.csv + + 2. Generate predictions during evaluation: + deepforest evaluate ground_truth.csv --root-dir /path/to/images + + Optionally save generated predictions with --save-predictions: + deepforest evaluate ground_truth.csv --root-dir /path/to/images \\ + --save-predictions predictions.csv -o eval_results.csv + + Args: + config (DictConfig): Hydra configuration. + csv_file (Optional[str]): Path to ground truth CSV file with annotations. If None, uses config.validation.csv_file. + root_dir (Optional[str]): Root directory containing images. If None, uses config value or directory of csv_file. + predictions_csv (Optional[str]): Path to existing predictions CSV file. If None, generates predictions. + iou_threshold (Optional[float]): IoU threshold for evaluation. If None, uses config value. + batch_size (Optional[int]): Batch size for prediction. If None, uses config value. + size (Optional[int]): Size to resize images for prediction. If None, no resizing. + experiment_id (Optional[str]): Comet experiment ID to log results to. + output_path (Optional[str]): Path to save evaluation metrics summary CSV. + save_predictions (Optional[str]): Path to save generated predictions CSV. Only used when predictions_csv is None. + + Returns: + None + """ + # Set matmul precision + torch.set_float32_matmul_precision(config.matmul_precision) + + m = deepforest(config=config) + + # Use validation CSV from config if not provided + if csv_file is None: + if config.validation.csv_file is None: + raise ValueError( + "No CSV file provided and config.validation.csv_file is not set" + ) + csv_file = config.validation.csv_file + print(f"Using validation CSV from config: {csv_file}") + + # Use validation root_dir from config if not provided + if root_dir is None: + root_dir = config.validation.root_dir + if root_dir is not None: + print(f"Using root directory from config: {root_dir}") + + # Run evaluation + results = m.evaluate( + csv_file=csv_file, + root_dir=root_dir, + iou_threshold=iou_threshold, + batch_size=batch_size, + size=size, + predictions=predictions_csv, + ) + + # Save generated predictions if requested and they were generated (not loaded from file) + if save_predictions is not None and predictions_csv is None: + predictions_df = results.get("predictions") + if predictions_df is not None and not predictions_df.empty: + if os.path.dirname(save_predictions): + os.makedirs(os.path.dirname(save_predictions), exist_ok=True) + predictions_df.to_csv(save_predictions, index=False) + print(f"\nGenerated predictions saved to: {save_predictions}") + else: + print("\nWarning: No predictions to save (predictions dataframe is empty)") + elif save_predictions is not None and predictions_csv is not None: + print( + "\nNote: --save-predictions is ignored when --predictions-csv is provided (predictions already exist)" + ) + + # Print results to console + print("Evaluation Results:") + print("=" * 50) + for key, value in results.items(): + if key not in ["predictions", "results", "ground_df", "class_recall"]: + if value is not None: + print(f"{key}: {value}") + + # Print class-specific results if available + if results.get("class_recall") is not None: + print("\nClass-specific Results:") + print("-" * 
30) + for _, row in results["class_recall"].iterrows(): + label_name = m.numeric_to_label_dict[row["label"]] + print( + f"{label_name} - Recall: {row['recall']:.4f}, Precision: {row['precision']:.4f}" + ) + + # Log to Comet if experiment ID provided + if experiment_id is not None: + try: + from pytorch_lightning.loggers import CometLogger + + comet_logger = CometLogger( + api_key=os.environ.get("COMET_API_KEY"), + workspace=os.environ.get("COMET_WORKSPACE"), + project=os.environ.get("COMET_PROJECT", default="DeepForest"), + experiment_key=experiment_id, # Re-log to existing experiment + ) + + # Log evaluation metrics + for key, value in results.items(): + if key not in ["predictions", "results", "ground_df", "class_recall"]: + if value is not None: + comet_logger.experiment.log_metric(key, value) + + # Log class-specific metrics + if results.get("class_recall") is not None: + for _, row in results["class_recall"].iterrows(): + label_name = m.numeric_to_label_dict[row["label"]] + comet_logger.experiment.log_metric( + f"{label_name}_Recall", row["recall"] + ) + comet_logger.experiment.log_metric( + f"{label_name}_Precision", row["precision"] + ) + + print(f"\nResults logged to Comet experiment: {experiment_id}") + + except ImportError: + warnings.warn( + "Failed to import Comet, skipping experiment logging", stacklevel=2 + ) + except Exception as e: + warnings.warn(f"Failed to log to Comet experiment. {e}", stacklevel=2) + + # Save results to CSV if output path provided + if output_path is not None: + import pandas as pd + + # Create a summary dataframe with evaluation metrics + summary_data = [] + for key, value in results.items(): + if key not in ["predictions", "results", "ground_df", "class_recall"]: + if value is not None: + summary_data.append({"metric": key, "value": value}) + + # Add class-specific results if available + if results.get("class_recall") is not None: + for _, row in results["class_recall"].iterrows(): + label_name = m.numeric_to_label_dict[row["label"]] + summary_data.append( + {"metric": f"{label_name}_Recall", "value": row["recall"]} + ) + summary_data.append( + {"metric": f"{label_name}_Precision", "value": row["precision"]} + ) + + summary_df = pd.DataFrame(summary_data) + if os.path.dirname(output_path): + os.makedirs(os.path.dirname(output_path), exist_ok=True) + summary_df.to_csv(output_path, index=False) + print(f"\nEvaluation results saved to: {output_path}") + + def main(): parser = argparse.ArgumentParser(description="DeepForest CLI") subparsers = parser.add_subparsers(dest="command") # Train subcommand - _ = subparsers.add_parser( + train_parser = subparsers.add_parser( "train", - help="Train a model", + help="Train a model. 
It is strongly recommended that you enable either Tensorboard or Comet logging so you can track your experiment visually.",
         epilog="Any remaining arguments will be passed to Hydra to override the current config.",
     )
+    train_parser.add_argument(
+        "--disable-checkpoint", help="Disable model checkpointing.", action="store_true"
+    )
+    train_parser.add_argument(
+        "--comet",
+        help="Enable logging to Comet ML, requires comet to be logged in.",
+        action="store_true",
+    )
+    train_parser.add_argument(
+        "--tensorboard",
+        help="Enable logging to Tensorboard",
+        action="store_true",
+    )
+    train_parser.add_argument(
+        "--trace",
+        help="Enable PyTorch memory profiling.",
+        action="store_true",
+    )
+    train_parser.add_argument(
+        "--compress",
+        help="Compress prediction CSV files using gzip for better storage efficiency.",
+        action="store_true",
+    )

     # Predict subcommand
     predict_parser = subparsers.add_parser(
         "predict",
-        help="Run prediction on input",
+        help="Run prediction on input image or CSV file",
         epilog="Any remaining arguments will be passed to Hydra to override the current config.",
     )
-    predict_parser.add_argument("input", help="Path to input raster")
+    predict_parser.add_argument(
+        "input",
+        nargs="?",
+        help="Path to input image or CSV file (optional if validation CSV specified in config)",
+    )
     predict_parser.add_argument("-o", "--output", help="Path to prediction results")
     predict_parser.add_argument("--plot", action="store_true", help="Plot results")
+    predict_parser.add_argument(
+        "--root-dir", help="Root directory containing images (required when input is CSV)"
+    )
+
+    # Evaluate subcommand
+    evaluate_parser = subparsers.add_parser(
+        "evaluate",
+        help="Run evaluation on ground truth annotations. Use --predictions-csv to provide existing predictions, or omit to generate them.",
+        epilog="Any remaining arguments will be passed to Hydra to override the current config.",
+    )
+    evaluate_parser.add_argument(
+        "csv_file",
+        nargs="?",
+        help="Path to ground truth CSV file (optional if specified in config)",
+    )
+    evaluate_parser.add_argument(
+        "--root-dir",
+        help="Root directory containing images (required when generating predictions)",
+    )
+    evaluate_parser.add_argument(
+        "--predictions-csv",
+        help="Path to existing predictions CSV file.
If not provided, predictions will be generated.", + ) + evaluate_parser.add_argument( + "--save-predictions", + help="Path to save generated predictions CSV (only used when --predictions-csv is not provided)", + ) + evaluate_parser.add_argument( + "--iou-threshold", type=float, help="IoU threshold for evaluation" + ) + evaluate_parser.add_argument( + "--batch-size", type=int, help="Batch size for prediction" + ) + evaluate_parser.add_argument( + "--size", type=int, help="Size to resize images for prediction" + ) + evaluate_parser.add_argument( + "--experiment-id", help="Comet experiment ID to log results to" + ) + evaluate_parser.add_argument( + "-o", "--output", help="Path to save evaluation metrics summary CSV" + ) # Show config subcommand subparsers.add_parser("config", help="Show the current config") @@ -90,9 +554,39 @@ def main(): cfg = OmegaConf.merge(base, cfg) if args.command == "predict": - predict(cfg, input_path=args.input, output_path=args.output, plot=args.plot) + predict( + cfg, + input_path=args.input, + output_path=args.output, + plot=args.plot, + root_dir=args.root_dir, + ) elif args.command == "train": - train(cfg) + res = train( + cfg, + checkpoint=not args.disable_checkpoint, + comet=args.comet, + tensorboard=args.tensorboard, + trace=args.trace, + compress=args.compress, + ) + + sys.exit(0 if res else 1) + + elif args.command == "evaluate": + evaluate( + cfg, + csv_file=args.csv_file, + root_dir=args.root_dir, + predictions_csv=args.predictions_csv, + iou_threshold=args.iou_threshold, + batch_size=args.batch_size, + size=args.size, + experiment_id=args.experiment_id, + output_path=args.output, + save_predictions=args.save_predictions, + ) + elif args.command == "config": print(OmegaConf.to_yaml(cfg, resolve=True)) diff --git a/src/deepforest/scripts/evaluate.py b/src/deepforest/scripts/evaluate.py new file mode 100644 index 000000000..d263c3f3a --- /dev/null +++ b/src/deepforest/scripts/evaluate.py @@ -0,0 +1,730 @@ +"""Parallelizable evaluation script for DeepForest predictions.""" + +import argparse +import json +import logging +import multiprocessing as mp +import os +import tempfile +import time +import warnings +from collections import namedtuple +from pathlib import Path + +import numpy as np +import pandas as pd +import yaml +from hydra import compose, initialize, initialize_config_dir +from omegaconf import OmegaConf +from tqdm import tqdm + +try: + import comet_ml + + COMET_AVAILABLE = True +except ImportError: + COMET_AVAILABLE = False + +from deepforest import utilities +from deepforest.conf.schema import Config as StructuredConfig +from deepforest.evaluate import ( + _box_recall_image, + compute_class_recall, +) + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s | %(levelname)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) +logger = logging.getLogger(__name__) + +# Worker task structure +WorkerTask = namedtuple( + "WorkerTask", + ["predictions_file", "ground_truth_file", "iou_threshold", "output_file", "rank"], +) + + +def _basename(series: pd.Series) -> pd.Series: + """Vectorized extraction of final path component for mixed / and \\.""" + return series.astype("string").str.replace(r"^.*[\\/]", "", regex=True) + + +def _get_metric_label(epoch: int = None, step: int = None) -> str: + """Get display label for epoch/step combination.""" + return f"epoch {epoch}" if epoch is not None else f"step {step}" + + +def _clean_unnamed_columns(df: pd.DataFrame) -> pd.DataFrame: + """Remove unnamed columns from DataFrame.""" + unnamed_cols = [col for col in 
df.columns if col.startswith("Unnamed:")] + return df.drop(columns=unnamed_cols) if unnamed_cols else df + + +def _should_skip_processing( + pred_csv: str, experiment_id: str = None, epoch: int = None, step: int = None +) -> tuple[bool, str]: + """Check if processing should be skipped and return reason.""" + if is_already_processed(pred_csv): + return True, "semaphore exists" + elif experiment_id and metric_on_comet(experiment_id, epoch=epoch, step=step): + return True, "found in Comet" + return False, None + + +def discover_prediction_files( + log_dir: str, +) -> tuple[dict, list[tuple[str, str, int, int]]]: + """Discover prediction files in experiment log directory. + + Returns: (hparams_dict, [(prediction_csv, ground_truth_csv, epoch, step), ...]) + """ + log_path = Path(log_dir) + + # Load hparams + with open(log_path / "hparams.yaml") as f: + hparams = yaml.safe_load(f) + + # Find metadata files and pair with CSVs + file_pairs = [] + for json_file in (log_path / "predictions").glob("*_metadata.json"): + with open(json_file) as f: + metadata = json.load(f) + + # Find matching CSV + base_name = json_file.stem.replace("_metadata", "") + for ext in [".csv", ".csv.gz"]: + csv_file = json_file.parent / f"{base_name}{ext}" + if csv_file.exists(): + epoch = metadata.get("epoch") + step = metadata.get("current_step") + file_pairs.append( + (str(csv_file), metadata["target_csv_file"], epoch, step) + ) + break + + # Sort by epoch (priority) then step, sorting missing values last (epoch/step 0 is valid) + return hparams, sorted(file_pairs, key=lambda x: (x[2] if x[2] is not None else 999, x[3] if x[3] is not None else 999)) + + +def is_already_processed(pred_csv_path: str) -> bool: + """Check if prediction file has already been processed.""" + semaphore_path = Path(pred_csv_path).with_suffix(".processed") + return semaphore_path.exists() + + +def metric_on_comet( + experiment_id: str, + epoch: int = None, + step: int = None, + metric_name: str = "box_recall", +) -> bool: + """Check if metric exists for the given epoch/step combination in Comet + experiment.""" + if not COMET_AVAILABLE: + return False + + try: + api = comet_ml.API() + experiment = api.get_experiment_by_key(experiment_id) + metrics = experiment.get_metrics(metric=metric_name) + + # Check if any metric entry matches our epoch/step combination + for metric in metrics: + metric_epoch = metric.get("epoch") + metric_step = metric.get("step") + + # Match if both epoch and step align (when provided) + epoch_match = epoch is None or metric_epoch == epoch + step_match = step is None or metric_step == step + + if epoch_match and step_match: + return True + + except Exception as e: + logger.warning(f"Could not check Comet metrics (epoch={epoch}, step={step}): {e}") + + return False + + +def save_results(results: dict, pred_csv_path: str): + """Save evaluation results alongside prediction file.""" + pred_path = Path(pred_csv_path) + + # Convert DataFrames for JSON serialization + serializable_results = { + k: v.to_dict("records") if isinstance(v, pd.DataFrame) else v + for k, v in results.items() + } + + # Write results (handle compression) + if pred_path.suffix == ".gz": + import gzip + + results_path = pred_path.with_suffix(".results.json.gz") + with gzip.open(results_path, "wt") as f: + json.dump(serializable_results, f, indent=2, default=str) + else: + results_path = pred_path.with_suffix(".results.json") + with open(results_path, "w") as f: + json.dump(serializable_results, f, indent=2, default=str) + + +def create_semaphore(pred_csv_path: str, epoch: int = None, step: int = None): + """Create semaphore file to 
indicate processing is complete.""" + semaphore_path = Path(pred_csv_path).with_suffix(".processed") + with open(semaphore_path, "w") as f: + json.dump({"epoch": epoch, "step": step, "timestamp": time.time()}, f) + + +def log_to_comet( + experiment_id: str, + metrics: dict, + epoch: int = None, + step: int = None, + label_dict: dict = None, +): + """Log metrics to Comet experiment using native epoch and step + parameters.""" + if not COMET_AVAILABLE: + logger.warning("Comet ML not available, skipping logging") + return False + + try: + experiment = comet_ml.ExistingExperiment(experiment_key=experiment_id) + + # Log basic metrics with both epoch and step + experiment.log_metric("box_recall", metrics["box_recall"], epoch=epoch, step=step) + experiment.log_metric( + "box_precision", metrics["box_precision"], epoch=epoch, step=step + ) + + # Log class-specific metrics + if metrics["class_recall"] is not None and label_dict is not None: + numeric_to_label = {v: k for k, v in label_dict.items()} + for _, row in metrics["class_recall"].iterrows(): + label_name = numeric_to_label.get(row["label"], f"class_{row['label']}") + experiment.log_metric( + f"{label_name}_Recall", row["recall"], epoch=epoch, step=step + ) + experiment.log_metric( + f"{label_name}_Precision", row["precision"], epoch=epoch, step=step + ) + + logger.info(f"Logged metrics to Comet (epoch={epoch}, step={step})") + return True + + except Exception as e: + logger.error(f"Failed to log to Comet: {e}") + return False + + +def process_experiment_log(log_dir: str, args): + """Process experiment log directory and evaluate all predictions.""" + try: + hparams, file_pairs = discover_prediction_files(log_dir) + experiment_id = hparams.get("experiment_id") + label_dict = hparams.get("config", {}).get("label_dict") + + if args.dry_run: + logger.info("Dry run mode - would process the following files:") + for pred_csv, _gt_csv, epoch, step in file_pairs: + should_skip, reason = _should_skip_processing( + pred_csv, experiment_id, epoch, step + ) + status = f"SKIPPED ({reason})" if should_skip else "PROCESS" + metric_label = _get_metric_label(epoch, step) + logger.info(f" {metric_label}: {Path(pred_csv).name} - {status}") + return + + # Process each file pair + for pred_csv, gt_csv, epoch, step in file_pairs: + metric_label = _get_metric_label(epoch, step) + should_skip, reason = _should_skip_processing( + pred_csv, experiment_id, epoch, step + ) + + if should_skip: + logger.info(f"{metric_label}: {reason.title()}, skipping") + if reason == "found in Comet": + create_semaphore(pred_csv, epoch=epoch, step=step) + continue + + # Process the file + logger.info(f"Processing {metric_label}: {Path(pred_csv).name}") + + # Load data with robust parsing to handle inconsistent field counts + predictions = pd.read_csv(pred_csv, on_bad_lines="skip") + ground_truth = pd.read_csv(gt_csv) + + # Drop any unwanted columns + predictions = _clean_unnamed_columns(predictions) + + # Run evaluation + results = evaluate_boxes_parallel( + predictions=predictions, + ground_df=ground_truth, + iou_threshold=args.iou_threshold, + num_workers=args.workers, + temp_dir=args.working_dir, + ) + + # Log results + logger.info( + f"{metric_label} - Box Recall: {results['box_recall']:.4f}, Box Precision: {results['box_precision']:.4f}" + ) + + # Save results and log to Comet + save_results(results, pred_csv) + comet_success = False + if experiment_id: + comet_success = log_to_comet( + experiment_id, results, epoch=epoch, step=step, label_dict=label_dict + ) + + # Create semaphore 
only if Comet logging succeeded (or no experiment_id) + if comet_success or not experiment_id: + create_semaphore(pred_csv, epoch=epoch, step=step) + + except Exception as e: + logger.error(f"Error processing experiment log directory: {e}") + + +def shard_dataframes( + predictions_df: pd.DataFrame, + ground_truth_df: pd.DataFrame, + num_workers: int, + output_dir: Path, +) -> list[tuple[str, str, list[str]]]: + """Shard by image basename with vectorized ops.""" + + # Copy to avoid mutating callers + preds = predictions_df.copy() + gts = ground_truth_df.copy() + + # Normalize image_path to basenames (vectorized; handles / and \) + preds.drop(columns="geometry", errors="ignore", inplace=True) + preds["image_path"] = _basename(preds["image_path"]) + gts["image_path"] = _basename(gts["image_path"]) + + # Unique basenames from GT define sharding universe + unique_images = gts["image_path"].unique() + if len(unique_images) == 0: + return [] + + # Do not create more shards than images + num_workers = max(1, min(num_workers, len(unique_images))) + + # Contiguous partition like original + images_per_worker = len(unique_images) // num_workers + remainder = len(unique_images) % num_workers + + # Map basename -> shard id + worker_map = {} + start = 0 + for wid in range(num_workers): + n = images_per_worker + (1 if wid < remainder else 0) + if n == 0: + continue + imgs = unique_images[start : start + n] + worker_map.update(dict.fromkeys(imgs, wid)) + start += n + + # Assign shard ids via vectorized map + gts["_shard"] = gts["image_path"].map(worker_map).astype("Int64") + preds["_shard"] = preds["image_path"].map(worker_map).astype("Int64") + + # Prepare output + output_dir.mkdir(parents=True, exist_ok=True) + worker_files: list[tuple[str, str, list[str]]] = [] + + # Write one CSV pair per shard present + present_shards = np.sort(gts["_shard"].dropna().unique()) + for wid in present_shards: + wid = int(wid) + gts_w = gts[gts["_shard"] == wid].drop(columns=["_shard"]) + preds_w = preds[preds["_shard"] == wid].drop(columns=["_shard"]) + img_list = gts_w["image_path"].unique().tolist() + + if len(img_list) == 0: + continue + + pred_file = output_dir / f"worker_{wid}_predictions.csv" + gt_file = output_dir / f"worker_{wid}_ground_truth.csv" + preds_w.to_csv(pred_file, index=False) + gts_w.to_csv(gt_file, index=False) + + worker_files.append((str(pred_file), str(gt_file), img_list)) + + return worker_files + + +def process(task: WorkerTask) -> dict: + """Worker function to evaluate a shard of images.""" + assert os.path.exists(task.predictions_file) + assert os.path.exists(task.ground_truth_file) + + predictions = pd.read_csv(task.predictions_file) + ground_df = pd.read_csv(task.ground_truth_file) + + predictions = utilities.to_gdf(predictions) + ground_df = utilities.to_gdf(ground_df) + + # Pre-group predictions by image for efficient access + predictions_by_image = { + name: group.reset_index(drop=True) + for name, group in predictions.groupby("image_path") + } + + results = [] + box_recalls = [] + box_precisions = [] + + groups = ground_df.groupby("image_path") + pbar = tqdm(groups, total=len(groups), disable=task.rank != 0) + + for image_path, image_ground_truth in pbar: + image_predictions = predictions_by_image.get(image_path, pd.DataFrame()) + if not isinstance(image_predictions, pd.DataFrame) or image_predictions.empty: + image_predictions = pd.DataFrame() + recall, precision, result = _box_recall_image( + image_predictions, image_ground_truth, iou_threshold=task.iou_threshold + ) + + if precision: + 
box_precisions.append(precision) + box_recalls.append(recall) + results.append(result) + + if results: + combined_results = pd.concat(results, ignore_index=True) + matched_results = ( + combined_results[combined_results.match] + if "match" in combined_results.columns + else pd.DataFrame() + ) + local_class_metrics = ( + compute_class_recall(matched_results) if not matched_results.empty else None + ) + else: + combined_results = pd.DataFrame() + local_class_metrics = None + + return { + "box_recalls": box_recalls, + "box_precisions": box_precisions, + "class_metrics": local_class_metrics, + "worker_id": os.path.basename(task.predictions_file).split("_")[1], + "num_results": len(combined_results) if not combined_results.empty else 0, + } + + +def reduce(results): + # Collect and aggregate results without file I/O + all_box_recalls = [] + all_box_precisions = [] + worker_class_metrics = [] + total_results = 0 + + for result in results: + all_box_recalls.extend(result["box_recalls"]) + all_box_precisions.extend(result["box_precisions"]) + if result.get("class_metrics") is not None: + worker_class_metrics.append(result["class_metrics"]) + total_results += result.get("num_results", 0) + + # Skip expensive file operations entirely + combined_results = pd.DataFrame() # Empty for return compatibility + + # Calculate final metrics using vectorized numpy operations + box_recall = np.mean(all_box_recalls) if all_box_recalls else 0 + box_precision = np.mean(all_box_precisions) if all_box_precisions else np.nan + + # Aggregate class metrics from workers + class_recall = None + if worker_class_metrics: + class_aggregation = {} + + for worker_metrics in worker_class_metrics: + for _, row in worker_metrics.iterrows(): + label = row["label"] + if label not in class_aggregation: + class_aggregation[label] = { + "true_positives": 0, + "total_ground_truth": 0, + "total_predictions": 0, + } + + # Accumulate counts from each worker + true_positives = row["recall"] * row["size"] + class_aggregation[label]["true_positives"] += true_positives + class_aggregation[label]["total_ground_truth"] += row["size"] + + if row["precision"] > 0: + total_predictions = true_positives / row["precision"] + class_aggregation[label]["total_predictions"] += total_predictions + + # Calculate final aggregated metrics + class_data = [] + for label, counts in class_aggregation.items(): + final_recall = ( + counts["true_positives"] / counts["total_ground_truth"] + if counts["total_ground_truth"] > 0 + else 0 + ) + final_precision = ( + counts["true_positives"] / counts["total_predictions"] + if counts["total_predictions"] > 0 + else 0 + ) + + class_data.append( + { + "label": label, + "recall": final_recall, + "precision": final_precision, + "size": int(counts["total_ground_truth"]), + } + ) + + class_recall = pd.DataFrame(class_data) + + return combined_results, box_recall, box_precision, class_recall + + +def evaluate_boxes_parallel( + predictions: pd.DataFrame, + ground_df: pd.DataFrame, + iou_threshold: float = 0.4, + num_workers: int = None, + temp_dir: str = None, +) -> dict: + """Parallel version of evaluate_boxes function. 
+ + Args: + predictions: Predictions dataframe + ground_df: Ground truth dataframe + iou_threshold: IoU threshold for evaluation + num_workers: Number of worker processes (default: CPU count) + temp_dir: Temporary directory for worker files + + Returns: + Dictionary with evaluation results (same format as evaluate_boxes) + """ + + # Break early if empty predictions or GT + if predictions.empty: + return { + "results": None, + "box_recall": 0, + "box_precision": np.nan, + "class_recall": None, + } + elif ground_df.empty: + return { + "results": None, + "box_recall": None, + "box_precision": 0, + "class_recall": None, + } + + if num_workers is None: + num_workers = mp.cpu_count() + + # Create temporary directory for worker files + if temp_dir is None: + temp_dir_obj = tempfile.TemporaryDirectory() + temp_dir_path = Path(temp_dir_obj.name) + else: + temp_dir_path = Path(temp_dir) + temp_dir_path.mkdir(exist_ok=True) + temp_dir_obj = None + + try: + # Remove empty samples from ground truth + ground_df = ground_df[~((ground_df.xmin == 0) & (ground_df.xmax == 0))] + + # Create sharded files + logger.info("Sharding dataframes...") + worker_files = shard_dataframes( + predictions, ground_df, num_workers, temp_dir_path + ) + + if not worker_files: + warnings.warn( + "No worker files created - possibly no data to process", stacklevel=2 + ) + return { + "results": None, + "box_recall": 0, + "box_precision": np.nan, + "class_recall": None, + } + + # Prepare worker arguments + worker_args = [] + for rank, (pred_file, gt_file, _) in enumerate(worker_files): + output_file = temp_dir_path / f"worker_{rank}_results.csv" + worker_args.append( + WorkerTask(pred_file, gt_file, iou_threshold, str(output_file), rank) + ) + + # Run parallel evaluation + logger.info(f"Running parallel evaluation with {len(worker_args)} workers...") + t_start = time.time() + if num_workers > 1: + # Use spawn context to avoid resource sharing issues + ctx = mp.get_context("spawn") + with ctx.Pool(processes=min(len(worker_args), num_workers)) as pool: + try: + worker_results = list(pool.imap(process, worker_args)) + except KeyboardInterrupt: + pool.terminate() + pool.join() + raise + else: + worker_results = [process(worker_args[0])] + + t_compute = time.time() - t_start + logger.info(f"Parallel computation completed in {t_compute:.2f}s") + + # Reduce over results + t_reduce_start = time.time() + results, box_recall, box_precision, class_recall = reduce(worker_results) + t_reduce = time.time() - t_reduce_start + logger.info(f"Result aggregation completed in {t_reduce:.2f}s") + + t_elapsed = time.time() - t_start + logger.info(f"Total evaluation time: {t_elapsed:.2f}s") + + return { + "results": results if not results.empty else None, + "box_recall": box_recall, + "box_precision": box_precision, + "class_recall": class_recall, + } + + finally: + # Clean up temporary files explicitly + if temp_dir_path.exists(): + try: + for temp_file in temp_dir_path.glob("*"): + if temp_file.is_file(): + temp_file.unlink() + except Exception: + pass # Ignore cleanup errors + + if temp_dir_obj is not None: + temp_dir_obj.cleanup() + + +def main(): + """Main CLI function for parallel evaluation.""" + parser = argparse.ArgumentParser( + description="Parallel evaluation of DeepForest predictions", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + # Create mutually exclusive group for input modes + input_group = parser.add_mutually_exclusive_group(required=True) + input_group.add_argument("preds", nargs="?", help="Path to predictions CSV 
file") + input_group.add_argument("--log-dir", help="Path to experiment log directory") + + parser.add_argument( + "--gt", + nargs="?", + help="Path to ground truth CSV file (optional if validation CSV specified in config)", + ) + parser.add_argument( + "--workers", type=int, default=mp.cpu_count(), help="Number of worker processes" + ) + parser.add_argument( + "--iou-threshold", type=float, default=0.4, help="IoU threshold for evaluation" + ) + parser.add_argument( + "--working-dir", + help="Directory for temporary worker files (default: system temp)", + ) + parser.add_argument("--output", help="Path to save detailed evaluation results CSV") + parser.add_argument( + "--dry-run", + action="store_true", + help="Preview files that would be processed without logging to Comet", + ) + + # Config options for Hydra + parser.add_argument("--config-dir", help="Path to an alternative Hydra config directory (defaults to the packaged deepforest.conf)") + parser.add_argument( + "--config-name", help="Name of the Hydra config to compose", default="config" + ) + + args, overrides = parser.parse_known_args() + + # Handle experiment log mode + if args.log_dir: + process_experiment_log(args.log_dir, args) + return + + if args.config_dir is not None: + initialize_config_dir(version_base=None, config_dir=args.config_dir) + else: + initialize(version_base=None, config_path="pkg://deepforest.conf") + + base = OmegaConf.structured(StructuredConfig) + cfg = compose(config_name=args.config_name, overrides=overrides) + config = OmegaConf.merge(base, cfg) + + # Use validation CSV from config if not provided + if args.gt is None: + if config.validation.csv_file is None: + raise ValueError( + "No ground truth CSV provided and config.validation.csv_file is not set" + ) + ground_truth_csv = config.validation.csv_file + logger.info(f"Using validation CSV from config: {ground_truth_csv}") + else: + ground_truth_csv = args.gt + + predictions_csv = args.preds + + if predictions_csv is None: + raise ValueError("predictions_csv is required") + if ground_truth_csv is None: + raise ValueError("ground_truth_csv is required") + + # Load data + predictions = pd.read_csv(predictions_csv) + ground_truth = pd.read_csv(ground_truth_csv) + + logger.info( + f"Loaded {len(predictions)} predictions, {len(ground_truth)} ground truth boxes, {len(ground_truth['image_path'].unique())} images" + ) + results = evaluate_boxes_parallel( + predictions=predictions, + ground_df=ground_truth, + iou_threshold=args.iou_threshold, + num_workers=args.workers, + temp_dir=args.working_dir, + ) + + # Results + logger.info("\nEvaluation Results:") + logger.info(f"Box Recall: {results['box_recall']:.4f}") + logger.info(f"Box Precision: {results['box_precision']:.4f}") + + if results["class_recall"] is not None: + logger.info("\nClass-specific Results:") + for _, row in results["class_recall"].iterrows(): + logger.info( + f"Class {row['label']} - Recall: {row['recall']:.4f}, Precision: {row['precision']:.4f}, Size: {row['size']}" + ) + + # Save detailed results if requested + if args.output and results["results"] is not None: + results["results"].to_csv(args.output, index=False) + logger.info(f"\nDetailed results saved to: {args.output}") + + +if __name__ == "__main__": + main() diff --git a/src/deepforest/utilities.py b/src/deepforest/utilities.py index 0a4da7c93..5273e154f 100644 --- a/src/deepforest/utilities.py +++ b/src/deepforest/utilities.py @@ -340,10 +340,44 @@ def determine_geometry_type(df): geometry_type = "polygon" elif "points" in df.keys(): geometry_type = "point" + else: + raise 
ValueError( + f"Could not determine geometry type from keys {list(df.keys())}" + ) + + else: + raise ValueError(f"Unsupported data type: {type(df)}") return geometry_type +def to_gdf(df): + """Convert a DataFrame to a GeoDataFrame, building geometry from a WKT 'geometry' column or from bounding box columns (xmin, ymin, xmax, ymax).""" + if isinstance(df, gpd.GeoDataFrame) or df.empty: + return df + + # Check if we have bounding box columns and need to create geometry + if "geometry" in df.columns: + df = df.copy() + # Check if geometry column contains strings (WKT) or already contains Shapely objects + if isinstance(df["geometry"].iloc[0], str): + df["geometry"] = shapely.wkt.loads(df["geometry"]) + elif all(col in df.columns for col in ["xmin", "ymin", "xmax", "ymax"]): + # Create geometry from bounding box columns + df = df.copy() + df["geometry"] = shapely.box( + df["xmin"].to_numpy(), + df["ymin"].to_numpy(), + df["xmax"].to_numpy(), + df["ymax"].to_numpy(), + ) + else: + raise ValueError( + "Dataframe must contain a geometry column or bounding box coordinates (xmin, ymin, xmax, ymax)" + ) + + return gpd.GeoDataFrame(df, geometry="geometry") + + def format_geometry(predictions, scores=True, geom_type=None): """Format a retinanet prediction into a pandas dataframe for a batch of images Args: @@ -393,9 +427,15 @@ def format_boxes(prediction, scores=True): if scores: df["score"] = prediction["scores"].cpu().detach().numpy() - df["geometry"] = df.apply( - lambda x: shapely.geometry.box(x.xmin, x.ymin, x.xmax, x.ymax), axis=1 + geom = shapely.box( + df["xmin"].to_numpy(), + df["ymin"].to_numpy(), + df["xmax"].to_numpy(), + df["ymax"].to_numpy(), ) + + df["geometry"] = geom + return df diff --git a/src/deepforest/visualize.py b/src/deepforest/visualize.py index f98bded32..ac1c30a09 100644 --- a/src/deepforest/visualize.py +++ b/src/deepforest/visualize.py @@ -21,7 +21,8 @@ def _load_image( root_dir: str | None = None, ) -> np.typing.NDArray: """Utility function to load an image from either a path or a - prediction/annotation dataframe. + prediction/annotation dataframe. If both are passed, image takes + precedence. Returns an image in RGB format with HWC channel ordering. @@ -34,12 +35,20 @@ def _load_image( image: Numpy array """ - if image is None and df is None: - raise ValueError( - "Either an image or a valid dataframe must be provided for plotting." - ) + if image is not None: + if isinstance(image, str): + if root_dir is not None: + image_path = os.path.join(root_dir, image) + else: + image_path = image + + image = np.array(Image.open(image_path)) + elif isinstance(image, Image.Image): + image = np.array(image) + elif not isinstance(image, np.ndarray): + raise ValueError("Image should be a numpy array, path or PIL Image.") - if df is not None: + elif df is not None: # Resolve image root if hasattr(df, "root_dir") and root_dir is None: root_dir = df.root_dir @@ -54,17 +63,10 @@ def _load_image( image_path = os.path.join(root_dir, df.image_path.unique()[0]) image = np.array(Image.open(image_path)) - elif isinstance(image, str): - if root_dir is not None: - image_path = os.path.join(root_dir, image) - else: - image_path = image - - image = np.array(Image.open(image_path)) - elif isinstance(image, Image.Image): - image = np.array(image) - elif not isinstance(image, np.ndarray): - raise ValueError("Image should be a numpy array, path or PIL Image.") + else: + raise ValueError( + "Either an image or a valid dataframe must be provided for plotting." 
+            ) # Fix channel ordering if image.ndim == 3 and image.shape[0] == 3 and image.shape[2] != 3: @@ -187,8 +189,8 @@ def plot_predictions( def draw_predictions( - image: np.typing.NDArray, - df: pd.DataFrame, + image: np.typing.NDArray | str | Image.Image | None = None, + df: pd.DataFrame | None = None, color: tuple | None = None, thickness: int = 1, ) -> np.typing.NDArray: @@ -197,13 +199,15 @@ def draw_predictions( Returns a copy of the array. Args: - image: a numpy array in RGB order, HWC format + image: a numpy array, file path or PIL Image (any input accepted by _load_image) df: a pandas dataframe with xmin, xmax, ymin, ymax and label column color: color of the bounding box as a tuple of BGR color, e.g. orange annotations is (0, 165, 255) thickness: thickness of the rectangle border in px Returns: image: a numpy array with drawn annotations """ + + image = _load_image(image, df) image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR).copy() if not color: @@ -511,9 +515,10 @@ def plot_annotations( basename = os.path.splitext( os.path.basename(annotations.image_path.unique()[0]) )[0] + os.makedirs(savedir, exist_ok=True) image_name = f"{basename}.png" image_path = os.path.join(savedir, image_name) - cv2.imwrite(image_path, annotated_scene) + Image.fromarray(annotated_scene).save(image_path) else: # Display the image using Matplotlib ax.imshow(annotated_scene) @@ -602,9 +607,11 @@ def plot_results( basename = os.path.splitext(os.path.basename(results.image_path.unique()[0]))[ 0 ] + + os.makedirs(savedir, exist_ok=True) image_name = f"{basename}.png" image_path = os.path.join(savedir, image_name) - cv2.imwrite(image_path, annotated_scene) + Image.fromarray(annotated_scene).save(image_path) else: # Display the image using Matplotlib ax.imshow(annotated_scene) diff --git a/tests/test_IoU.py b/tests/test_IoU.py index c3e99b738..f89337184 100644 --- a/tests/test_IoU.py +++ b/tests/test_IoU.py @@ -2,8 +2,11 @@ import os import geopandas as gpd +import numpy as np import pandas as pd +import pytest import shapely +from shapely.geometry import box from deepforest import IoU from deepforest import get_data @@ -27,3 +30,404 @@ def test_compute_IoU(m): result = IoU.compute_IoU(ground_truth, predictions) assert result.shape[0] == ground_truth.shape[0] assert sum(result.IoU) > 10 + + +def create_test_geodataframe(boxes, scores=None): + """Helper function to create GeoDataFrame from box coordinates. 
+ + Args: + boxes: List of (xmin, ymin, xmax, ymax) tuples + scores: Optional list of confidence scores + + Returns: + GeoDataFrame with geometry and optional score columns + """ + data = { + 'geometry': [box(*coords) for coords in boxes], + 'label': [0] * len(boxes), + 'image_path': ['test.jpg'] * len(boxes) + } + + if scores is not None: + data['score'] = scores + + return gpd.GeoDataFrame(data) + + +def test_perfect_overlap(): + """Test IoU calculation for perfectly overlapping boxes.""" + # Same box coordinates for ground truth and prediction + ground_truth = create_test_geodataframe([(0, 0, 10, 10)]) + predictions = create_test_geodataframe([(0, 0, 10, 10)], scores=[0.9]) + + result = IoU.compute_IoU(ground_truth, predictions) + + # Should have perfect IoU of 1.0 + assert len(result) == 1 + assert result.iloc[0]['IoU'] == 1.0 + assert result.iloc[0]['prediction_id'] is not None + assert result.iloc[0]['score'] == 0.9 + + +def test_no_overlap(): + """Test IoU calculation for non-overlapping boxes.""" + ground_truth = create_test_geodataframe([(0, 0, 10, 10)]) + predictions = create_test_geodataframe([(20, 20, 30, 30)], scores=[0.8]) + + result = IoU.compute_IoU(ground_truth, predictions) + + # Should have IoU of 0.0 (no match) + assert len(result) == 1 + assert result.iloc[0]['IoU'] == 0.0 + # Note: Hungarian assignment may still assign prediction even with 0 IoU + # The key test is that IoU is 0.0 + + +def test_partial_overlap(): + """Test IoU calculation for partially overlapping boxes.""" + # Box 1: (0,0,10,10) area = 100 + # Box 2: (5,5,15,15) area = 100 + # Intersection: (5,5,10,10) area = 25 + # Union: 100 + 100 - 25 = 175 + # Expected IoU: 25/175 ≈ 0.143 + ground_truth = create_test_geodataframe([(0, 0, 10, 10)]) + predictions = create_test_geodataframe([(5, 5, 15, 15)], scores=[0.7]) + + result = IoU.compute_IoU(ground_truth, predictions) + + assert len(result) == 1 + assert result.iloc[0]['prediction_id'] is not None + # Should be around 0.143 (25/175) + assert 0.14 <= result.iloc[0]['IoU'] <= 0.15 + + +def test_nested_boxes(): + """Test IoU calculation for nested boxes (small inside large).""" + # Large box: (0,0,20,20) area = 400 + # Small box: (5,5,15,15) area = 100 + # Intersection: (5,5,15,15) area = 100 + # Union: 400 (large box contains small) + # Expected IoU: 100/400 = 0.25 + ground_truth = create_test_geodataframe([(0, 0, 20, 20)]) + predictions = create_test_geodataframe([(5, 5, 15, 15)], scores=[0.6]) + + result = IoU.compute_IoU(ground_truth, predictions) + + assert len(result) == 1 + assert result.iloc[0]['prediction_id'] is not None + assert result.iloc[0]['IoU'] == 0.25 + + +def test_adjacent_boxes(): + """Test IoU calculation for adjacent (touching) boxes.""" + ground_truth = create_test_geodataframe([(0, 0, 10, 10)]) + predictions = create_test_geodataframe([(10, 0, 20, 10)], scores=[0.5]) # Touching edge + + result = IoU.compute_IoU(ground_truth, predictions) + + # Adjacent boxes should have IoU close to 0 (only touching edge) + assert len(result) == 1 + assert result.iloc[0]['IoU'] == 0.0 # No area overlap + + +def test_empty_predictions(): + """Test IoU calculation with no predictions.""" + ground_truth = create_test_geodataframe([(0, 0, 10, 10)]) + predictions = create_test_geodataframe([]) # Empty predictions + + result = IoU.compute_IoU(ground_truth, predictions) + + # When there are no predictions, the result should be empty + # This matches the behavior shown in the IoU.py implementation + if len(result) == 0: + # This is acceptable - empty 
predictions result in empty matches + assert True + else: + # If result has entries, they should represent unmatched ground truth + assert len(result) == 1 + assert result.iloc[0]['IoU'] == 0.0 + assert result.iloc[0]['prediction_id'] is None + + +def test_empty_ground_truth(): + """Test IoU calculation with no ground truth.""" + ground_truth = create_test_geodataframe([]) # Empty ground truth + predictions = create_test_geodataframe([(0, 0, 10, 10)], scores=[0.9]) + + result = IoU.compute_IoU(ground_truth, predictions) + + # Should return empty result + assert len(result) == 0 + + +@pytest.mark.parametrize("gt_boxes,pred_boxes,expected_matches", [ + # Single match scenario + ([(0, 0, 10, 10)], [(1, 1, 11, 11)], 1), + # Multiple boxes, some matches + ([(0, 0, 10, 10), (20, 20, 30, 30)], [(1, 1, 11, 11)], 1), + # No matches + ([(0, 0, 10, 10)], [(50, 50, 60, 60)], 0), + # Multiple predictions for one ground truth (should pick best) + ([(0, 0, 10, 10)], [(1, 1, 11, 11), (2, 2, 12, 12)], 1), +]) +def test_matching_scenarios(gt_boxes, pred_boxes, expected_matches): + """Test various matching scenarios between ground truth and predictions.""" + ground_truth = create_test_geodataframe(gt_boxes) + predictions = create_test_geodataframe(pred_boxes, scores=[0.8] * len(pred_boxes)) + + result = IoU.compute_IoU(ground_truth, predictions) + + # Check that we get the right number of ground truth entries + assert len(result) == len(gt_boxes) + + # Count actual matches (IoU > 0) + actual_matches = sum(result['IoU'] > 0) + assert actual_matches == expected_matches + + +def test_multiple_predictions_single_ground_truth(): + """Test that Hungarian matching picks the best prediction for each ground truth.""" + ground_truth = create_test_geodataframe([(0, 0, 10, 10)]) + + # Two predictions competing for the same ground truth + # One with higher IoU (better overlap) + predictions = create_test_geodataframe([ + (1, 1, 11, 11), # Good overlap + (5, 5, 15, 15), # Worse overlap + ], scores=[0.9, 0.8]) + + result = IoU.compute_IoU(ground_truth, predictions) + + # Should have exactly one ground truth entry + assert len(result) == 1 + + # Should match to the better prediction (higher IoU) + assert result.iloc[0]['IoU'] > 0 + assert result.iloc[0]['prediction_id'] is not None + + # Should pick the prediction with better overlap (first one) + # The better overlap should have higher IoU + better_iou = (9 * 9) / (10 * 10 + 10 * 10 - 9 * 9) # Intersection 81, union 119 + assert abs(result.iloc[0]['IoU'] - better_iou) < 0.01 + + +def test_multiple_ground_truths_single_prediction(): + """Test matching when multiple ground truths compete for one prediction.""" + # Two ground truth boxes + ground_truth = create_test_geodataframe([ + (0, 0, 10, 10), + (15, 15, 25, 25) + ]) + + # One prediction that overlaps with first ground truth better + predictions = create_test_geodataframe([(1, 1, 11, 11)], scores=[0.9]) + + result = IoU.compute_IoU(ground_truth, predictions) + + # Should have two ground truth entries (one per GT box) + assert len(result) == 2 + + # Only one should be matched + matches = sum(result['IoU'] > 0) + assert matches == 1 + + # The matched one should be the first ground truth (better overlap) + matched_row = result[result['IoU'] > 0].iloc[0] + assert matched_row['truth_id'] == 0 # First ground truth index + + +def test_precision_recall_simple_scenario(): + """Test a simple scenario to validate precision/recall calculations.""" + from deepforest import evaluate + + # Setup: 2 ground truth boxes, 3 predictions + # 
- 2 true positives (good matches) + # - 1 false positive (no matching ground truth) + # - 0 false negatives (all ground truths matched) + + ground_truth = create_test_geodataframe([ + (0, 0, 10, 10), # Will match prediction 1 + (20, 20, 30, 30), # Will match prediction 2 + ]) + + predictions = create_test_geodataframe([ + (1, 1, 11, 11), # Matches GT 1 (TP) + (21, 21, 31, 31), # Matches GT 2 (TP) + (50, 50, 60, 60), # No match (FP) + ], scores=[0.9, 0.8, 0.7]) + + # Test IoU computation + iou_result = IoU.compute_IoU(ground_truth, predictions) + + # Should have 2 ground truth entries + assert len(iou_result) == 2 + + # Both should be matched + matches = sum(iou_result['IoU'] > 0) + assert matches == 2 + + # Test precision/recall through evaluation + results = evaluate.evaluate_boxes( + predictions=predictions, + ground_df=ground_truth, + iou_threshold=0.3 + ) + + # Expected: 2 TP, 1 FP, 0 FN + # Precision = TP / (TP + FP) = 2 / 3 ≈ 0.67 + # Recall = TP / (TP + FN) = 2 / 2 = 1.0 + assert abs(results['box_precision'] - 2/3) < 0.01 + assert results['box_recall'] == 1.0 + + +def test_false_negatives_scenario(): + """Test scenario with missed detections (false negatives).""" + from deepforest import evaluate + + # 3 ground truth boxes, 1 prediction + # Expected: 1 TP, 0 FP, 2 FN + ground_truth = create_test_geodataframe([ + (0, 0, 10, 10), # Will match + (20, 20, 30, 30), # Missed (FN) + (40, 40, 50, 50), # Missed (FN) + ]) + + predictions = create_test_geodataframe([ + (1, 1, 11, 11), # Matches first GT + ], scores=[0.9]) + + results = evaluate.evaluate_boxes( + predictions=predictions, + ground_df=ground_truth, + iou_threshold=0.3 + ) + + # Precision = 1/1 = 1.0 (all predictions are correct) + # Recall = 1/3 ≈ 0.33 (only 1 of 3 ground truths detected) + assert results['box_precision'] == 1.0 + assert abs(results['box_recall'] - 1/3) < 0.01 + + +def test_threshold_sensitivity(): + """Test how IoU threshold affects precision/recall.""" + from deepforest import evaluate + + ground_truth = create_test_geodataframe([(0, 0, 10, 10)]) + + # Prediction with moderate overlap (IoU ≈ 0.32) + # Box 1: (0,0,10,10) area = 100 + # Box 2: (3,3,13,13) area = 100 + # Intersection: (3,3,10,10) area = 49 + # Union: 100 + 100 - 49 = 151 + # IoU = 49/151 ≈ 0.32 + predictions = create_test_geodataframe([(3, 3, 13, 13)], scores=[0.9]) + + # At low threshold (0.3), should match + results_low = evaluate.evaluate_boxes( + predictions=predictions, + ground_df=ground_truth, + iou_threshold=0.3 + ) + + # At high threshold (0.5), should not match + results_high = evaluate.evaluate_boxes( + predictions=predictions, + ground_df=ground_truth, + iou_threshold=0.5 + ) + + # Low threshold: should be a match + assert results_low['box_recall'] == 1.0 + assert results_low['box_precision'] == 1.0 + + # High threshold: should not be a match + assert results_high['box_recall'] == 0.0 + assert results_high['box_precision'] == 0.0 + + +def test_large_coordinates(): + """Test IoU calculation with large coordinate values.""" + # Large coordinate values (satellite imagery coordinates) + large_coords = 1000000 + ground_truth = create_test_geodataframe([(large_coords, large_coords, large_coords + 100, large_coords + 100)]) + predictions = create_test_geodataframe([(large_coords + 10, large_coords + 10, large_coords + 110, large_coords + 110)], scores=[0.8]) + + result = IoU.compute_IoU(ground_truth, predictions) + + # Should handle large coordinates properly + assert len(result) == 1 + assert result.iloc[0]['IoU'] > 0 # Should have some 
overlap + assert result.iloc[0]['prediction_id'] is not None + + +def test_small_boxes(): + """Test IoU calculation with very small boxes.""" + # Sub-pixel sized boxes + ground_truth = create_test_geodataframe([(0.1, 0.1, 0.2, 0.2)]) + predictions = create_test_geodataframe([(0.15, 0.15, 0.25, 0.25)], scores=[0.9]) + + result = IoU.compute_IoU(ground_truth, predictions) + + # Should handle small boxes properly + assert len(result) == 1 + assert result.iloc[0]['IoU'] > 0 # Should have some overlap + + +def test_floating_point_precision(): + """Test IoU calculation with high-precision floating point coordinates.""" + # High precision coordinates + ground_truth = create_test_geodataframe([(0.123456789, 0.987654321, 10.123456789, 10.987654321)]) + predictions = create_test_geodataframe([(1.123456789, 1.987654321, 11.123456789, 11.987654321)], scores=[0.7]) + + result = IoU.compute_IoU(ground_truth, predictions) + + # Should handle floating point precision properly + assert len(result) == 1 + assert result.iloc[0]['IoU'] > 0 + assert isinstance(result.iloc[0]['IoU'], (float, np.floating)) + + + +def test_different_score_ranges(): + """Test IoU calculation with different confidence score ranges.""" + ground_truth = create_test_geodataframe([(0, 0, 10, 10)]) + + # Test with different score ranges + score_ranges = [ + [0.1], # Low scores + [0.999], # High scores + [50.0], # Scores > 1 (some models output logits) + [0.0], # Zero score + ] + + for scores in score_ranges: + predictions = create_test_geodataframe([(1, 1, 11, 11)], scores=scores) + result = IoU.compute_IoU(ground_truth, predictions) + + # IoU calculation should be independent of score values + assert len(result) == 1 + assert result.iloc[0]['IoU'] > 0 + assert result.iloc[0]['score'] == scores[0] + + +@pytest.mark.parametrize("box_size", [2, 10, 100, 1000]) # Changed from 1 to 2 to avoid edge case +def test_different_box_sizes(box_size): + """Test IoU calculation across different box sizes.""" + ground_truth = create_test_geodataframe([(0, 0, box_size, box_size)]) + # Overlapping box with 50% overlap + overlap_size = box_size // 2 + predictions = create_test_geodataframe([(overlap_size, overlap_size, box_size + overlap_size, box_size + overlap_size)], scores=[0.8]) + + result = IoU.compute_IoU(ground_truth, predictions) + + # Should have consistent IoU regardless of absolute box size + assert len(result) == 1 + assert result.iloc[0]['IoU'] > 0 + + # Only check expected IoU for sizes where integer division works cleanly + if box_size >= 2: + # For 50% overlap: intersection = (box_size/2)^2, union = 2*box_size^2 - (box_size/2)^2 + expected_iou = (overlap_size ** 2) / (2 * (box_size ** 2) - overlap_size ** 2) + # Allow more tolerance for small boxes due to integer arithmetic + tolerance = 0.1 if box_size < 10 else 0.01 + assert abs(result.iloc[0]['IoU'] - expected_iou) < tolerance diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py deleted file mode 100644 index 8faad95bd..000000000 --- a/tests/test_callbacks.py +++ /dev/null @@ -1,27 +0,0 @@ -# test callbacks -import glob - -from pytorch_lightning.callbacks import ModelCheckpoint - -from deepforest import callbacks - - -def test_log_images(m, tmpdir): - im_callback = callbacks.images_callback(savedir=tmpdir, every_n_epochs=2) - m.create_trainer(callbacks=[im_callback]) - m.trainer.fit(m) - saved_images = glob.glob("{}/*.png".format(tmpdir)) - assert len(saved_images) == 1 - - -def test_create_checkpoint(m, tmpdir): - checkpoint_callback = ModelCheckpoint( - dirpath=tmpdir, - 
save_top_k=1, - monitor="val_classification", - mode="max", - every_n_epochs=1, - ) - m.load_model("weecology/deepforest-tree") - m.create_trainer(callbacks=[checkpoint_callback]) - m.trainer.fit(m) diff --git a/tests/test_callbacks_evaluation.py b/tests/test_callbacks_evaluation.py new file mode 100644 index 000000000..a89a524db --- /dev/null +++ b/tests/test_callbacks_evaluation.py @@ -0,0 +1,226 @@ +# Test evaluation callback +import glob +import json +import os + +import pandas as pd +import pytest + +from deepforest import get_data, main, evaluate +from deepforest.callbacks import EvaluationCallback +from deepforest.utilities import read_file + + +@pytest.fixture(scope="module") +def m(download_release): + """Create a test model with minimal configuration.""" + m = main.deepforest() + m.config.train.csv_file = get_data("example.csv") + m.config.train.root_dir = os.path.dirname(get_data("example.csv")) + m.config.train.fast_dev_run = True + m.config.batch_size = 2 + m.config.validation.csv_file = get_data("example.csv") + m.config.validation.root_dir = os.path.dirname(get_data("example.csv")) + m.config.workers = 0 + m.config.train.epochs = 2 + + m.create_trainer() + m.load_model("weecology/deepforest-tree") + + return m + + +def test_evaluation_callback_save_mode(m, tmpdir): + """Test EvaluationCallback in save mode creates proper CSV and metadata files. + + Verifies that the callback: + - Creates prediction CSV files with evaluation-compatible format + - Creates metadata JSON files with correct structure and content + - Saves predictions with expected columns for evaluation + - Works when built-in evaluation is disabled + """ + eval_callback = EvaluationCallback( + save_dir=tmpdir, + every_n_epochs=1, + run_evaluation=False + ) + + # Disable built-in evaluation + m.config.validation.val_accuracy_interval = -1 + + m.create_trainer(callbacks=[eval_callback], fast_dev_run=False) + m.trainer.fit(m) + + # Check that prediction CSV file was created + csv_files = glob.glob(f"{tmpdir}/predictions_epoch_*.csv") + assert len(csv_files) > 0, "No prediction CSV files found" + + # Check that metadata JSON file was created + json_files = glob.glob(f"{tmpdir}/predictions_epoch_*_metadata.json") + assert len(json_files) > 0, "No metadata JSON files found" + + # Verify CSV file has expected structure + csv_file = csv_files[0] + predictions = pd.read_csv(csv_file) + + # Check that predictions have expected columns + expected_columns = ["xmin", "ymin", "xmax", "ymax", "label", "score", "image_path"] + for col in expected_columns: + assert col in predictions.columns, f"Missing column: {col}" + + # Verify metadata JSON has expected fields + json_file = json_files[0] + with open(json_file, 'r') as f: + metadata = json.load(f) + + expected_keys = ["epoch", "predictions_count", "target_csv_file", "target_root_dir"] + for key in expected_keys: + assert key in metadata, f"Missing metadata key: {key}" + + assert metadata["target_csv_file"] == get_data("example.csv") + + +def test_evaluation_callback_with_evaluation(m, tmpdir): + """Test EvaluationCallback with run_evaluation=True runs and logs evaluation metrics. 
+ + Verifies that when run_evaluation=True, the callback: + - Saves predictions to CSV files + - Runs evaluate_boxes() on the saved predictions + - Logs evaluation metrics to the training logs + - Works alongside the file saving functionality + """ + eval_callback = EvaluationCallback( + save_dir=tmpdir, + every_n_epochs=1, + run_evaluation=True + ) + + # Disable built-in evaluation + m.config.validation.val_accuracy_interval = -1 + + m.create_trainer(callbacks=[eval_callback], fast_dev_run=False) + m.trainer.fit(m) + + # Check that CSV file was created + csv_files = glob.glob(f"{tmpdir}/predictions_epoch_*.csv") + assert len(csv_files) > 0 + + +def test_evaluation_callback_vs_builtin_evaluation(m, tmpdir): + """Test that callback produces identical evaluation results to built-in evaluation. + + Runs both the EvaluationCallback and built-in evaluation simultaneously + and verifies that: + - Both produce the same box_recall and box_precision values + - The saved predictions can be evaluated independently + - Results are consistent between callback and built-in methods + """ + eval_callback = EvaluationCallback( + save_dir=tmpdir, + every_n_epochs=1, + run_evaluation=False + ) + + # Enable built-in evaluation to run at the same time + m.config.validation.val_accuracy_interval = 1 + + m.create_trainer(callbacks=[eval_callback], fast_dev_run=False) + m.trainer.fit(m) + + # Get logged metrics from built-in evaluation + builtin_box_recall = None + builtin_box_precision = None + + for logger in m.trainer.loggers: + if hasattr(logger, 'metrics'): + metrics = logger.metrics + if 'box_recall' in metrics: + builtin_box_recall = metrics['box_recall'] + if 'box_precision' in metrics: + builtin_box_precision = metrics['box_precision'] + + # Load callback predictions and evaluate them + csv_files = glob.glob(f"{tmpdir}/predictions_epoch_*.csv") + assert len(csv_files) > 0 + + callback_predictions = pd.read_csv(csv_files[0]) + ground_truth = read_file(get_data("example.csv")) + + # Run evaluation on callback predictions + callback_results = evaluate.evaluate_boxes( + predictions=callback_predictions, + ground_df=ground_truth, + iou_threshold=0.4 + ) + + # Compare metrics (allowing for small floating point differences) + if builtin_box_recall is not None: + assert abs(callback_results["box_recall"] - builtin_box_recall) < 0.01 + if builtin_box_precision is not None: + assert abs(callback_results["box_precision"] - builtin_box_precision) < 0.01 + + +def test_evaluation_callback_disabled(m, tmpdir): + """Test that callback is properly disabled when every_n_epochs=-1. + + Verifies that when every_n_epochs=-1: + - No prediction CSV files are created + - No metadata JSON files are created + - The callback gracefully skips all processing + - Training completes normally without callback interference + """ + eval_callback = EvaluationCallback( + save_dir=tmpdir, + every_n_epochs=-1, # Disabled + run_evaluation=False + ) + + m.create_trainer(callbacks=[eval_callback], fast_dev_run=False) + m.trainer.fit(m) + + # Check that no files were created + csv_files = glob.glob(f"{tmpdir}/predictions_epoch_*.csv") + json_files = glob.glob(f"{tmpdir}/predictions_epoch_*_metadata.json") + + assert len(csv_files) == 0, "CSV files created when callback should be disabled" + assert len(json_files) == 0, "JSON files created when callback should be disabled" + + +def test_evaluation_callback_empty_predictions(m, tmpdir): + """Test callback handles edge case of zero predictions gracefully. 
+ + Uses a very high score threshold to ensure no predictions are made, + then verifies that: + - Metadata JSON files are still created + - predictions_count is correctly set to 0 + - The callback doesn't crash or produce errors + - File structure is maintained even without predictions + """ + # Set very high score threshold to get no predictions + original_score_thresh = m.model.score_thresh + m.model.score_thresh = 0.999 + + eval_callback = EvaluationCallback( + save_dir=tmpdir, + every_n_epochs=1, + run_evaluation=False + ) + + # Disable built-in evaluation + m.config.validation.val_accuracy_interval = -1 + + m.create_trainer(callbacks=[eval_callback], fast_dev_run=False) + m.trainer.fit(m) + + # Restore original score threshold + m.model.score_thresh = original_score_thresh + + # Check that files are still created even with no predictions + json_files = glob.glob(f"{tmpdir}/predictions_epoch_*_metadata.json") + assert len(json_files) > 0, "Metadata JSON should be created even with no predictions" + + # Check metadata shows 0 predictions + with open(json_files[0], 'r') as f: + metadata = json.load(f) + + assert metadata["predictions_count"] == 0 diff --git a/tests/test_callbacks_image.py b/tests/test_callbacks_image.py new file mode 100644 index 000000000..18bd6fd16 --- /dev/null +++ b/tests/test_callbacks_image.py @@ -0,0 +1,166 @@ +# test callbacks +import glob +import os +from unittest.mock import MagicMock, Mock + +import pytest +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.loggers.logger import DummyLogger + +from deepforest import get_data +from deepforest import callbacks, main + +class MockCometLogger(DummyLogger): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.setup() + + def setup(self): + probe = Mock(spec_set=("log_image",)) + probe.log_image = MagicMock(name="log_image") + self._experiment = probe + + @property + def experiment(self): + return self._experiment + + +class MockTBLogger(MockCometLogger): + def setup(self): + probe = Mock(spec_set=("add_image",)) + probe.add_image = MagicMock(name="add_image") + self._experiment = probe + +@pytest.fixture(scope="module") +def m(download_release): + m = main.deepforest() + m.config.train.csv_file = get_data("example.csv") + m.config.train.root_dir = os.path.dirname(get_data("example.csv")) + m.config.train.fast_dev_run = True + m.config.batch_size = 2 + m.config.validation.csv_file = get_data("example.csv") + m.config.validation.root_dir = os.path.dirname(get_data("example.csv")) + m.config.workers = 0 + m.config.validation.val_accuracy_interval = 1 + m.config.train.epochs = 2 + + m.create_trainer() + m.load_model("weecology/deepforest-tree") + + return m + + +def test_log_images_dummy_comet(m, tmpdir): + """Test for Comet-style loggers with log_image method""" + logger = MockCometLogger() + im_callback = callbacks.ImagesCallback(save_dir=tmpdir, + every_n_epochs=1, + sample_batches=1, + dataset_samples=1) + + m.create_trainer(callbacks=[im_callback], logger=logger, fast_dev_run=False) + m.trainer.fit(m) + + # Expect 1 from each dataset, 1 from prediction + assert logger.experiment.log_image.call_count >= 3 + +def test_log_images_dummy_tb(m, tmpdir): + """Test for Tensorboard-style loggers with add_image method""" + logger = MockTBLogger() + im_callback = callbacks.ImagesCallback(save_dir=tmpdir, + every_n_epochs=1, + sample_batches=1, + dataset_samples=1) + + m.create_trainer(callbacks=[im_callback], logger=logger, fast_dev_run = False) + m.trainer.fit(m) + + 
# Expect 1 from each dataset, 1 from prediction + assert logger.experiment.add_image.call_count >= 3 + + +def test_log_images_dummy_both(m, tmpdir): + """Test for the correct logger precedence is respected (TB > Comet).""" + comet = MockCometLogger() + tensorboard = MockTBLogger() + loggers = [comet, tensorboard] + im_callback = callbacks.ImagesCallback(save_dir=tmpdir, + every_n_epochs=1, + sample_batches=1, + dataset_samples=1) + + m.create_trainer(callbacks=[im_callback], logger=loggers, fast_dev_run = False) + m.trainer.fit(m) + + # Expect 1 from each dataset, 1 from prediction + assert tensorboard.experiment.add_image.call_count >= 3 + assert comet.experiment.log_image.call_count == 0 + +def test_log_images_file(m, tmpdir): + im_callback = callbacks.ImagesCallback(save_dir=tmpdir, every_n_epochs=1, sample_batches=5) + + m.create_trainer(callbacks=[im_callback], fast_dev_run = False) + m.trainer.fit(m) + + assert os.path.exists(os.path.join(tmpdir, "predictions")) + saved_images = glob.glob("{}/predictions/*.png".format(tmpdir)) + assert len(saved_images) == m.current_epoch*min(len(m.val_dataloader().dataset), im_callback.sample_batches) + saved_meta = glob.glob("{}/predictions/*.json".format(tmpdir)) + assert len(saved_meta) == m.current_epoch*min(len(m.val_dataloader().dataset), im_callback.sample_batches) + + assert os.path.exists(os.path.join(tmpdir, "train_sample")) + train_images = glob.glob("{}/train_sample/*".format(tmpdir)) + assert len(train_images) == min(len(m.train_dataloader().dataset), im_callback.dataset_samples) + + assert os.path.exists(os.path.join(tmpdir, "validation_sample")) + val_images = glob.glob("{}/validation_sample/*".format(tmpdir)) + assert len(val_images) == min(len(m.val_dataloader().dataset), im_callback.dataset_samples) + +def test_log_images_fast(m, tmpdir): + """Test that no images are logged if fast_dev_run is active""" + im_callback = callbacks.ImagesCallback(save_dir=tmpdir, every_n_epochs=1) + + m.create_trainer(callbacks=[im_callback], fast_dev_run=True) + m.trainer.fit(m) + + assert not os.path.exists(os.path.join(tmpdir, "predictions")) + assert not os.path.exists(os.path.join(tmpdir, "train_sample")) + assert not os.path.exists(os.path.join(tmpdir, "validation_sample")) + +def test_log_images_no_pred(m, tmpdir): + """Test disabling prediction logging""" + im_callback = callbacks.ImagesCallback(save_dir=tmpdir, sample_batches=0, every_n_epochs=1) + + m.create_trainer(callbacks=[im_callback], fast_dev_run=False) + m.trainer.fit(m) + + assert not os.path.exists(os.path.join(tmpdir, "predictions")) + assert os.path.exists(os.path.join(tmpdir, "train_sample")) + assert os.path.exists(os.path.join(tmpdir, "validation_sample")) + +def test_log_images_no_dataset(m, tmpdir): + """Test disabling dataset sample logging""" + im_callback = callbacks.ImagesCallback(save_dir=tmpdir, dataset_samples=0, every_n_epochs=1) + + m.create_trainer(callbacks=[im_callback], fast_dev_run=False) + m.trainer.fit(m) + + assert os.path.exists(os.path.join(tmpdir, "predictions")) + assert not os.path.exists(os.path.join(tmpdir, "train_sample")) + assert not os.path.exists(os.path.join(tmpdir, "validation_sample")) + +def test_create_checkpoint(m, tmpdir): + """Test checkpoint creation""" + checkpoint_callback = ModelCheckpoint( + filename='model', + dirpath=tmpdir, + save_top_k=1, + monitor="val_classification", + mode="max", + every_n_epochs=1, + ) + m.load_model("weecology/deepforest-tree") + m.create_trainer(callbacks=[checkpoint_callback], fast_dev_run=False) + 
+    m.trainer.fit(m)
+
+    assert os.path.exists(os.path.join(tmpdir, 'model.ckpt'))
diff --git a/tests/test_conditional_detr.py b/tests/test_conditional_detr.py
new file mode 100644
index 000000000..6fcc42f37
--- /dev/null
+++ b/tests/test_conditional_detr.py
@@ -0,0 +1,177 @@
+# test Transformers/conditional_detr
+import os
+
+import numpy as np
+import pytest
+import torch
+from PIL import Image
+
+from deepforest import get_data
+from deepforest import utilities
+from deepforest.datasets.training import BoxDataset
+from deepforest.models import ConditionalDetr
+
+
+@pytest.fixture()
+def config():
+    config = utilities.load_config(overrides={"architecture": "ConditionalDetr"})
+    config.model.name = "microsoft/conditional-detr-resnet-50"
+    config.train.fast_dev_run = True
+    config.batch_size = 1
+    config.score_thresh = 0.5
+    return config
+
+
+@pytest.fixture()
+def coco_sample():
+    """
+    Dummy sample that conforms to the MS-COCO format
+    """
+    images = [torch.rand((3, 100, 100), dtype=torch.float32)]
+    target = {
+        "labels": torch.zeros(0, dtype=torch.int64),
+        "image_id": 4,
+        "annotations": [{
+            "id": 0,
+            "image_id": 4,
+            "category_id": 0,
+            "bbox": [0, 0, 10, 10],
+            "area": 10,
+            "iscrowd": 0,
+        }]
+    }
+
+    targets = [target]
+    return images, targets
+
+
+def test_check_model(config):
+    model = ConditionalDetr.Model(config)
+    model.check_model()
+
+
+# num_classes=2 is omitted from the parametrize list below: it currently
+# fails due to a bug in the transformers from_pretrained logic, which is
+# fixed in transformers-4.53.0.
+@pytest.mark.parametrize("num_classes", [1, 5, 10])
+def test_create_model(config, num_classes):
+    """
+    Test that we can instantiate a model with differing numbers
+    of classes and that we can pass images through.
+    """
+    config.num_classes = num_classes
+    config.label_dict = {f"{i}": i for i in range(num_classes)}
+    detr_model = ConditionalDetr.Model(config).create_model()
+    detr_model.eval()
+    x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
+    _ = detr_model(x)
+
+    assert detr_model.label_dict == config.label_dict
+
+
+def test_boxes_in_output(config):
+    """
+    Test that a reference input image yields predictions that
+    include boxes, scores and labels. This model should have
+    trained weights.
+    """
+    detr_model = ConditionalDetr.Model(config).create_model(config.model.name, revision=config.model.revision)
+    detr_model.eval()
+
+    image_path = get_data("OSBS_029.png")
+
+    # Passing a numpy array (or tensor) should work:
+    result = detr_model(np.array(Image.open(image_path)))
+
+    for r in result:
+        assert "boxes" in r
+        assert "scores" in r
+        assert "labels" in r
+
+    # Passing a list is also allowed:
+    result = detr_model([np.array(Image.open(image_path))])
+
+    for r in result:
+        assert "boxes" in r
+        assert "scores" in r
+        assert "labels" in r
+
+
+def test_forward_sample_dummy(config, coco_sample):
+    """
+    Test that in training mode, we get a loss dict and it's
+    non-zero.
+    """
+    detr_model = ConditionalDetr.Model(config).create_model()
+    detr_model.train()
+
+    image, targets = coco_sample
+    loss_dict = detr_model(image, targets, prepare_targets=False)
+
+    # Assert non-zero loss
+    assert sum([loss for loss in loss_dict.values()]) > 0
+
+
+def test_training_sample(config):
+    """
+    Confirm integration between the training BoxDataset and
+    the model.
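+    The dataset yields (image, targets, path) tuples; in train mode the
+    model should accept them directly and return a loss dict.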
+ """ + csv_file = get_data("example.csv") + root_dir = os.path.dirname(csv_file) + ds = BoxDataset(csv_file=csv_file, root_dir=root_dir) + + image, targets, _ = next(iter(ds)) + + detr_model = ConditionalDetr.Model(config).create_model() + detr_model.train() + + loss_dict = detr_model(image, targets) + + # Assert non-zero loss + assert sum([loss for loss in loss_dict.values()]) > 0 + + +def test_prepare_targets_bbox_conversion(config): + """ + Test that _prepare_targets correctly converts bounding boxes from + [xmin, ymin, xmax, ymax] format to COCO format [x, y, width, height]. + """ + detr_model = ConditionalDetr.Model(config).create_model() + + # Create test targets in [xmin, ymin, xmax, ymax] format + test_targets = [ + { + "boxes": torch.tensor([ + [10.0, 20.0, 50.0, 80.0], # Box 1: xmin=10, ymin=20, xmax=50, ymax=80 + [100.0, 150.0, 200.0, 250.0], # Box 2: xmin=100, ymin=150, xmax=200, ymax=250 + ]), + "labels": torch.tensor([0, 0], dtype=torch.int64), + } + ] + + # Call _prepare_targets + coco_targets = detr_model._prepare_targets(test_targets) + + # Verify output structure + assert len(coco_targets) == 1 + assert "annotations" in coco_targets[0] + assert len(coco_targets[0]["annotations"]) == 2 + + # Verify Box 1 conversion: [10, 20, 50, 80] -> [10, 20, 40, 60] + box1 = coco_targets[0]["annotations"][0] + assert box1["bbox"] == [10.0, 20.0, 40.0, 60.0], \ + f"Expected [10.0, 20.0, 40.0, 60.0] but got {box1['bbox']}" + assert box1["area"] == 2400.0, \ + f"Expected area 2400.0 but got {box1['area']}" + assert box1["category_id"] == 0 + assert box1["iscrowd"] == 0 + + # Verify Box 2 conversion: [100, 150, 200, 250] -> [100, 150, 100, 100] + box2 = coco_targets[0]["annotations"][1] + assert box2["bbox"] == [100.0, 150.0, 100.0, 100.0], \ + f"Expected [100.0, 150.0, 100.0, 100.0] but got {box2['bbox']}" + assert box2["area"] == 10000.0, \ + f"Expected area 10000.0 but got {box2['area']}" + assert box2["category_id"] == 0 + assert box2["iscrowd"] == 0 diff --git a/tests/test_datasets_training.py b/tests/test_datasets_training.py index 43ae9a28d..f6bad0593 100644 --- a/tests/test_datasets_training.py +++ b/tests/test_datasets_training.py @@ -158,7 +158,7 @@ def test_multi_image_warning(): def test_label_validation__training_csv(): """Test training CSV labels are validated against label_dict""" - m = main.deepforest(config_args={"num_classes": 1}, label_dict={"Bird": 0}) + m = main.deepforest(config_args={"num_classes": 1, "label_dict": {"Bird": 0}, "train": {"check_annotations": True}}) m.config.train.csv_file = get_data("example.csv") # contains 'Tree' label m.config.train.root_dir = os.path.dirname(get_data("example.csv")) m.create_trainer() @@ -169,14 +169,15 @@ def test_label_validation__training_csv(): def test_csv_label_validation__validation_csv(m): """Test validation CSV labels are validated against label_dict""" - m = main.deepforest(config_args={"num_classes": 1}, label_dict={"Tree": 0}) + m = main.deepforest(config_args={"num_classes": 1, "label_dict": {'Tree': 0}, "train": {"check_annotations": True}}) m.config.train.csv_file = get_data("example.csv") # contains 'Tree' label m.config.train.root_dir = os.path.dirname(get_data("example.csv")) m.config.validation.csv_file = get_data("testfile_multi.csv") # contains 'Dead', 'Alive' labels m.config.validation.root_dir = os.path.dirname(get_data("testfile_multi.csv")) m.create_trainer() - with pytest.raises(ValueError, match="Labels \\['Dead', 'Alive'\\] are missing from label_dict"): + + with pytest.raises(ValueError, 
+                           match="Labels \\['Alive', 'Dead'\\] are missing from label_dict"):
         m.trainer.fit(m)
 
 
@@ -188,9 +189,9 @@ def test_BoxDataset_validate_labels():
     root_dir = os.path.dirname(csv_file)
 
     # Valid case: CSV labels are in label_dict
-    ds = BoxDataset(csv_file=csv_file, root_dir=root_dir, label_dict={"Tree": 0})
+    _ = BoxDataset(csv_file=csv_file, root_dir=root_dir, label_dict={"Tree": 0}, check_annotations=True)
     # Should not raise an error
 
     # Invalid case: CSV labels are not in label_dict
     with pytest.raises(ValueError, match="Labels \\['Tree'\\] are missing from label_dict"):
-        BoxDataset(csv_file=csv_file, root_dir=root_dir, label_dict={"Bird": 0})
+        BoxDataset(csv_file=csv_file, root_dir=root_dir, label_dict={"Bird": 0}, check_annotations=True)
diff --git a/tests/test_deformable_detr.py b/tests/test_deformable_detr.py
new file mode 100644
index 000000000..eb466d3b6
--- /dev/null
+++ b/tests/test_deformable_detr.py
@@ -0,0 +1,178 @@
+# test Transformers/deformable_detr
+import os
+
+import numpy as np
+import pytest
+import torch
+from PIL import Image
+
+from deepforest import get_data
+from deepforest import utilities
+from deepforest.datasets.training import BoxDataset
+from deepforest.models import DeformableDetr
+
+
+@pytest.fixture()
+def config():
+    config = utilities.load_config()
+    config.model.name = "joshvm/milliontrees-detr"
+    config.architecture = "DeformableDetr"
+    config.train.fast_dev_run = True
+    config.batch_size = 1
+    config.score_thresh = 0.5
+    return config
+
+
+@pytest.fixture()
+def coco_sample():
+    """
+    Dummy sample that conforms to the MS-COCO format
+    """
+    images = [torch.rand((3, 100, 100), dtype=torch.float32)]
+    target = {
+        "labels": torch.zeros(0, dtype=torch.int64),
+        "image_id": 4,
+        "annotations": [{
+            "id": 0,
+            "image_id": 4,
+            "category_id": 0,
+            "bbox": [0, 0, 10, 10],
+            "area": 10,
+            "iscrowd": 0,
+        }]
+    }
+
+    targets = [target]
+    return images, targets
+
+
+def test_check_model(config):
+    model = DeformableDetr.Model(config)
+    model.check_model()
+
+
+# num_classes=2 is omitted from the parametrize list below: it currently
+# fails due to a bug in the transformers from_pretrained logic, which is
+# fixed in transformers-4.53.0.
+@pytest.mark.parametrize("num_classes", [1, 5, 10])
+def test_create_model(config, num_classes):
+    """
+    Test that we can instantiate a model with differing numbers
+    of classes and that we can pass images through.
+    """
+    config.num_classes = num_classes
+    config.label_dict = {f"{i}": i for i in range(num_classes)}
+    detr_model = DeformableDetr.Model(config).create_model()
+    detr_model.eval()
+    x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
+    _ = detr_model(x)
+
+    assert detr_model.label_dict == config.label_dict
+
+
+def test_boxes_in_output(config):
+    """
+    Test that a reference input image yields predictions that
+    include boxes, scores and labels. This model should have
+    trained weights.
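+    Both a bare numpy array and a single-element list are passed below
+    to confirm either input form is accepted.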
+ """ + detr_model = DeformableDetr.Model(config).create_model(config.model.name, revision=config.model.revision) + detr_model.eval() + + image_path = get_data("OSBS_029.png") + + # Passing a numpy array (or tensor) should work: + result = detr_model(np.array(Image.open(image_path))) + + for r in result: + assert "boxes" in r + assert "scores" in r + assert "labels" in r + + # Passing a list is also allowed: + result = detr_model([np.array(Image.open(image_path))]) + + for r in result: + assert "boxes" in r + assert "scores" in r + assert "labels" in r + + +def test_forward_sample_dummy(config, coco_sample): + """ + Test that in training mode, we get a loss dict and it's + non-zero. + """ + detr_model = DeformableDetr.Model(config).create_model() + detr_model.train() + + image, targets = coco_sample + loss_dict = detr_model(image, targets, prepare_targets=False) + + # Assert non-zero loss + assert sum([loss for loss in loss_dict.values()]) > 0 + + +def test_training_sample(config): + """ + Confirm integration between the training BoxDataset and + the model. + """ + csv_file = get_data("example.csv") + root_dir = os.path.dirname(csv_file) + ds = BoxDataset(csv_file=csv_file, root_dir=root_dir) + + image, targets, _ = next(iter(ds)) + + detr_model = DeformableDetr.Model(config).create_model() + detr_model.train() + + loss_dict = detr_model(image, targets) + + # Assert non-zero loss + assert sum([loss for loss in loss_dict.values()]) > 0 + + +def test_prepare_targets_bbox_conversion(config): + """ + Test that _prepare_targets correctly converts bounding boxes from + [xmin, ymin, xmax, ymax] format to COCO format [x, y, width, height]. + """ + detr_model = DeformableDetr.Model(config).create_model() + + # Create test targets in [xmin, ymin, xmax, ymax] format + test_targets = [ + { + "boxes": torch.tensor([ + [10.0, 20.0, 50.0, 80.0], # Box 1: xmin=10, ymin=20, xmax=50, ymax=80 + [100.0, 150.0, 200.0, 250.0], # Box 2: xmin=100, ymin=150, xmax=200, ymax=250 + ]), + "labels": torch.tensor([0, 0], dtype=torch.int64), + } + ] + + # Call _prepare_targets + coco_targets = detr_model._prepare_targets(test_targets) + + # Verify output structure + assert len(coco_targets) == 1 + assert "annotations" in coco_targets[0] + assert len(coco_targets[0]["annotations"]) == 2 + + # Verify Box 1 conversion: [10, 20, 50, 80] -> [10, 20, 40, 60] + box1 = coco_targets[0]["annotations"][0] + assert box1["bbox"] == [10.0, 20.0, 40.0, 60.0], \ + f"Expected [10.0, 20.0, 40.0, 60.0] but got {box1['bbox']}" + assert box1["area"] == 2400.0, \ + f"Expected area 2400.0 but got {box1['area']}" + assert box1["category_id"] == 0 + assert box1["iscrowd"] == 0 + + # Verify Box 2 conversion: [100, 150, 200, 250] -> [100, 150, 100, 100] + box2 = coco_targets[0]["annotations"][1] + assert box2["bbox"] == [100.0, 150.0, 100.0, 100.0], \ + f"Expected [100.0, 150.0, 100.0, 100.0] but got {box2['bbox']}" + assert box2["area"] == 10000.0, \ + f"Expected area 10000.0 but got {box2['area']}" + assert box2["category_id"] == 0 + assert box2["iscrowd"] == 0 diff --git a/tests/test_detr.py b/tests/test_detr.py index adb4b1517..8a5a54b00 100644 --- a/tests/test_detr.py +++ b/tests/test_detr.py @@ -1,4 +1,4 @@ -# test Transformers/detr +# test Transformers/detr (standard DETR) import os import numpy as np @@ -9,14 +9,13 @@ from deepforest import get_data from deepforest import utilities from deepforest.datasets.training import BoxDataset -from deepforest.models import DeformableDetr +from deepforest.models import Detr @pytest.fixture() def 
config(): - config = utilities.load_config() - config.model.name = "joshvm/milliontrees-detr" - config.architecture = "DeformableDetr" + config = utilities.load_config(overrides={"architecture": "Detr"}) + config.model.name = "facebook/detr-resnet-50" config.train.fast_dev_run = True config.batch_size = 1 config.score_thresh = 0.5 @@ -47,7 +46,7 @@ def coco_sample(): def test_check_model(config): - model = DeformableDetr.Model(config) + model = Detr.Model(config) model.check_model() @@ -62,7 +61,7 @@ def test_create_model(config, num_classes): """ config.num_classes = num_classes config.label_dict = {f"{i}": i for i in range(num_classes)} - detr_model = DeformableDetr.Model(config).create_model() + detr_model = Detr.Model(config).create_model() detr_model.eval() x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)] _ = detr_model(x) @@ -76,7 +75,7 @@ def test_boxes_in_output(config): include boxes, scores and labels. This model should have trained weights. """ - detr_model = DeformableDetr.Model(config).create_model(config.model.name, revision=config.model.revision) + detr_model = Detr.Model(config).create_model(config.model.name, revision=config.model.revision) detr_model.eval() image_path = get_data("OSBS_029.png") @@ -103,7 +102,7 @@ def test_forward_sample_dummy(config, coco_sample): Test that in training mode, we get a loss dict and it's non-zero. """ - detr_model = DeformableDetr.Model(config).create_model() + detr_model = Detr.Model(config).create_model() detr_model.train() image, targets = coco_sample @@ -124,10 +123,55 @@ def test_training_sample(config): image, targets, _ = next(iter(ds)) - detr_model = DeformableDetr.Model(config).create_model() + detr_model = Detr.Model(config).create_model() detr_model.train() loss_dict = detr_model(image, targets) # Assert non-zero loss assert sum([loss for loss in loss_dict.values()]) > 0 + + +def test_prepare_targets_bbox_conversion(config): + """ + Test that _prepare_targets correctly converts bounding boxes from + [xmin, ymin, xmax, ymax] format to COCO format [x, y, width, height]. 
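+    Width and height should equal (xmax - xmin) and (ymax - ymin), with
+    area computed as width * height.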
+ """ + detr_model = Detr.Model(config).create_model() + + # Create test targets in [xmin, ymin, xmax, ymax] format + test_targets = [ + { + "boxes": torch.tensor([ + [10.0, 20.0, 50.0, 80.0], # Box 1: xmin=10, ymin=20, xmax=50, ymax=80 + [100.0, 150.0, 200.0, 250.0], # Box 2: xmin=100, ymin=150, xmax=200, ymax=250 + ]), + "labels": torch.tensor([0, 0], dtype=torch.int64), + } + ] + + # Call _prepare_targets + coco_targets = detr_model._prepare_targets(test_targets) + + # Verify output structure + assert len(coco_targets) == 1 + assert "annotations" in coco_targets[0] + assert len(coco_targets[0]["annotations"]) == 2 + + # Verify Box 1 conversion: [10, 20, 50, 80] -> [10, 20, 40, 60] + box1 = coco_targets[0]["annotations"][0] + assert box1["bbox"] == [10.0, 20.0, 40.0, 60.0], \ + f"Expected [10.0, 20.0, 40.0, 60.0] but got {box1['bbox']}" + assert box1["area"] == 2400.0, \ + f"Expected area 2400.0 but got {box1['area']}" + assert box1["category_id"] == 0 + assert box1["iscrowd"] == 0 + + # Verify Box 2 conversion: [100, 150, 200, 250] -> [100, 150, 100, 100] + box2 = coco_targets[0]["annotations"][1] + assert box2["bbox"] == [100.0, 150.0, 100.0, 100.0], \ + f"Expected [100.0, 150.0, 100.0, 100.0] but got {box2['bbox']}" + assert box2["area"] == 10000.0, \ + f"Expected area 10000.0 but got {box2['area']}" + assert box2["category_id"] == 0 + assert box2["iscrowd"] == 0 diff --git a/tests/test_main.py b/tests/test_main.py index 3842f3522..35d01cfe1 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -15,6 +15,7 @@ from deepforest.utilities import read_file, format_geometry from deepforest.datasets import prediction from deepforest.visualize import plot_results +from deepforest.callbacks import EvaluationCallback from pytorch_lightning import Trainer from pytorch_lightning.callbacks import Callback @@ -34,11 +35,11 @@ @pytest.fixture() def two_class_m(): - m = main.deepforest(config_args={"num_classes": 2}, - label_dict={ + m = main.deepforest(config_args={"num_classes": 2, "label_dict": { "Alive": 0, "Dead": 1 - }) + }}, + ) m.config.train.csv_file = get_data("testfile_multi.csv") m.config.train.root_dir = os.path.dirname(get_data("testfile_multi.csv")) m.config.train.fast_dev_run = True @@ -149,7 +150,7 @@ def test_tensorboard_logger(m, tmpdir): m.config.validation.val_accuracy_interval = 1 m.config.train.epochs = 2 - m.create_trainer(logger=logger, limit_train_batches=1, limit_val_batches=1) + m.create_trainer(logger=logger, callbacks=[EvaluationCallback(every_n_epochs=1, run_evaluation=True)], limit_train_batches=1, limit_val_batches=1) m.trainer.fit(m) assert m.trainer.logged_metrics["box_precision"] @@ -974,8 +975,14 @@ def test_epoch_evaluation_end(m, tmpdir): m.config.validation.csv_file = tmpdir.strpath + "/predictions.csv" m.config.validation.root_dir = tmpdir.strpath - results = m.on_validation_epoch_end() + predictions_df = m.on_validation_epoch_end() + # Verify we got predictions back + assert not predictions_df.empty + assert len(predictions_df) == 2 # Should have 2 predictions + + # Now run evaluation explicitly to test that evaluation works + results = m.evaluate(csv_file=tmpdir.strpath + "/predictions.csv", predictions=predictions_df) assert results["box_precision"] == 1.0 assert results["box_recall"] == 1.0 @@ -1010,12 +1017,13 @@ def test_empty_frame_accuracy_all_empty_with_predictions(m, tmpdir): m.config.validation["csv_file"] = tmpdir.strpath + "/ground_truth.csv" m.config.validation["root_dir"] = os.path.dirname(get_data("testfile_deepforest.csv")) - 
m.create_trainer()
-    results = m.trainer.validate(m)
+    # Use evaluate() to get both metrics since this test needs box_precision
+    results = m.evaluate(csv_file=tmpdir.strpath + "/ground_truth.csv",
+                         root_dir=os.path.dirname(get_data("testfile_deepforest.csv")))
 
     # This is a bit of a preference: if there are no predictions, the empty frame accuracy should be 0, precision is 0, and accuracy is None.
-    assert results[0]["empty_frame_accuracy"] == 0.0
-    assert results[0]["box_precision"] == 0.0
+    assert results["empty_frame_accuracy"] == 0.0
+    assert results["box_precision"] == 0.0
 
 def test_empty_frame_accuracy_mixed_frames_with_predictions(m, tmpdir):
     """Test empty frame accuracy with a mix of empty and non-empty frames.
@@ -1040,8 +1048,10 @@ def test_empty_frame_accuracy_mixed_frames_with_predictions(m, tmpdir):
     m.config.validation.size = 400
 
     m.create_trainer()
-    results = m.trainer.validate(m)
-    assert results[0]["empty_frame_accuracy"] == 0
+    predictions = m.trainer.validate(m)
+    accuracy = m.calculate_empty_frame_accuracy(ground_df, predictions)
+
+    assert accuracy == 0
 
 def test_empty_frame_accuracy_without_predictions(m_without_release, tmpdir):
     """Create a ground truth with empty frames, the accuracy should be 1 with a random model"""
@@ -1059,8 +1069,10 @@ def test_empty_frame_accuracy_without_predictions(m_without_release, tmpdir):
     m.config.validation["root_dir"] = os.path.dirname(get_data("testfile_deepforest.csv"))
 
     m.create_trainer()
-    results = m.trainer.validate(m)
-    assert results[0]["empty_frame_accuracy"] == 1
+    predictions = m.trainer.validate(m)
+    accuracy = m.calculate_empty_frame_accuracy(ground_df, predictions)
+
+    assert accuracy == 1
 
 def test_multi_class_with_empty_frame_accuracy_without_predictions(two_class_m, tmpdir):
     """Create a ground truth with empty frames, the accuracy should be 1 with a random model"""
@@ -1082,12 +1094,17 @@ def test_multi_class_with_empty_frame_accuracy_without_predictions(two_class_m,
     two_class_m.create_trainer()
-    results = two_class_m.trainer.validate(two_class_m)
-    assert results[0]["empty_frame_accuracy"] == 1
+    predictions = two_class_m.trainer.validate(two_class_m)
+    accuracy = two_class_m.calculate_empty_frame_accuracy(ground_df, predictions)
+
+    assert accuracy == 1
 
 def test_evaluate_on_epoch_interval(m):
     m.config.validation.val_accuracy_interval = 1
     m.config.train.epochs = 1
-    m.create_trainer()
+
+    m.create_trainer(callbacks=[EvaluationCallback(every_n_epochs=1, run_evaluation=True)], fast_dev_run=False)
     m.trainer.fit(m)
     assert m.trainer.logged_metrics["box_precision"]
     assert m.trainer.logged_metrics["box_recall"]
@@ -1108,20 +1125,3 @@ def test_set_labels_invalid_length(m):
     # Expect a ValueError when setting an invalid mapping
     invalid_mapping = {"Object": 0, "Extra": 1}
     with pytest.raises(ValueError):
         m.set_labels(invalid_mapping)
-
-def test_on_train_start_basic(m):
-    """Test that on_train_start runs without error and logs images using the default logger."""
-    # Create a mock logger
-    class MockLogger:
-        def __init__(self):
-            self.experiment = Mock()
-            self.experiment.log_image = self.log_image
-            self.images = []
-
-        def log_image(self, image, metadata):
-            self.images.append(image)
-
-    m.create_trainer(fast_dev_run=False, limit_train_batches=2, limit_val_batches=2, logger=MockLogger())
-    m.on_train_start()
-
-    assert len(m.logger.images) == 2
diff --git a/tests/test_retinanet.py b/tests/test_retinanet.py
index a63bfc971..6e3908414 100644
--- a/tests/test_retinanet.py
+++ b/tests/test_retinanet.py
@@ -24,10 +24,45 @@ def _make_empty_sample():
     targets = [negative_target]
    return images, targets
 
+@pytest.mark.parametrize("model_name", [None, "dinov3"])
+def test_retinanet_inference(config, model_name):
+    config.model.name = model_name
+    x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
+    retinanet_model = retinanet.Model(config).create_model()
+    retinanet_model.eval()
-def test_retinanet(config):
+    # Expect output to be a list for batched input, each
+    # output should have a box, score and label key.
+    with torch.no_grad():
+        predictions = retinanet_model(x)
+    assert isinstance(predictions, list)
+    for pred in predictions:
+        assert "boxes" in pred
+        assert "scores" in pred
+        assert "labels" in pred
+
+@pytest.mark.parametrize("model_name", [None, "dinov3"])
+def test_retinanet_train(config, model_name):
+    config.model.name = model_name
-    r = retinanet.Model(config)
-    assert r
+    x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
+    targets = [{"boxes": torch.tensor([[0, 0, 50, 50], [25, 25, 90, 90]]),
+                "labels": torch.tensor([0, 0]).long()},
+               {"boxes": torch.tensor([[10, 50, 30, 80], [100, 100, 200, 200]]),
+                "labels": torch.tensor([0, 0]).long()}]
+
+    retinanet_model = retinanet.Model(config).create_model()
+    retinanet_model.train()
+
+    # Expect output to be a dictionary of loss values for the batch
+    # for bbox regression and classification
+    loss_dict = retinanet_model(x, targets)
+    assert isinstance(loss_dict, dict)
+    assert "bbox_regression" in loss_dict
+    assert "classification" in loss_dict
+    assert loss_dict["bbox_regression"] > 0
+    assert loss_dict["classification"] > 0
 
 def retinanet_check_model(config):