diff --git a/sygra/core/eval/metrics/aggregator_metrics/pass_at_k.py b/sygra/core/eval/metrics/aggregator_metrics/pass_at_k.py
new file mode 100644
index 00000000..a2877832
--- /dev/null
+++ b/sygra/core/eval/metrics/aggregator_metrics/pass_at_k.py
@@ -0,0 +1,126 @@
+"""
+Pass@k Metrics
+"""
+
+import math
+from typing import Any, Dict, List
+
+from pydantic import BaseModel, Field, field_validator
+
+from sygra.core.eval.metrics.aggregator_metrics.aggregator_metric_registry import aggregator_metric
+from sygra.core.eval.metrics.aggregator_metrics.base_aggregator_metric import BaseAggregatorMetric
+from sygra.core.eval.metrics.base_metric_metadata import BaseMetricMetadata
+from sygra.core.eval.metrics.unit_metrics.unit_metric_result import UnitMetricResult
+from sygra.logger.logger_config import logger
+
+
+class PassAtKMetricConfig(BaseModel):
+    """Configuration for PassAtK Metric"""
+
+    k: int = Field(..., description="Number of samples to draw")
+
+    @field_validator("k")
+    @classmethod
+    def validate_k(cls, v):
+        if v is None or v <= 0:
+            raise ValueError(
+                "value of k is required and must be positive (cannot be None, zero, or negative)"
+            )
+        return v
+
+
+@aggregator_metric("pass@k")
+class PassAtKMetric(BaseAggregatorMetric):
+    """Calculate pass@k metric: probability that at least one of k independent attempts will succeed.
+
+    Required configuration:
+        k: Number of samples to draw
+    """
+
+    def __init__(self, **config):
+        """Initialize pass@k metric with two-phase initialization."""
+        super().__init__(**config)
+        self.validate_config()
+        self.metadata = self.get_metadata()
+
+    def validate_config(self):
+        """Validate and store pass@k-specific configuration requirements"""
+        # Validate using Pydantic config class
+        config_obj = PassAtKMetricConfig(**self.config)
+
+        # Store validated fields as instance attributes
+        self.k = config_obj.k
+
+    def get_metadata(self) -> BaseMetricMetadata:
+        """Return metadata for the pass@k metric"""
+        return BaseMetricMetadata(
+            name="pass@k",
+            display_name="Pass@k",
+            description="Probability that at least one of k independent attempts will succeed.",
+            range=(0.0, 1.0),
+            higher_is_better=True,
+            metric_type="industry",
+        )
+
+    def calculate(self, results: List[UnitMetricResult]) -> Dict[str, Any]:
+        """Calculate Pass@k score.
+
+        Args:
+            results: List of UnitMetricResult
+
+        Returns:
+            dict: {"pass@k": float (0.0 to 1.0)}
+        """
+        if not results:
+            logger.warning(f"{self.__class__.__name__}: No results provided")
+            return {
+                "pass@k": 0.0,
+            }
+        # Total number of attempts/samples
+        n = len(results)
+        # Number of correct solutions
+        c = self._count_correct(results)
+        pass_at_k_value = self.pass_at_k(n, c, self.k)
+
+        return {
+            "pass@k": pass_at_k_value,
+        }
+
+    @staticmethod
+    def pass_at_k(n: int, c: int, k: int) -> float:
+        """Calculate pass@k metric: probability that at least one of k independent attempts will succeed.
+
+        Args:
+            n (int): Total number of attempts/samples
+            c (int): Number of correct solutions
+            k (int): Number of samples to draw
+
+        Returns:
+            float: Pass@k probability (0 to 1)
+
+        Raises:
+            ValueError: If invalid parameters are provided
+        """
+        if n <= 0 or c < 0 or k <= 0:
+            raise ValueError("n and k must be positive, c must be non-negative")
+        if c > n:
+            raise ValueError("Number of correct solutions (c) cannot exceed total attempts (n)")
+        if k > n:
+            raise ValueError("Sample size (k) cannot exceed total attempts (n)")
+
+        # If all solutions are correct, pass@k = 1
+        if c == n:
+            return 1.0
+
+        # If no solutions are correct, pass@k = 0
+        if c == 0:
+            return 0.0
+
+        # Calculate using the complement: 1 - P(all k samples are incorrect)
+        # P(all incorrect) = C(n-c, k) / C(n, k)
+        try:
+            prob_all_incorrect = math.comb(n - c, k) / math.comb(n, k)
+            return 1.0 - prob_all_incorrect
+        except (ValueError, ZeroDivisionError):
+            # Handle edge cases where combinations are invalid
+            return 0.0
diff --git a/sygra/core/eval/metrics/aggregator_metrics/pass_power_k.py b/sygra/core/eval/metrics/aggregator_metrics/pass_power_k.py
new file mode 100644
index 00000000..61efd068
--- /dev/null
+++ b/sygra/core/eval/metrics/aggregator_metrics/pass_power_k.py
@@ -0,0 +1,100 @@
+"""
+Pass^k Metrics
+"""
+
+from typing import Any, Dict, List
+
+from sygra.core.eval.metrics.aggregator_metrics.aggregator_metric_registry import aggregator_metric
+from sygra.core.eval.metrics.aggregator_metrics.base_aggregator_metric import BaseAggregatorMetric
+from sygra.core.eval.metrics.aggregator_metrics.pass_at_k import PassAtKMetricConfig
+from sygra.core.eval.metrics.base_metric_metadata import BaseMetricMetadata
+from sygra.core.eval.metrics.unit_metrics.unit_metric_result import UnitMetricResult
+from sygra.logger.logger_config import logger
+
+
+@aggregator_metric("pass^k")
+class PassPowerKMetric(BaseAggregatorMetric):
+    """Calculate pass^k metric: probability that an agent would succeed on all k independent attempts.
+
+    Required configuration:
+        k: Number of samples to draw
+    """
+
+    def __init__(self, **config):
+        """Initialize pass^k metric with two-phase initialization."""
+        super().__init__(**config)
+        self.validate_config()
+        self.metadata = self.get_metadata()
+
+    def validate_config(self):
+        """Validate and store pass^k-specific configuration requirements"""
+        # Validate using Pydantic config class (pass^k reuses the pass@k config: a single k)
+        config_obj = PassAtKMetricConfig(**self.config)
+
+        # Store validated fields as instance attributes
+        self.k = config_obj.k
+
+    def get_metadata(self) -> BaseMetricMetadata:
+        """Return metadata for the pass^k metric"""
+        return BaseMetricMetadata(
+            name="pass^k",
+            display_name="Pass^k",
+            description="Probability that an agent would succeed on all k independent attempts.",
+            range=(0.0, 1.0),
+            higher_is_better=True,
+            metric_type="industry",
+        )
+
+    def calculate(self, results: List[UnitMetricResult]) -> Dict[str, Any]:
+        """Calculate Pass^k score.
+
+        Args:
+            results: List of UnitMetricResult
+
+        Returns:
+            dict: Dictionary containing metrics and related information
+                {"success_rate": float (0.0 to 1.0), "pass^k": float (0.0 to 1.0)}
+
+        Raises:
+            ValueError: If invalid parameters are provided
+        """
+        if not results:
+            logger.warning(f"{self.__class__.__name__}: No results provided")
+            return {"success_rate": 0.0, "pass^k": 0.0}
+        # Total number of attempts/samples
+        n = len(results)
+        # Number of correct solutions
+        c = self._count_correct(results)
+
+        if n <= 0:
+            raise ValueError("Total attempts (n) must be positive")
+        if c < 0:
+            raise ValueError("Correct solutions (c) must be non-negative")
+        if c > n:
+            raise ValueError("Correct solutions (c) cannot exceed total attempts (n)")
+
+        success_rate = self._safe_divide(c, n)
+        pass_power_k_value = self.pass_power_k(success_rate, self.k)
+
+        return {"success_rate": success_rate, "pass^k": pass_power_k_value}
+
+    @staticmethod
+    def pass_power_k(success_rate: float, k: int) -> float:
+        """Calculate pass^k metric: probability that an agent would succeed on all k independent attempts.
+
+        Args:
+            success_rate (float): Raw success rate on a single attempt (0 to 1)
+            k (int): Number of consecutive attempts
+
+        Returns:
+            float: Pass^k probability (0 to 1)
+
+        Raises:
+            ValueError: If invalid parameters are provided
+        """
+        if not 0 <= success_rate <= 1:
+            raise ValueError("Success rate must be between 0 and 1")
+        if k <= 0:
+            raise ValueError("k must be positive")
+
+        return success_rate**k
diff --git a/tests/core/eval/metrics/aggregator_metrics/test_pass_at_k.py b/tests/core/eval/metrics/aggregator_metrics/test_pass_at_k.py
new file mode 100644
index 00000000..0a446e05
--- /dev/null
+++ b/tests/core/eval/metrics/aggregator_metrics/test_pass_at_k.py
@@ -0,0 +1,255 @@
+"""
+Unit tests for PassAtKMetric
+Tests pass@k calculation from unit metric results.
+""" + +import os +import sys + +# Add project root to sys.path for relative imports to work +sys.path.insert( + 0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..")) +) + +import pytest + +from sygra.core.eval.metrics.aggregator_metrics.pass_at_k import PassAtKMetric +from sygra.core.eval.metrics.unit_metrics.unit_metric_result import UnitMetricResult +from sygra.logger.logger_config import logger + + +class TestPassAtKMetric: + """Test suite for PassAtKMetric""" + + def test_get_metric_name(self): + """Test that metric name is 'pass@k'""" + metric = PassAtKMetric(k=1) + assert metric.get_metric_name() == "pass@k" + + def test_calculate_empty_results(self): + """Test calculate with empty results list""" + metric = PassAtKMetric(k=1) + results = [] + output = metric.calculate(results) + + assert "pass@k" in output + assert output["pass@k"] == 0.0 + + def test_calculate_all_correct(self): + """Test calculate when all predictions are correct""" + metric = PassAtKMetric(k=1) + results = [ + UnitMetricResult( + correct=True, + golden={"class": "A"}, + predicted={"class": "A"}, + metadata={"id": 1}, + ), + UnitMetricResult( + correct=True, + golden={"class": "B"}, + predicted={"class": "B"}, + metadata={"id": 2}, + ), + UnitMetricResult( + correct=True, + golden={"class": "C"}, + predicted={"class": "C"}, + metadata={"id": 3}, + ), + ] + output = metric.calculate(results) + + assert "pass@k" in output + assert output["pass@k"] == 1.0 + + def test_calculate_all_incorrect(self): + """Test calculate when all predictions are incorrect""" + metric = PassAtKMetric(k=1) + results = [ + UnitMetricResult( + correct=False, + golden={"class": "A"}, + predicted={"class": "B"}, + metadata={"id": 1}, + ), + UnitMetricResult( + correct=False, + golden={"class": "B"}, + predicted={"class": "C"}, + metadata={"id": 2}, + ), + UnitMetricResult( + correct=False, + golden={"class": "C"}, + predicted={"class": "A"}, + metadata={"id": 3}, + ), + ] + output = metric.calculate(results) + + assert "pass@k" in output + assert output["pass@k"] == 0.0 + + def test_calculate_mixed_results(self): + """Test calculate with mixed correct/incorrect predictions""" + metric = PassAtKMetric(k=1) + results = [ + UnitMetricResult( + correct=True, + golden={"class": "A"}, + predicted={"class": "A"}, + metadata={"id": 1}, + ), + UnitMetricResult( + correct=False, + golden={"class": "B"}, + predicted={"class": "C"}, + metadata={"id": 2}, + ), + UnitMetricResult( + correct=True, + golden={"class": "C"}, + predicted={"class": "C"}, + metadata={"id": 3}, + ), + UnitMetricResult( + correct=False, + golden={"class": "D"}, + predicted={"class": "A"}, + metadata={"id": 4}, + ), + ] + output = metric.calculate(results) + + assert "pass@k" in output + assert output["pass@k"] == 0.5 # 2 correct out of 4 + + def test_calculate_single_correct_result(self): + """Test calculate with single correct result""" + metric = PassAtKMetric(k=1) + results = [ + UnitMetricResult( + correct=True, + golden={"class": "A"}, + predicted={"class": "A"}, + metadata={"id": 1}, + ), + ] + output = metric.calculate(results) + + assert "pass@k" in output + assert output["pass@k"] == 1.0 + + def test_calculate_single_incorrect_result(self): + """Test calculate with single incorrect result""" + metric = PassAtKMetric(k=1) + results = [ + UnitMetricResult( + correct=False, + golden={"class": "A"}, + predicted={"class": "B"}, + metadata={"id": 1}, + ), + ] + output = metric.calculate(results) + + assert "pass@k" in output + assert 
output["pass@k"] == 0.0 + + def test_calculate_various_pass_at_k_values(self): + """Test calculate with various pass@k percentages""" + + # 75% pass@k (3 out of 4) + results = [ + UnitMetricResult(correct=True, golden={}, predicted={}), + UnitMetricResult(correct=True, golden={}, predicted={}), + UnitMetricResult(correct=True, golden={}, predicted={}), + UnitMetricResult(correct=False, golden={}, predicted={}), + ] + output = PassAtKMetric(k=1).calculate(results) + assert output["pass@k"] == 0.75 + + output = PassAtKMetric(k=2).calculate(results) + assert output["pass@k"] == 1.0 + + # 60% pass@k (3 out of 5) + results = [ + UnitMetricResult(correct=True, golden={}, predicted={}), + UnitMetricResult(correct=True, golden={}, predicted={}), + UnitMetricResult(correct=True, golden={}, predicted={}), + UnitMetricResult(correct=False, golden={}, predicted={}), + UnitMetricResult(correct=False, golden={}, predicted={}), + ] + output = PassAtKMetric(k=1).calculate(results) + assert output["pass@k"] == 0.6 + + output = PassAtKMetric(k=2).calculate(results) + assert output["pass@k"] == 0.9 + + output = PassAtKMetric(k=3).calculate(results) + assert output["pass@k"] == 1.0 + + # 33.33% pass@k (1 out of 3) + results = [ + UnitMetricResult(correct=True, golden={}, predicted={}), + UnitMetricResult(correct=False, golden={}, predicted={}), + UnitMetricResult(correct=False, golden={}, predicted={}), + ] + output = PassAtKMetric(k=1).calculate(results) + assert output["pass@k"] == pytest.approx(0.333, rel=1e-2) + + output = PassAtKMetric(k=2).calculate(results) + assert output["pass@k"] == pytest.approx(0.666, rel=1e-2) + + output = PassAtKMetric(k=3).calculate(results) + assert output["pass@k"] == 1.0 + + def test_calculate_with_complex_metadata(self): + """Test calculate with complex metadata in results""" + results = [ + UnitMetricResult( + correct=True, + golden={"event": "click", "x": 100, "y": 200}, + predicted={"tool": "click", "x": 105, "y": 195}, + metadata={ + "mission_id": "mission_01", + "step_id": "step_1", + "validation_type": "tool_only", + }, + ), + UnitMetricResult( + correct=False, + golden={"event": "type", "text": "hello"}, + predicted={"tool": "click", "text": "world"}, + metadata={ + "mission_id": "mission_01", + "step_id": "step_2", + "validation_type": "full", + }, + ), + ] + output = PassAtKMetric(k=1).calculate(results) + assert "pass@k" in output + assert output["pass@k"] == 0.5 + + output = PassAtKMetric(k=2).calculate(results) + assert "pass@k" in output + assert output["pass@k"] == 1.0 + + def test_calculate_with_different_data_types(self): + """Test calculate with different data types in golden/predicted""" + logger.info("Testing calculate with different data types in golden/predicted") + results = [ + UnitMetricResult(correct=True, golden={"value": 1}, predicted={"value": 1}), + UnitMetricResult(correct=True, golden={"value": "text"}, predicted={"value": "text"}), + UnitMetricResult(correct=True, golden={"value": True}, predicted={"value": True}), + UnitMetricResult(correct=False, golden={"value": [1, 2]}, predicted={"value": [1, 3]}), + ] + output = PassAtKMetric(k=1).calculate(results) + assert "pass@k" in output + assert output["pass@k"] == 0.75 + + output = PassAtKMetric(k=2).calculate(results) + assert "pass@k" in output + assert output["pass@k"] == 1.0 diff --git a/tests/core/eval/metrics/aggregator_metrics/test_pass_power_k.py b/tests/core/eval/metrics/aggregator_metrics/test_pass_power_k.py new file mode 100644 index 00000000..ff8a836d --- /dev/null +++ 
@@ -0,0 +1,252 @@
+"""
+Unit tests for PassPowerKMetric
+Tests pass^k calculation from unit metric results.
+"""
+
+import os
+import sys
+
+# Add project root to sys.path for relative imports to work
+sys.path.insert(
+    0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", ".."))
+)
+
+import pytest
+
+from sygra.core.eval.metrics.aggregator_metrics.pass_power_k import PassPowerKMetric
+from sygra.core.eval.metrics.unit_metrics.unit_metric_result import UnitMetricResult
+from sygra.logger.logger_config import logger
+
+
+class TestPassPowerKMetric:
+    """Test suite for PassPowerKMetric"""
+
+    def test_get_metric_name(self):
+        """Test that metric name is 'pass^k'"""
+        metric = PassPowerKMetric(k=1)
+        assert metric.get_metric_name() == "pass^k"
+
+    def test_calculate_empty_results(self):
+        """Test calculate with empty results list"""
+        metric = PassPowerKMetric(k=1)
+        results = []
+        output = metric.calculate(results)
+
+        assert "pass^k" in output
+        assert output["pass^k"] == 0.0
+
+    def test_calculate_all_correct(self):
+        """Test calculate when all predictions are correct"""
+        metric = PassPowerKMetric(k=1)
+        results = [
+            UnitMetricResult(
+                correct=True,
+                golden={"class": "A"},
+                predicted={"class": "A"},
+                metadata={"id": 1},
+            ),
+            UnitMetricResult(
+                correct=True,
+                golden={"class": "B"},
+                predicted={"class": "B"},
+                metadata={"id": 2},
+            ),
+            UnitMetricResult(
+                correct=True,
+                golden={"class": "C"},
+                predicted={"class": "C"},
+                metadata={"id": 3},
+            ),
+        ]
+        output = metric.calculate(results)
+
+        assert "pass^k" in output
+        assert output["pass^k"] == 1.0
+
+    def test_calculate_all_incorrect(self):
+        """Test calculate when all predictions are incorrect"""
+        metric = PassPowerKMetric(k=1)
+        results = [
+            UnitMetricResult(
+                correct=False,
+                golden={"class": "A"},
+                predicted={"class": "B"},
+                metadata={"id": 1},
+            ),
+            UnitMetricResult(
+                correct=False,
+                golden={"class": "B"},
+                predicted={"class": "C"},
+                metadata={"id": 2},
+            ),
+            UnitMetricResult(
+                correct=False,
+                golden={"class": "C"},
+                predicted={"class": "A"},
+                metadata={"id": 3},
+            ),
+        ]
+        output = metric.calculate(results)
+
+        assert "pass^k" in output
+        assert output["pass^k"] == 0.0
+
+    def test_calculate_mixed_results(self):
+        """Test calculate with mixed correct/incorrect predictions"""
+        metric = PassPowerKMetric(k=1)
+        results = [
+            UnitMetricResult(
+                correct=True,
+                golden={"class": "A"},
+                predicted={"class": "A"},
+                metadata={"id": 1},
+            ),
+            UnitMetricResult(
+                correct=False,
+                golden={"class": "B"},
+                predicted={"class": "C"},
+                metadata={"id": 2},
+            ),
+            UnitMetricResult(
+                correct=True,
+                golden={"class": "C"},
+                predicted={"class": "C"},
+                metadata={"id": 3},
+            ),
+            UnitMetricResult(
+                correct=False,
+                golden={"class": "D"},
+                predicted={"class": "A"},
+                metadata={"id": 4},
+            ),
+        ]
+        output = metric.calculate(results)
+
+        assert "pass^k" in output
+        assert output["pass^k"] == 0.5  # 2 correct out of 4
+
+    def test_calculate_single_correct_result(self):
+        """Test calculate with single correct result"""
+        metric = PassPowerKMetric(k=1)
+        results = [
+            UnitMetricResult(
+                correct=True,
+                golden={"class": "A"},
+                predicted={"class": "A"},
+                metadata={"id": 1},
+            ),
+        ]
+        output = metric.calculate(results)
+
+        assert "pass^k" in output
+        assert output["pass^k"] == 1.0
+
+    def test_calculate_single_incorrect_result(self):
+        """Test calculate with single incorrect result"""
+        metric = PassPowerKMetric(k=1)
+        results = [
+            UnitMetricResult(
+                correct=False,
+                golden={"class": "A"},
+                predicted={"class": "B"},
+                metadata={"id": 1},
+            ),
+        ]
+        output = metric.calculate(results)
+
+        assert "pass^k" in output
+        assert output["pass^k"] == 0.0
+
+    def test_calculate_various_pass_power_k_values(self):
+        """Test calculate with various success rates and k values"""
+
+        # 75% success rate (3 out of 4)
+        results = [
+            UnitMetricResult(correct=True, golden={}, predicted={}),
+            UnitMetricResult(correct=True, golden={}, predicted={}),
+            UnitMetricResult(correct=True, golden={}, predicted={}),
+            UnitMetricResult(correct=False, golden={}, predicted={}),
+        ]
+        output = PassPowerKMetric(k=1).calculate(results)
+        assert output["pass^k"] == 0.75
+
+        output = PassPowerKMetric(k=2).calculate(results)
+        assert output["pass^k"] == 0.5625
+
+        output = PassPowerKMetric(k=3).calculate(results)
+        assert output["pass^k"] == 0.421875
+
+        # 60% success rate (3 out of 5)
+        results = [
+            UnitMetricResult(correct=True, golden={}, predicted={}),
+            UnitMetricResult(correct=True, golden={}, predicted={}),
+            UnitMetricResult(correct=True, golden={}, predicted={}),
+            UnitMetricResult(correct=False, golden={}, predicted={}),
+            UnitMetricResult(correct=False, golden={}, predicted={}),
+        ]
+        output = PassPowerKMetric(k=1).calculate(results)
+        assert output["pass^k"] == 0.6
+
+        output = PassPowerKMetric(k=2).calculate(results)
+        assert output["pass^k"] == 0.36
+
+        # 33.33% success rate (1 out of 3)
+        results = [
+            UnitMetricResult(correct=True, golden={}, predicted={}),
+            UnitMetricResult(correct=False, golden={}, predicted={}),
+            UnitMetricResult(correct=False, golden={}, predicted={}),
+        ]
+        output = PassPowerKMetric(k=1).calculate(results)
+        assert output["pass^k"] == pytest.approx(0.333, rel=1e-2)
+
+        output = PassPowerKMetric(k=2).calculate(results)
+        assert output["pass^k"] == pytest.approx(0.111, rel=1e-2)
+
+    def test_calculate_with_complex_metadata(self):
+        """Test calculate with complex metadata in results"""
+        results = [
+            UnitMetricResult(
+                correct=True,
+                golden={"event": "click", "x": 100, "y": 200},
+                predicted={"tool": "click", "x": 105, "y": 195},
+                metadata={
+                    "mission_id": "mission_01",
+                    "step_id": "step_1",
+                    "validation_type": "tool_only",
+                },
+            ),
+            UnitMetricResult(
+                correct=False,
+                golden={"event": "type", "text": "hello"},
+                predicted={"tool": "click", "text": "world"},
+                metadata={
+                    "mission_id": "mission_01",
+                    "step_id": "step_2",
+                    "validation_type": "full",
+                },
+            ),
+        ]
+        output = PassPowerKMetric(k=1).calculate(results)
+        assert "pass^k" in output
+        assert output["pass^k"] == 0.5
+
+        output = PassPowerKMetric(k=2).calculate(results)
+        assert "pass^k" in output
+        assert output["pass^k"] == 0.25
+
+    def test_calculate_with_different_data_types(self):
+        """Test calculate with different data types in golden/predicted"""
+        logger.info("Testing calculate with different data types in golden/predicted")
+        results = [
+            UnitMetricResult(correct=True, golden={"value": 1}, predicted={"value": 1}),
+            UnitMetricResult(correct=True, golden={"value": "text"}, predicted={"value": "text"}),
+            UnitMetricResult(correct=True, golden={"value": True}, predicted={"value": True}),
+            UnitMetricResult(correct=False, golden={"value": [1, 2]}, predicted={"value": [1, 3]}),
+        ]
+        output = PassPowerKMetric(k=1).calculate(results)
+        assert "pass^k" in output
+        assert output["pass^k"] == 0.75
+
+        output = PassPowerKMetric(k=2).calculate(results)
+        assert "pass^k" in output
+        assert output["pass^k"] == 0.5625
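
Reviewer note: as a cross-check of the expected values in the tests above, here is a minimal standalone sketch of the two estimators this diff adds. It reimplements the formulas directly with `math.comb` instead of importing the sygra classes, so it runs outside the repo; the function names below are illustrative, not the PR's API. Pass@k uses the standard unbiased estimator 1 - C(n-c, k)/C(n, k) (Chen et al., 2021), while pass^k raises the single-attempt success rate to the k-th power.

```python
import math


def pass_at_k(n: int, c: int, k: int) -> float:
    """P(at least one of k draws from n attempts is correct), given c correct."""
    # math.comb(m, k) returns 0 when k > m, so c == n (no incorrect
    # samples left to draw) naturally yields 1.0, and c == 0 yields 0.0.
    return 1.0 - math.comb(n - c, k) / math.comb(n, k)


def pass_power_k(success_rate: float, k: int) -> float:
    """P(all k independent attempts succeed) at a fixed per-attempt rate."""
    return success_rate**k


# Worked example mirroring the "3 correct out of 5" cases in the new tests:
n, c, k = 5, 3, 2
print(pass_at_k(n, c, k))      # 1 - C(2,2)/C(5,2) = 1 - 1/10 = 0.9
print(pass_power_k(c / n, k))  # 0.6 ** 2 = 0.36
```

Note how the two diverge as k grows: for any c > 0, pass@k increases toward 1 (one success among k tries suffices), while pass^k decays toward 0 (every try must succeed), which is why `PassPowerKMetric.calculate` reports `success_rate` alongside `pass^k`.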