From 6b11767d30fb08969146d4bb58ac8570cc20c34f Mon Sep 17 00:00:00 2001 From: ajay-kesavan Date: Tue, 19 May 2026 17:54:34 -0700 Subject: [PATCH 01/13] feat(eval): add evaluator type schemas for classification evaluators Generates BinaryClassificationEvaluator.json and MulticlassClassificationEvaluator.json from the new evaluators added in #1397 so external tooling (Flow UI evaluator picker, `uip maestro flow eval`) can read the config / criteria / justification schemas. Files produced by `python -m uipath.eval.evaluators_types.generate_types`, restricted to the two new evaluator types. A companion PR refreshes the other 11 stale schemas in evaluators_types/. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../BinaryClassificationEvaluator.json | 121 ++++++++++++++++ .../MulticlassClassificationEvaluator.json | 133 ++++++++++++++++++ 2 files changed, 254 insertions(+) create mode 100644 packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json create mode 100644 packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json diff --git a/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json b/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json new file mode 100644 index 000000000..9f7351865 --- /dev/null +++ b/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json @@ -0,0 +1,121 @@ +{ + "evaluatorTypeId": "uipath-binary-classification", + "evaluatorConfigSchema": { + "$defs": { + "BinaryClassificationEvaluationCriteria": { + "description": "Per-datapoint criteria: which class this sample should belong to.", + "properties": { + "expected_class": { + "title": "Expected Class", + "type": "string" + } + }, + "required": [ + "expected_class" + ], + "title": "BinaryClassificationEvaluationCriteria", + "type": "object" + } + }, + "description": "Configuration for the binary classification evaluator.", + "properties": { + "name": { + 
"default": "BinaryClassificationEvaluator", + "title": "Name", + "type": "string" + }, + "description": { + "default": "", + "description": "The description of the evaluator", + "title": "Description", + "type": "string" + }, + "default_evaluation_criteria": { + "anyOf": [ + { + "$ref": "#/$defs/BinaryClassificationEvaluationCriteria" + }, + { + "type": "null" + } + ], + "default": null + }, + "target_output_key": { + "default": "*", + "description": "Key to extract output from agent execution", + "title": "Target Output Key", + "type": "string" + }, + "line_by_line_evaluator": { + "default": false, + "description": "If True, split output by delimiter and evaluate each line separately", + "title": "Line By Line Evaluator", + "type": "boolean" + }, + "line_delimiter": { + "default": "\n", + "description": "Delimiter to split output when line_by_line_evaluator is True", + "title": "Line Delimiter", + "type": "string" + }, + "positive_class": { + "title": "Positive Class", + "type": "string" + }, + "metric_type": { + "default": "precision", + "enum": [ + "precision", + "recall", + "f-score" + ], + "title": "Metric Type", + "type": "string" + }, + "f_value": { + "default": 1.0, + "title": "F Value", + "type": "number" + } + }, + "required": [ + "positive_class" + ], + "title": "BinaryClassificationEvaluatorConfig", + "type": "object" + }, + "evaluationCriteriaSchema": { + "description": "Per-datapoint criteria: which class this sample should belong to.", + "properties": { + "expected_class": { + "title": "Expected Class", + "type": "string" + } + }, + "required": [ + "expected_class" + ], + "title": "BinaryClassificationEvaluationCriteria", + "type": "object" + }, + "justificationSchema": { + "description": "Base class for all evaluator justifications.", + "properties": { + "expected": { + "title": "Expected", + "type": "string" + }, + "actual": { + "title": "Actual", + "type": "string" + } + }, + "required": [ + "expected", + "actual" + ], + "title": 
"BaseEvaluatorJustification", + "type": "object" + } +} \ No newline at end of file diff --git a/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json b/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json new file mode 100644 index 000000000..72262ba92 --- /dev/null +++ b/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json @@ -0,0 +1,133 @@ +{ + "evaluatorTypeId": "uipath-multiclass-classification", + "evaluatorConfigSchema": { + "$defs": { + "MulticlassClassificationEvaluationCriteria": { + "description": "Per-datapoint criteria: which class this sample should belong to.", + "properties": { + "expected_class": { + "title": "Expected Class", + "type": "string" + } + }, + "required": [ + "expected_class" + ], + "title": "MulticlassClassificationEvaluationCriteria", + "type": "object" + } + }, + "description": "Configuration for the multiclass classification evaluator.", + "properties": { + "name": { + "default": "MulticlassClassificationEvaluator", + "title": "Name", + "type": "string" + }, + "description": { + "default": "", + "description": "The description of the evaluator", + "title": "Description", + "type": "string" + }, + "default_evaluation_criteria": { + "anyOf": [ + { + "$ref": "#/$defs/MulticlassClassificationEvaluationCriteria" + }, + { + "type": "null" + } + ], + "default": null + }, + "target_output_key": { + "default": "*", + "description": "Key to extract output from agent execution", + "title": "Target Output Key", + "type": "string" + }, + "line_by_line_evaluator": { + "default": false, + "description": "If True, split output by delimiter and evaluate each line separately", + "title": "Line By Line Evaluator", + "type": "boolean" + }, + "line_delimiter": { + "default": "\n", + "description": "Delimiter to split output when line_by_line_evaluator is True", + "title": "Line Delimiter", + "type": "string" + }, + "classes": { + "items": { + "type": 
"string" + }, + "title": "Classes", + "type": "array" + }, + "metric_type": { + "default": "f-score", + "enum": [ + "precision", + "recall", + "f-score" + ], + "title": "Metric Type", + "type": "string" + }, + "averaging": { + "default": "macro", + "enum": [ + "micro", + "macro" + ], + "title": "Averaging", + "type": "string" + }, + "f_value": { + "default": 1.0, + "title": "F Value", + "type": "number" + } + }, + "required": [ + "classes" + ], + "title": "MulticlassClassificationEvaluatorConfig", + "type": "object" + }, + "evaluationCriteriaSchema": { + "description": "Per-datapoint criteria: which class this sample should belong to.", + "properties": { + "expected_class": { + "title": "Expected Class", + "type": "string" + } + }, + "required": [ + "expected_class" + ], + "title": "MulticlassClassificationEvaluationCriteria", + "type": "object" + }, + "justificationSchema": { + "description": "Base class for all evaluator justifications.", + "properties": { + "expected": { + "title": "Expected", + "type": "string" + }, + "actual": { + "title": "Actual", + "type": "string" + } + }, + "required": [ + "expected", + "actual" + ], + "title": "BaseEvaluatorJustification", + "type": "object" + } +} \ No newline at end of file From 037b60cdb6e721c494b2b4fd173e6bf1bdb450ed Mon Sep 17 00:00:00 2001 From: ajay-kesavan Date: Tue, 19 May 2026 18:27:58 -0700 Subject: [PATCH 02/13] test(eval): add e2e tests + sample projects for classification evaluators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two sample projects under packages/uipath/samples/ that double as end-to-end test fixtures for the binary and multiclass classification evaluators added in #1397: - binary_classification_agent — rule-based spam/ham classifier wired up to the binary classification evaluator with metric_type=precision. Eval set is designed so 4/5 datapoints pass but precision is 2/3 because of one deliberate false positive. 
- multiclass_classification_simple — rule-based 3-class router (payments / support / spam) wired up to the multiclass classification evaluator with macro-averaged F1. Eval set forces a misroute that hurts both payments precision and support recall, giving macro F1 = 26/30. Adds tests/cli/eval/test_classification_samples_e2e.py which loads each sample's eval-sets/default.json, wires its main.py into a stand-in runtime, calls evaluate(), and asserts both the per-row scores and the aggregated metric produced by reduce_scores. Locks in the dataset-level math, not just per-row correct/incorrect. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../binary_classification_agent/bindings.json | 4 + .../evaluations/eval-sets/default.json | 63 ++++++ .../evaluators/binary-classification.json | 16 ++ .../binary_classification_agent/main.py | 39 ++++ .../pyproject.toml | 9 + .../binary_classification_agent/uipath.json | 5 + .../bindings.json | 4 + .../evaluations/eval-sets/default.json | 85 ++++++++ .../evaluators/multiclass-classification.json | 17 ++ .../multiclass_classification_simple/main.py | 51 +++++ .../pyproject.toml | 9 + .../uipath.json | 5 + .../eval/test_classification_samples_e2e.py | 193 ++++++++++++++++++ 13 files changed, 500 insertions(+) create mode 100644 packages/uipath/samples/binary_classification_agent/bindings.json create mode 100644 packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json create mode 100644 packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json create mode 100644 packages/uipath/samples/binary_classification_agent/main.py create mode 100644 packages/uipath/samples/binary_classification_agent/pyproject.toml create mode 100644 packages/uipath/samples/binary_classification_agent/uipath.json create mode 100644 packages/uipath/samples/multiclass_classification_simple/bindings.json create mode 100644 
packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json create mode 100644 packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json create mode 100644 packages/uipath/samples/multiclass_classification_simple/main.py create mode 100644 packages/uipath/samples/multiclass_classification_simple/pyproject.toml create mode 100644 packages/uipath/samples/multiclass_classification_simple/uipath.json create mode 100644 packages/uipath/tests/cli/eval/test_classification_samples_e2e.py diff --git a/packages/uipath/samples/binary_classification_agent/bindings.json b/packages/uipath/samples/binary_classification_agent/bindings.json new file mode 100644 index 000000000..5e9beeb01 --- /dev/null +++ b/packages/uipath/samples/binary_classification_agent/bindings.json @@ -0,0 +1,4 @@ +{ + "version": "2.0", + "resources": [] +} diff --git a/packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json b/packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json new file mode 100644 index 000000000..f47cd25b8 --- /dev/null +++ b/packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json @@ -0,0 +1,63 @@ +{ + "version": "1.0", + "id": "SpamBinaryEval", + "name": "Binary spam classifier — precision", + "evaluatorRefs": ["BinarySpamPrecision"], + "evaluations": [ + { + "id": "spam-prize", + "name": "Spam: prize giveaway", + "inputs": { + "email_subject": "You won a FREE iPhone!!!", + "email_body": "Congratulations! Click here to claim your prize now." + }, + "evaluationCriterias": { + "BinarySpamPrecision": { "expectedClass": "spam" } + } + }, + { + "id": "spam-promo", + "name": "Spam: unsolicited promo", + "inputs": { + "email_subject": "Winner of the monthly drawing", + "email_body": "You've been selected. Click here to redeem." 
+ }, + "evaluationCriterias": { + "BinarySpamPrecision": { "expectedClass": "spam" } + } + }, + { + "id": "ham-invoice", + "name": "Ham: legitimate invoice", + "inputs": { + "email_subject": "Your March invoice is ready", + "email_body": "Your monthly invoice of $45.99 is attached. Payment is due March 15." + }, + "evaluationCriterias": { + "BinarySpamPrecision": { "expectedClass": "ham" } + } + }, + { + "id": "ham-meeting", + "name": "Ham: meeting request", + "inputs": { + "email_subject": "Sync on Q2 planning", + "email_body": "Can we meet Wednesday at 2pm to align on next quarter's roadmap?" + }, + "evaluationCriterias": { + "BinarySpamPrecision": { "expectedClass": "ham" } + } + }, + { + "id": "ham-mislabeled", + "name": "Ham mislabeled as spam (forces a false positive)", + "inputs": { + "email_subject": "Free coffee in the break room!!!", + "email_body": "Just a heads up — the new espresso machine is set up." + }, + "evaluationCriterias": { + "BinarySpamPrecision": { "expectedClass": "ham" } + } + } + ] +} diff --git a/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json b/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json new file mode 100644 index 000000000..21f7d6850 --- /dev/null +++ b/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json @@ -0,0 +1,16 @@ +{ + "version": "1.0", + "id": "BinarySpamPrecision", + "description": "Precision on the 'spam' positive class", + "evaluatorTypeId": "uipath-binary-classification", + "evaluatorConfig": { + "name": "BinarySpamPrecision", + "targetOutputKey": "category", + "positiveClass": "spam", + "metricType": "precision", + "fValue": 1.0, + "defaultEvaluationCriteria": { + "expectedClass": "ham" + } + } +} diff --git a/packages/uipath/samples/binary_classification_agent/main.py b/packages/uipath/samples/binary_classification_agent/main.py new file mode 100644 index 
000000000..1df5dea15 --- /dev/null +++ b/packages/uipath/samples/binary_classification_agent/main.py @@ -0,0 +1,39 @@ +"""Rule-based spam/ham classifier demonstrating the binary classification evaluator.""" + +from dataclasses import dataclass + +from uipath.tracing import traced + +SPAMMY_TOKENS = { + "free", + "winner", + "congratulations", + "click here", + "prize", + "!!!", +} + + +@dataclass +class EmailInput: + email_subject: str + email_body: str + + +@dataclass +class Classification: + category: str + + +@traced(name="classify_email", span_type="tool") +def classify_email(subject: str, body: str) -> str: + """Return 'spam' if any spam-indicator token appears in the subject or body.""" + text = f"{subject} {body}".lower() + return "spam" if any(token in text for token in SPAMMY_TOKENS) else "ham" + + +@traced() +async def main(input: EmailInput) -> Classification: + """Classify an email as 'spam' or 'ham'.""" + category = classify_email(input.email_subject, input.email_body) + return Classification(category=category) diff --git a/packages/uipath/samples/binary_classification_agent/pyproject.toml b/packages/uipath/samples/binary_classification_agent/pyproject.toml new file mode 100644 index 000000000..7d81d251a --- /dev/null +++ b/packages/uipath/samples/binary_classification_agent/pyproject.toml @@ -0,0 +1,9 @@ +[project] +name = "binary-classification-agent" +version = "0.0.1" +description = "Rule-based spam/ham classifier demonstrating the binary classification evaluator" +requires-python = ">=3.11" +dependencies = ["uipath"] + +[dependency-groups] +dev = ["uipath-dev"] diff --git a/packages/uipath/samples/binary_classification_agent/uipath.json b/packages/uipath/samples/binary_classification_agent/uipath.json new file mode 100644 index 000000000..9b02c2654 --- /dev/null +++ b/packages/uipath/samples/binary_classification_agent/uipath.json @@ -0,0 +1,5 @@ +{ + "functions": { + "main": "main.py:main" + } +} diff --git 
a/packages/uipath/samples/multiclass_classification_simple/bindings.json b/packages/uipath/samples/multiclass_classification_simple/bindings.json new file mode 100644 index 000000000..5e9beeb01 --- /dev/null +++ b/packages/uipath/samples/multiclass_classification_simple/bindings.json @@ -0,0 +1,4 @@ +{ + "version": "2.0", + "resources": [] +} diff --git a/packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json b/packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json new file mode 100644 index 000000000..27e66c25d --- /dev/null +++ b/packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json @@ -0,0 +1,85 @@ +{ + "version": "1.0", + "id": "EmailMulticlassEval", + "name": "3-class email router — macro F1", + "evaluatorRefs": ["EmailMulticlassFScore"], + "evaluations": [ + { + "id": "pay-invoice", + "name": "Payments: invoice reminder", + "inputs": { + "email_subject": "Your March invoice is ready", + "email_body": "Your monthly invoice of $45.99 is now available. Payment is due March 15." + }, + "evaluationCriterias": { + "EmailMulticlassFScore": { "expectedClass": "payments" } + } + }, + { + "id": "pay-refund", + "name": "Payments: refund request", + "inputs": { + "email_subject": "Refund for last month's charge", + "email_body": "I was charged twice for the same service. Please process a refund." + }, + "evaluationCriterias": { + "EmailMulticlassFScore": { "expectedClass": "payments" } + } + }, + { + "id": "support-broken", + "name": "Support: feature broken", + "inputs": { + "email_subject": "Login is broken", + "email_body": "I'm getting an error when trying to sign in. Need help." 
+ }, + "evaluationCriterias": { + "EmailMulticlassFScore": { "expectedClass": "support" } + } + }, + { + "id": "support-question", + "name": "Support: how-to question", + "inputs": { + "email_subject": "How do I export my data?", + "email_body": "Can you help me figure out where the export button is?" + }, + "evaluationCriterias": { + "EmailMulticlassFScore": { "expectedClass": "support" } + } + }, + { + "id": "spam-prize", + "name": "Spam: prize giveaway", + "inputs": { + "email_subject": "You won a FREE iPhone!!!", + "email_body": "Congratulations! Click here to claim your prize." + }, + "evaluationCriterias": { + "EmailMulticlassFScore": { "expectedClass": "spam" } + } + }, + { + "id": "spam-promo", + "name": "Spam: marketing winner", + "inputs": { + "email_subject": "Winner of the monthly drawing", + "email_body": "Congratulations, click here to redeem your reward." + }, + "evaluationCriterias": { + "EmailMulticlassFScore": { "expectedClass": "spam" } + } + }, + { + "id": "support-misrouted-by-payment-word", + "name": "Support email accidentally routed to payments (forces an FP for payments)", + "inputs": { + "email_subject": "Question about my billing portal access", + "email_body": "I cannot log into the billing portal. The page just spins. Can you help?" 
+ }, + "evaluationCriterias": { + "EmailMulticlassFScore": { "expectedClass": "support" } + } + } + ] +} diff --git a/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json b/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json new file mode 100644 index 000000000..859a18562 --- /dev/null +++ b/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json @@ -0,0 +1,17 @@ +{ + "version": "1.0", + "id": "EmailMulticlassFScore", + "description": "Macro-averaged F1 across payments / support / spam", + "evaluatorTypeId": "uipath-multiclass-classification", + "evaluatorConfig": { + "name": "EmailMulticlassFScore", + "targetOutputKey": "category", + "classes": ["payments", "support", "spam"], + "metricType": "f-score", + "averaging": "macro", + "fValue": 1.0, + "defaultEvaluationCriteria": { + "expectedClass": "support" + } + } +} diff --git a/packages/uipath/samples/multiclass_classification_simple/main.py b/packages/uipath/samples/multiclass_classification_simple/main.py new file mode 100644 index 000000000..3ab684298 --- /dev/null +++ b/packages/uipath/samples/multiclass_classification_simple/main.py @@ -0,0 +1,51 @@ +"""Rule-based 3-class email router demonstrating the multiclass classification evaluator.""" + +from dataclasses import dataclass + +from uipath.tracing import traced + +SPAM_TOKENS = {"free", "winner", "congratulations", "click here", "prize", "!!!"} +PAYMENT_TOKENS = {"invoice", "payment", "refund", "charge", "billing", "$"} +SUPPORT_TOKENS = { + "help", + "support", + "issue", + "error", + "ticket", + "broken", + "not working", +} + + +@dataclass +class EmailInput: + email_subject: str + email_body: str + + +@dataclass +class Classification: + category: str + + +@traced(name="classify_email", span_type="tool") +def classify_email(subject: str, body: str) -> str: + """Classify into 'spam', 
'payments', or 'support' using priority rules. + + Spam is checked first so promos with billing-flavored words still route to spam. + Payments is checked before support because it is the more specific intent. + Support is the catch-all default. + """ + text = f"{subject} {body}".lower() + if any(token in text for token in SPAM_TOKENS): + return "spam" + if any(token in text for token in PAYMENT_TOKENS): + return "payments" + return "support" + + +@traced() +async def main(input: EmailInput) -> Classification: + """Route an email to one of three queues.""" + category = classify_email(input.email_subject, input.email_body) + return Classification(category=category) diff --git a/packages/uipath/samples/multiclass_classification_simple/pyproject.toml b/packages/uipath/samples/multiclass_classification_simple/pyproject.toml new file mode 100644 index 000000000..e803a2a43 --- /dev/null +++ b/packages/uipath/samples/multiclass_classification_simple/pyproject.toml @@ -0,0 +1,9 @@ +[project] +name = "multiclass-classification-simple" +version = "0.0.1" +description = "Rule-based 3-class email router demonstrating the multiclass classification evaluator with macro-averaged F1" +requires-python = ">=3.11" +dependencies = ["uipath"] + +[dependency-groups] +dev = ["uipath-dev"] diff --git a/packages/uipath/samples/multiclass_classification_simple/uipath.json b/packages/uipath/samples/multiclass_classification_simple/uipath.json new file mode 100644 index 000000000..9b02c2654 --- /dev/null +++ b/packages/uipath/samples/multiclass_classification_simple/uipath.json @@ -0,0 +1,5 @@ +{ + "functions": { + "main": "main.py:main" + } +} diff --git a/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py b/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py new file mode 100644 index 000000000..202363221 --- /dev/null +++ b/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py @@ -0,0 +1,193 @@ +"""End-to-end tests that run the classification sample 
projects through evaluate(). + +These tests double as integration coverage for the binary and multiclass +classification evaluators added in #1397 — they wire each sample's main.py +into a stand-in runtime, run the full eval set, and assert the per-row scores +plus the aggregated metric produced by `reduce_scores`. +""" + +import importlib.util +import uuid +from pathlib import Path +from types import ModuleType +from typing import Any, AsyncGenerator + +import pytest + +from uipath.core.events import EventBus +from uipath.core.tracing import UiPathTraceManager +from uipath.eval.helpers import EvalHelpers +from uipath.eval.runtime import UiPathEvalContext, evaluate +from uipath.eval.runtime._types import UiPathEvalOutput +from uipath.eval.runtime.runtime import compute_evaluator_scores +from uipath.runtime import ( + UiPathExecuteOptions, + UiPathRuntimeEvent, + UiPathRuntimeFactorySettings, + UiPathRuntimeProtocol, + UiPathRuntimeResult, + UiPathRuntimeStatus, + UiPathRuntimeStorageProtocol, + UiPathStreamOptions, +) +from uipath.runtime.schema import UiPathRuntimeSchema + +SAMPLES_DIR = Path(__file__).resolve().parents[3] / "samples" + + +def _load_sample_main(sample_dir: Path) -> ModuleType: + """Import a sample's main.py as an isolated module.""" + module_name = f"_eval_sample_{sample_dir.name}" + spec = importlib.util.spec_from_file_location(module_name, sample_dir / "main.py") + assert spec is not None and spec.loader is not None + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +class _SampleRuntime: + """Runtime that delegates execution to the sample's `main` function.""" + + def __init__(self, sample_main: Any) -> None: + self._sample_main = sample_main + + async def execute( + self, + input: dict[str, Any] | None = None, + options: UiPathExecuteOptions | None = None, + ) -> UiPathRuntimeResult: + input_model = self._sample_main.EmailInput(**(input or {})) + output = await 
self._sample_main.main(input_model) + return UiPathRuntimeResult( + output={"category": output.category}, + status=UiPathRuntimeStatus.SUCCESSFUL, + ) + + async def stream( + self, + input: dict[str, Any] | None = None, + options: UiPathStreamOptions | None = None, + ) -> AsyncGenerator[UiPathRuntimeEvent, None]: + yield await self.execute(input, None) + + async def get_schema(self) -> UiPathRuntimeSchema: + return UiPathRuntimeSchema( + filePath="main.py", + uniqueId="main", + type="agent", + input={ + "type": "object", + "properties": { + "email_subject": {"type": "string"}, + "email_body": {"type": "string"}, + }, + }, + output={ + "type": "object", + "properties": {"category": {"type": "string"}}, + }, + ) + + async def dispose(self) -> None: + pass + + +class _SampleFactory: + def __init__(self, sample_main: Any) -> None: + self._sample_main = sample_main + + def discover_entrypoints(self) -> list[str]: + return ["main"] + + async def get_storage(self) -> UiPathRuntimeStorageProtocol | None: + return None + + async def get_settings(self) -> UiPathRuntimeFactorySettings | None: + return None + + async def new_runtime( + self, entrypoint: str, runtime_id: str, **kwargs: Any + ) -> UiPathRuntimeProtocol: + return _SampleRuntime(self._sample_main) + + async def dispose(self) -> None: + pass + + +async def _run_sample(sample_dir: Path) -> tuple[UiPathEvalOutput, dict[str, float]]: + """Run the sample's eval set and return (per-row output, evaluator_averages).""" + sample_main = _load_sample_main(sample_dir) + factory = _SampleFactory(sample_main) + + eval_set_path = str(sample_dir / "evaluations" / "eval-sets" / "default.json") + evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path) + evaluators = await EvalHelpers.load_evaluators( + eval_set_path, evaluation_set, agent_model=None + ) + + runtime = await factory.new_runtime("main", "test-runtime-id") + runtime_schema = await runtime.get_schema() + + context = UiPathEvalContext() + context.execution_id = 
str(uuid.uuid4()) + context.evaluation_set = evaluation_set + context.runtime_schema = runtime_schema + context.evaluators = evaluators + + result = await evaluate( + factory, + UiPathTraceManager(), + context, + EventBus(), + ) + + eval_output = UiPathEvalOutput.model_validate(result.output) + _, evaluator_averages = compute_evaluator_scores( + eval_output.evaluation_set_results, evaluators + ) + return eval_output, evaluator_averages + + +def _per_row_scores(output: UiPathEvalOutput) -> dict[str, float]: + return { + row.evaluation_name: row.evaluation_run_results[0].result.score + for row in output.evaluation_set_results + } + + +async def test_binary_classification_sample_end_to_end(): + """Binary spam classifier: 4/5 datapoints correct, but precision is 2/3 because of one FP.""" + output, averages = await _run_sample(SAMPLES_DIR / "binary_classification_agent") + + per_row = _per_row_scores(output) + assert per_row == { + "Spam: prize giveaway": 1.0, + "Spam: unsolicited promo": 1.0, + "Ham: legitimate invoice": 1.0, + "Ham: meeting request": 1.0, + "Ham mislabeled as spam (forces a false positive)": 0.0, + } + # Precision = TP / (TP + FP) = 2 / (2 + 1) = 0.6666... 
+ assert averages["BinarySpamPrecision"] == pytest.approx(2 / 3, rel=1e-6) + + +async def test_multiclass_classification_sample_end_to_end(): + """Multiclass router: 6/7 correct, macro F1 = (0.8 + 0.8 + 1.0) / 3 = 0.8666...""" + output, averages = await _run_sample( + SAMPLES_DIR / "multiclass_classification_simple" + ) + + per_row = _per_row_scores(output) + assert per_row == { + "Payments: invoice reminder": 1.0, + "Payments: refund request": 1.0, + "Support: feature broken": 1.0, + "Support: how-to question": 1.0, + "Spam: prize giveaway": 1.0, + "Spam: marketing winner": 1.0, + "Support email accidentally routed to payments " + "(forces an FP for payments)": 0.0, + } + # payments F1=0.8 (P=2/3, R=1), support F1=0.8 (P=1, R=2/3), spam F1=1.0 + # macro = mean = 2.6 / 3 + assert averages["EmailMulticlassFScore"] == pytest.approx(2.6 / 3, rel=1e-6) From 5e574f1895feccb314fd929d57e15dd69580c5f0 Mon Sep 17 00:00:00 2001 From: ajay-kesavan Date: Wed, 20 May 2026 14:05:44 -0700 Subject: [PATCH 03/13] feat(eval): add dataset-level evaluator framework with precision/recall/f-score MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces a new BaseDatasetEvaluator concept that runs once per evaluation set after all per-datapoint evaluators complete. It consumes per-datapoint EvaluationResultDto values from a named source evaluator and emits a single run-level EvaluationResult. Includes three starter evaluators for multiclass classification metrics: - PrecisionDatasetEvaluator - RecallDatasetEvaluator - FScoreDatasetEvaluator (configurable beta) Each takes a required classes list (populated from the UI), supports micro or macro averaging, and emits per-class TP/TN/FP/FN plus the confusion matrix in details. Binary is the 2-class case — no separate binary path. 
Architecture: BaseDatasetEvaluator is a parallel hierarchy to GenericBaseEvaluator (not a subclass) so the per-datapoint dispatch loop cannot accidentally pick up a dataset evaluator. Each dataset evaluator declares a single source_evaluator by name; the runtime groups per-datapoint results by evaluator name and routes the right list to each dataset evaluator. Configs load from {eval_set_dir}/../dataset_evaluators/*.json, mirroring the evaluators directory layout. Patch version bumped: 2.10.68 -> 2.10.69. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/uipath/pyproject.toml | 2 +- packages/uipath/src/uipath/_cli/cli_eval.py | 7 + .../eval/evaluators/base_dataset_evaluator.py | 75 ++++ .../classification_dataset_evaluators.py | 311 +++++++++++++ .../evaluators/dataset_evaluator_factory.py | 52 +++ packages/uipath/src/uipath/eval/helpers.py | 88 ++++ .../src/uipath/eval/models/evaluation_set.py | 3 + .../uipath/src/uipath/eval/models/models.py | 3 + .../uipath/src/uipath/eval/runtime/_types.py | 5 +- .../uipath/src/uipath/eval/runtime/context.py | 2 + .../uipath/src/uipath/eval/runtime/runtime.py | 50 +++ .../test_dataset_classification_evaluators.py | 411 ++++++++++++++++++ 12 files changed, 1007 insertions(+), 2 deletions(-) create mode 100644 packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py create mode 100644 packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py create mode 100644 packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py create mode 100644 packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py diff --git a/packages/uipath/pyproject.toml b/packages/uipath/pyproject.toml index 36550f54d..0d70cb383 100644 --- a/packages/uipath/pyproject.toml +++ b/packages/uipath/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath" -version = "2.10.68" +version = "2.10.69" description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, 
process management, and deployment tools." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" diff --git a/packages/uipath/src/uipath/_cli/cli_eval.py b/packages/uipath/src/uipath/_cli/cli_eval.py index e101717d6..2e35db849 100644 --- a/packages/uipath/src/uipath/_cli/cli_eval.py +++ b/packages/uipath/src/uipath/_cli/cli_eval.py @@ -412,6 +412,13 @@ async def execute_eval(): get_agent_model(eval_context.runtime_schema), ) + eval_context.dataset_evaluators = ( + await EvalHelpers.load_dataset_evaluators( + resolved_eval_set_path, + eval_context.evaluation_set, + ) + ) + # Runtime is not required anymore. await runtime.dispose() diff --git a/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py new file mode 100644 index 000000000..ae818a421 --- /dev/null +++ b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py @@ -0,0 +1,75 @@ +"""Base abstractions for dataset-level evaluators. + +A dataset-level evaluator runs once per evaluation set, after all per-datapoint +evaluators have produced their results. It consumes the per-datapoint +EvaluationResultDto values from one named source evaluator and emits a single +EvaluationResult that summarizes the dataset. + +Concretely distinct from GenericBaseEvaluator: different evaluate() signature, +different lifecycle. Kept as a parallel hierarchy rather than a subclass so +the runtime cannot accidentally dispatch a dataset evaluator through the +per-datapoint loop. 
+""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Generic, TypeVar + +from pydantic import BaseModel, ConfigDict, Field +from pydantic.alias_generators import to_camel + +from ..models.models import EvaluationResult, EvaluationResultDto + + +class BaseDatasetEvaluatorConfig(BaseModel): + """Configuration shared by all dataset-level evaluators.""" + + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + + id: str + name: str + type: str + source_evaluator: str = Field( + ..., + description=( + "Name of the per-datapoint evaluator whose EvaluationResultDto values " + "this dataset evaluator consumes." + ), + ) + + +ConfigT = TypeVar("ConfigT", bound=BaseDatasetEvaluatorConfig) + + +class BaseDatasetEvaluator(ABC, Generic[ConfigT]): + """Abstract base for dataset-level evaluators. + + Subclasses implement ``evaluate`` over the per-datapoint EvaluationResultDto + values produced by ``config.source_evaluator``. + """ + + config: ConfigT + + def __init__(self, config: ConfigT) -> None: + """Store the evaluator's configuration.""" + self.config = config + + @property + def name(self) -> str: + """Logical name of this evaluator instance (used as result-dict key).""" + return self.config.name + + @property + def source_evaluator(self) -> str: + """Name of the upstream evaluator whose results this one consumes.""" + return self.config.source_evaluator + + @classmethod + @abstractmethod + def get_evaluator_id(cls) -> str: + """Stable identifier matching the ``type`` discriminator on configs.""" + + @abstractmethod + def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult: + """Reduce per-datapoint results into a single run-level EvaluationResult.""" diff --git a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py new file mode 100644 index 000000000..272541e21 --- /dev/null 
+++ b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py @@ -0,0 +1,311 @@ +"""Dataset-level classification evaluators: Precision, Recall, F-score. + +All three share the same internal machinery — a k x k confusion matrix built +from each per-datapoint result's BaseEvaluatorJustification (expected, actual) +strings. They differ only in the final formula and (for F-score) the beta +parameter. The headline ``score`` is the micro or macro average per config; +``details`` carries the full per-class breakdown plus the confusion matrix. +""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, ConfigDict, Field +from pydantic.alias_generators import to_camel + +from ..models.models import ( + EvaluationResult, + EvaluationResultDto, + EvaluatorType, + NumericEvaluationResult, +) +from .base_dataset_evaluator import BaseDatasetEvaluator, BaseDatasetEvaluatorConfig +from .base_evaluator import BaseEvaluatorJustification + + +def _coerce_justification(details: object) -> tuple[str, str] | None: + """Extract (expected, actual) from an EvaluationResultDto.details payload.""" + if isinstance(details, BaseEvaluatorJustification): + return details.expected, details.actual + if isinstance(details, dict): + try: + j = BaseEvaluatorJustification.model_validate(details) + except Exception: + return None + return j.expected, j.actual + return None + + +class PerClassMetrics(BaseModel): + """Per-class confusion counts plus the metric the evaluator computed.""" + + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + + tp: int + tn: int + fp: int + fn: int + support: int + value: float + + +class ClassificationDetails(BaseModel): + """Structured details payload emitted by every classification evaluator.""" + + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + + metric: str + average: str + classes: list[str] + confusion_matrix: list[list[int]] + per_class: 
dict[str, PerClassMetrics] + micro: float + macro: float + n_total: int + n_scored: int + n_skipped: int + + +class _ConfusionData: + """Internal: confusion matrix and per-class counts derived from results.""" + + __slots__ = ("classes", "matrix", "n_total", "n_scored", "n_skipped") + + def __init__( + self, + classes: list[str], + matrix: list[list[int]], + n_total: int, + n_scored: int, + n_skipped: int, + ) -> None: + self.classes = classes + self.matrix = matrix + self.n_total = n_total + self.n_scored = n_scored + self.n_skipped = n_skipped + + def counts_for(self, class_index: int) -> tuple[int, int, int, int]: + """Return (tp, fp, fn, tn) for a class index.""" + k = len(self.classes) + tp = self.matrix[class_index][class_index] + fp = sum(self.matrix[class_index][j] for j in range(k)) - tp + fn = sum(self.matrix[j][class_index] for j in range(k)) - tp + tn = self.n_scored - tp - fp - fn + return tp, fp, fn, tn + + +def _build_confusion( + results: list[EvaluationResultDto], + classes: list[str], + case_sensitive: bool, +) -> _ConfusionData: + """Build a confusion matrix from per-datapoint results. + + Results without a parseable justification are counted in ``n_skipped`` and + omitted from the matrix. Pairs whose expected or actual label isn't in + ``classes`` are also skipped. 
+ """ + + def norm(label: str) -> str: + return label if case_sensitive else label.lower() + + canonical_classes = [norm(c) for c in classes] + index_of = {c: i for i, c in enumerate(canonical_classes)} + k = len(canonical_classes) + matrix = [[0] * k for _ in range(k)] + + n_total = len(results) + n_scored = 0 + n_skipped = 0 + + for r in results: + j = _coerce_justification(r.details) + if j is None: + n_skipped += 1 + continue + exp = norm(j[0]) + act = norm(j[1]) + if exp not in index_of or act not in index_of: + n_skipped += 1 + continue + matrix[index_of[act]][index_of[exp]] += 1 + n_scored += 1 + + return _ConfusionData( + classes=canonical_classes, + matrix=matrix, + n_total=n_total, + n_scored=n_scored, + n_skipped=n_skipped, + ) + + +def _precision_of(tp: int, fp: int, _fn: int, _tn: int) -> float: + return tp / (tp + fp) if (tp + fp) > 0 else 0.0 + + +def _recall_of(tp: int, _fp: int, fn: int, _tn: int) -> float: + return tp / (tp + fn) if (tp + fn) > 0 else 0.0 + + +def _f_score_of(beta: float): + beta_sq = beta * beta + + def compute(tp: int, fp: int, fn: int, _tn: int) -> float: + p = tp / (tp + fp) if (tp + fp) > 0 else 0.0 + r = tp / (tp + fn) if (tp + fn) > 0 else 0.0 + denom = beta_sq * p + r + return (1 + beta_sq) * p * r / denom if denom > 0 else 0.0 + + return compute + + +def _build_details( + confusion: _ConfusionData, + metric_name: str, + average: str, + per_class_fn, +) -> tuple[ClassificationDetails, float]: + """Compute per-class values, micro, macro, and pick the headline. + + Returns (details, headline_score). ``headline_score`` is the micro or macro + average per the evaluator's ``average`` setting. 
+ """ + per_class: dict[str, PerClassMetrics] = {} + total_tp = 0 + total_fp = 0 + total_fn = 0 + + for c, label in enumerate(confusion.classes): + tp, fp, fn, tn = confusion.counts_for(c) + total_tp += tp + total_fp += fp + total_fn += fn + per_class[label] = PerClassMetrics( + tp=tp, + tn=tn, + fp=fp, + fn=fn, + support=tp + fn, + value=per_class_fn(tp, fp, fn, tn), + ) + + micro = per_class_fn(total_tp, total_fp, total_fn, 0) + + k = len(confusion.classes) + macro = sum(per_class[c].value for c in confusion.classes) / k if k > 0 else 0.0 + + details = ClassificationDetails( + metric=metric_name, + average=average, + classes=confusion.classes, + confusion_matrix=confusion.matrix, + per_class=per_class, + micro=micro, + macro=macro, + n_total=confusion.n_total, + n_scored=confusion.n_scored, + n_skipped=confusion.n_skipped, + ) + + headline = micro if average == "micro" else macro + return details, headline + + +# ─── configs ────────────────────────────────────────────────────────────────── + + +class _BaseClassificationConfig(BaseDatasetEvaluatorConfig): + """Shared config for the three classification evaluators.""" + + classes: list[str] = Field( + ..., + min_length=1, + description="Class labels expected in the upstream evaluator's justifications.", + ) + average: Literal["micro", "macro"] = "macro" + case_sensitive: bool = False + + +class PrecisionDatasetEvaluatorConfig(_BaseClassificationConfig): + """Configuration for the dataset-level precision evaluator.""" + + type: str = EvaluatorType.DATASET_PRECISION.value + + +class RecallDatasetEvaluatorConfig(_BaseClassificationConfig): + """Configuration for the dataset-level recall evaluator.""" + + type: str = EvaluatorType.DATASET_RECALL.value + + +class FScoreDatasetEvaluatorConfig(_BaseClassificationConfig): + """Configuration for the dataset-level F-score evaluator.""" + + type: str = EvaluatorType.DATASET_F_SCORE.value + f_value: float = Field(default=1.0, gt=0, description="Beta value for F_beta.") + + +# 
─── evaluators ─────────────────────────────────────────────────────────────── + + +class PrecisionDatasetEvaluator(BaseDatasetEvaluator[PrecisionDatasetEvaluatorConfig]): + """Dataset-level precision evaluator (multiclass, micro or macro averaged).""" + + @classmethod + def get_evaluator_id(cls) -> str: + """Identifier matching the type discriminator on configs.""" + return EvaluatorType.DATASET_PRECISION.value + + def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult: + """Compute the precision report and return the headline as score.""" + confusion = _build_confusion( + results, self.config.classes, self.config.case_sensitive + ) + details, headline = _build_details( + confusion, "precision", self.config.average, _precision_of + ) + return NumericEvaluationResult(score=headline, details=details) + + +class RecallDatasetEvaluator(BaseDatasetEvaluator[RecallDatasetEvaluatorConfig]): + """Dataset-level recall evaluator (multiclass, micro or macro averaged).""" + + @classmethod + def get_evaluator_id(cls) -> str: + """Identifier matching the type discriminator on configs.""" + return EvaluatorType.DATASET_RECALL.value + + def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult: + """Compute the recall report and return the headline as score.""" + confusion = _build_confusion( + results, self.config.classes, self.config.case_sensitive + ) + details, headline = _build_details( + confusion, "recall", self.config.average, _recall_of + ) + return NumericEvaluationResult(score=headline, details=details) + + +class FScoreDatasetEvaluator(BaseDatasetEvaluator[FScoreDatasetEvaluatorConfig]): + """Dataset-level F-beta evaluator (multiclass, micro or macro averaged).""" + + @classmethod + def get_evaluator_id(cls) -> str: + """Identifier matching the type discriminator on configs.""" + return EvaluatorType.DATASET_F_SCORE.value + + def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult: + """Compute the F-beta report 
and return the headline as score.""" + confusion = _build_confusion( + results, self.config.classes, self.config.case_sensitive + ) + details, headline = _build_details( + confusion, + "f_score", + self.config.average, + _f_score_of(self.config.f_value), + ) + return NumericEvaluationResult(score=headline, details=details) diff --git a/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py new file mode 100644 index 000000000..8ba0dbe62 --- /dev/null +++ b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py @@ -0,0 +1,52 @@ +"""Factory that instantiates dataset-level evaluators from configuration.""" + +from __future__ import annotations + +from typing import Any + +from ..models.models import EvaluatorType +from .base_dataset_evaluator import BaseDatasetEvaluator +from .classification_dataset_evaluators import ( + FScoreDatasetEvaluator, + FScoreDatasetEvaluatorConfig, + PrecisionDatasetEvaluator, + PrecisionDatasetEvaluatorConfig, + RecallDatasetEvaluator, + RecallDatasetEvaluatorConfig, +) + +_EVALUATOR_REGISTRY: dict[str, type[BaseDatasetEvaluator[Any]]] = { + EvaluatorType.DATASET_PRECISION.value: PrecisionDatasetEvaluator, + EvaluatorType.DATASET_RECALL.value: RecallDatasetEvaluator, + EvaluatorType.DATASET_F_SCORE.value: FScoreDatasetEvaluator, +} + +_CONFIG_REGISTRY: dict[str, type[Any]] = { + EvaluatorType.DATASET_PRECISION.value: PrecisionDatasetEvaluatorConfig, + EvaluatorType.DATASET_RECALL.value: RecallDatasetEvaluatorConfig, + EvaluatorType.DATASET_F_SCORE.value: FScoreDatasetEvaluatorConfig, +} + + +def build_dataset_evaluator( + config_data: dict[str, Any], +) -> BaseDatasetEvaluator[Any]: + """Build a dataset evaluator instance from a parsed JSON config dict. + + Raises: + ValueError: If ``type`` is missing or unknown. 
+ """ + evaluator_type = config_data.get("type") + if not evaluator_type: + raise ValueError("Dataset evaluator config is missing required field 'type'") + + config_cls = _CONFIG_REGISTRY.get(evaluator_type) + evaluator_cls = _EVALUATOR_REGISTRY.get(evaluator_type) + if config_cls is None or evaluator_cls is None: + known = sorted(_EVALUATOR_REGISTRY.keys()) + raise ValueError( + f"Unknown dataset evaluator type '{evaluator_type}'. Known types: {known}" + ) + + config = config_cls.model_validate(config_data) + return evaluator_cls(config) diff --git a/packages/uipath/src/uipath/eval/helpers.py b/packages/uipath/src/uipath/eval/helpers.py index 8405e4a7a..fbe210a93 100644 --- a/packages/uipath/src/uipath/eval/helpers.py +++ b/packages/uipath/src/uipath/eval/helpers.py @@ -9,7 +9,9 @@ from uipath.runtime.schema import UiPathRuntimeSchema +from .evaluators.base_dataset_evaluator import BaseDatasetEvaluator from .evaluators.base_evaluator import GenericBaseEvaluator +from .evaluators.dataset_evaluator_factory import build_dataset_evaluator from .evaluators.evaluator_factory import EvaluatorFactory from .mocks._types import InputMockingStrategy, LLMMockingStrategy from .models._conversational_utils import UiPathLegacyEvalChatMessagesMapper @@ -280,6 +282,92 @@ async def load_evaluators( return evaluators + @staticmethod + async def load_dataset_evaluators( + eval_set_path: str, + evaluation_set: EvaluationSet, + ) -> list[BaseDatasetEvaluator[Any]]: + """Load dataset-level evaluators referenced by the evaluation set. + + Dataset evaluator config JSON files are expected to live under + ``/../dataset_evaluators/``, mirroring the evaluators + layout. Each config is matched to a reference by its top-level ``id``. + + Validates that every dataset evaluator's ``source_evaluator`` is one of + the per-datapoint evaluators declared on the eval set; raises if not. 
+ """ + if evaluation_set is None: + raise ValueError("eval_set cannot be None") + + dataset_ref_ids = { + ref.ref for ref in evaluation_set.dataset_evaluator_refs + } + if not dataset_ref_ids: + return [] + + dataset_dir = Path(eval_set_path).parent.parent / "dataset_evaluators" + if not dataset_dir.exists(): + raise ValueError( + f"Dataset evaluators directory not found at '{dataset_dir}', " + f"but evaluation set references dataset evaluators: " + f"{sorted(dataset_ref_ids)}" + ) + + # Build the set of per-datapoint evaluator names so we can validate + # source_evaluator references up front. + if evaluation_set.evaluator_configs: + known_evaluator_names = { + ref.ref for ref in evaluation_set.evaluator_configs + } + else: + known_evaluator_names = set(evaluation_set.evaluator_refs) + + dataset_evaluators: list[BaseDatasetEvaluator[Any]] = [] + found_ids: set[str] = set() + + for file in dataset_dir.glob("*.json"): + try: + with open(file, "r", encoding="utf-8") as f: + data = json.load(f) + except json.JSONDecodeError as e: + raise ValueError( + f"Invalid JSON in dataset evaluator file '{file}': {str(e)}." + ) from e + + evaluator_id = data.get("id") + if evaluator_id not in dataset_ref_ids: + continue + + try: + evaluator = build_dataset_evaluator(data) + except Exception as e: + raise ValueError( + f"Failed to create dataset evaluator from file '{file}': " + f"{str(e)}." + ) from e + + if ( + known_evaluator_names + and evaluator.source_evaluator not in known_evaluator_names + ): + raise ValueError( + f"Dataset evaluator '{evaluator.name}' references " + f"source_evaluator='{evaluator.source_evaluator}' which is " + f"not declared in this evaluation set. 
Known evaluators: " + f"{sorted(known_evaluator_names)}" + ) + + dataset_evaluators.append(evaluator) + found_ids.add(evaluator_id) + + missing = dataset_ref_ids - found_ids + if missing: + raise ValueError( + f"Could not find the following dataset evaluators: {missing}" + ) + + return dataset_evaluators + def get_agent_model(schema: UiPathRuntimeSchema) -> str | None: """Get agent model from the runtime schema metadata. diff --git a/packages/uipath/src/uipath/eval/models/evaluation_set.py b/packages/uipath/src/uipath/eval/models/evaluation_set.py index 22e6ce244..711fedeb9 100644 --- a/packages/uipath/src/uipath/eval/models/evaluation_set.py +++ b/packages/uipath/src/uipath/eval/models/evaluation_set.py @@ -145,6 +145,9 @@ class EvaluationSet(BaseModel): evaluator_configs: list[EvaluatorReference] = Field( default_factory=list, alias="evaluatorConfigs" ) + dataset_evaluator_refs: list[EvaluatorReference] = Field( + default_factory=list, alias="datasetEvaluatorRefs" + ) evaluations: list[EvaluationItem] = Field(default_factory=list) model_settings: list[EvaluationSetModelSettings] = Field( default_factory=list, alias="modelSettings" diff --git a/packages/uipath/src/uipath/eval/models/models.py b/packages/uipath/src/uipath/eval/models/models.py index d2dc26df9..f3c9b57e1 100644 --- a/packages/uipath/src/uipath/eval/models/models.py +++ b/packages/uipath/src/uipath/eval/models/models.py @@ -300,6 +300,9 @@ class EvaluatorType(str, Enum): TOOL_CALL_OUTPUT = "uipath-tool-call-output" BINARY_CLASSIFICATION = "uipath-binary-classification" MULTICLASS_CLASSIFICATION = "uipath-multiclass-classification" + DATASET_PRECISION = "uipath-dataset-precision" + DATASET_RECALL = "uipath-dataset-recall" + DATASET_F_SCORE = "uipath-dataset-f-score" class ToolCall(BaseModel): diff --git a/packages/uipath/src/uipath/eval/runtime/_types.py b/packages/uipath/src/uipath/eval/runtime/_types.py index 2aee5e599..fa84f0d9e 100644 --- a/packages/uipath/src/uipath/eval/runtime/_types.py +++ 
b/packages/uipath/src/uipath/eval/runtime/_types.py @@ -1,7 +1,7 @@ import logging from opentelemetry.sdk.trace import ReadableSpan -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, Field from pydantic.alias_generators import to_camel from uipath.runtime import UiPathRuntimeResult @@ -78,6 +78,9 @@ class UiPathEvalOutput(BaseModel): evaluation_set_name: str evaluation_set_results: list[UiPathEvalRunResult] + dataset_evaluator_results: dict[str, EvaluationResultDto] = Field( + default_factory=dict + ) @property def score(self) -> float: diff --git a/packages/uipath/src/uipath/eval/runtime/context.py b/packages/uipath/src/uipath/eval/runtime/context.py index b8224718c..f3b713320 100644 --- a/packages/uipath/src/uipath/eval/runtime/context.py +++ b/packages/uipath/src/uipath/eval/runtime/context.py @@ -4,6 +4,7 @@ from uipath.runtime.schema import UiPathRuntimeSchema +from ..evaluators.base_dataset_evaluator import BaseDatasetEvaluator from ..evaluators.base_evaluator import GenericBaseEvaluator from ..models.evaluation_set import EvaluationSet @@ -27,3 +28,4 @@ class UiPathEvalContext: input_overrides: dict[str, Any] | None = None resume: bool = False job_id: str | None = None + dataset_evaluators: list[BaseDatasetEvaluator[Any]] | None = None diff --git a/packages/uipath/src/uipath/eval/runtime/runtime.py b/packages/uipath/src/uipath/eval/runtime/runtime.py index 7f7614446..5cadcc527 100644 --- a/packages/uipath/src/uipath/eval/runtime/runtime.py +++ b/packages/uipath/src/uipath/eval/runtime/runtime.py @@ -45,6 +45,7 @@ from uipath.runtime.schema import UiPathRuntimeSchema from .._execution_context import ExecutionSpanCollector +from ..evaluators.base_dataset_evaluator import BaseDatasetEvaluator from ..evaluators.base_evaluator import GenericBaseEvaluator from ..evaluators.output_evaluator import OutputEvaluationCriteria from ..helpers import get_agent_model @@ -202,6 +203,43 @@ def compute_evaluator_scores( return 
final_score, agg_metrics_per_evaluator +def compute_dataset_evaluator_results( + evaluation_set_results: list[UiPathEvalRunResult], + dataset_evaluators: Iterable[BaseDatasetEvaluator[Any]], +) -> dict[str, EvaluationResultDto]: + """Run each dataset evaluator over its source evaluator's per-datapoint results. + + Args: + evaluation_set_results: Per-datapoint results from the run. + dataset_evaluators: Dataset-level evaluator instances. Each is routed to + the per-datapoint results from ``evaluator.source_evaluator``. + + Returns: + Dict mapping dataset evaluator name to its serialized EvaluationResultDto. + Dataset evaluators whose source produced no results are still invoked + with an empty list so they can emit a zeroed result. + """ + results_by_evaluator: defaultdict[str, list[EvaluationResultDto]] = defaultdict( + list + ) + for eval_run_result in evaluation_set_results: + for eval_run_result_dto in eval_run_result.evaluation_run_results: + if eval_run_result_dto.is_line_result: + continue + results_by_evaluator[eval_run_result_dto.evaluator_name].append( + eval_run_result_dto.result + ) + + dataset_results: dict[str, EvaluationResultDto] = {} + for evaluator in dataset_evaluators: + source = evaluator.source_evaluator + evaluation_result = evaluator.evaluate(results_by_evaluator.get(source, [])) + dataset_results[evaluator.name] = EvaluationResultDto.from_evaluation_result( + evaluation_result + ) + return dataset_results + + class UiPathEvalRuntime: """Specialized runtime for evaluation runs, with access to the factory.""" @@ -381,6 +419,18 @@ async def execute(self) -> UiPathRuntimeResult: evaluators, ) + # Run any dataset-level evaluators configured on the eval + # set. Each consumes the per-datapoint results from one + # named source evaluator and emits a single run-level + # EvaluationResultDto stored on UiPathEvalOutput. 
+ if self.context.dataset_evaluators: + results.dataset_evaluator_results = ( + compute_dataset_evaluator_results( + results.evaluation_set_results, + self.context.dataset_evaluators, + ) + ) + # Configure span with output and metadata await configure_eval_set_run_span( span=span, diff --git a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py new file mode 100644 index 000000000..08d81818d --- /dev/null +++ b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py @@ -0,0 +1,411 @@ +"""Tests for dataset-level classification evaluators (Precision, Recall, FScore). + +Covers the math (2-class, 3-class, micro vs macro, F-beta), edge cases +(empty input, out-of-vocab labels, malformed details), and runtime-level +routing where compute_dataset_evaluator_results selects results by name. +""" + +import uuid + +import pytest + +from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification +from uipath.eval.evaluators.classification_dataset_evaluators import ( + ClassificationDetails, + FScoreDatasetEvaluator, + FScoreDatasetEvaluatorConfig, + PrecisionDatasetEvaluator, + PrecisionDatasetEvaluatorConfig, + RecallDatasetEvaluator, + RecallDatasetEvaluatorConfig, +) +from uipath.eval.evaluators.dataset_evaluator_factory import build_dataset_evaluator +from uipath.eval.models.models import ( + EvaluationResultDto, + EvaluatorType, + NumericEvaluationResult, +) +from uipath.eval.runtime._types import ( + UiPathEvalRunResult, + UiPathEvalRunResultDto, +) +from uipath.eval.runtime.runtime import compute_dataset_evaluator_results + + +def _result( + expected: str, actual: str, score: float | None = None +) -> EvaluationResultDto: + """Build an EvaluationResultDto carrying an expected/actual justification.""" + if score is None: + score = 1.0 if expected.lower() == actual.lower() else 0.0 + justification = 
BaseEvaluatorJustification(expected=expected, actual=actual) + return EvaluationResultDto( + score=score, + details=justification.model_dump(), + ) + + +def _precision(classes: list[str], average: str = "macro") -> PrecisionDatasetEvaluator: + return PrecisionDatasetEvaluator( + PrecisionDatasetEvaluatorConfig( + id="p1", + name="precision", + source_evaluator="intent_match", + classes=classes, + average=average, # type: ignore[arg-type] + ) + ) + + +def _recall(classes: list[str], average: str = "macro") -> RecallDatasetEvaluator: + return RecallDatasetEvaluator( + RecallDatasetEvaluatorConfig( + id="r1", + name="recall", + source_evaluator="intent_match", + classes=classes, + average=average, # type: ignore[arg-type] + ) + ) + + +def _fscore( + classes: list[str], average: str = "macro", f_value: float = 1.0 +) -> FScoreDatasetEvaluator: + return FScoreDatasetEvaluator( + FScoreDatasetEvaluatorConfig( + id="f1", + name="fscore", + source_evaluator="intent_match", + classes=classes, + average=average, # type: ignore[arg-type] + f_value=f_value, + ) + ) + + +def _details(result: NumericEvaluationResult) -> ClassificationDetails: + """Type-narrowing helper for asserting on details.""" + assert isinstance(result.details, ClassificationDetails) + return result.details + + +class TestPrecisionEvaluator: + def test_empty_input_returns_zeroed_result(self) -> None: + result = _precision(["cat", "dog"]).evaluate([]) + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.0 + d = _details(result) + assert d.n_total == 0 and d.n_scored == 0 + assert d.confusion_matrix == [[0, 0], [0, 0]] + assert d.per_class["cat"].tp == 0 + assert d.per_class["cat"].tn == 0 + + def test_two_class_macro(self) -> None: + # 4 datapoints: 2 TP_yes, 1 FN_yes (predicted no), 1 FP_yes (predicted yes when expected no). 
+ results = [ + _result("yes", "yes"), + _result("yes", "yes"), + _result("yes", "no"), # FN for yes, FP for no + _result("no", "yes"), # FP for yes, FN for no + ] + result = _precision(["yes", "no"], average="macro").evaluate(results) + d = _details(result) + # precision_yes = 2 / (2 + 1) = 2/3 + # precision_no = 0 / (0 + 1) = 0 + # macro = (2/3 + 0) / 2 = 1/3 + assert d.per_class["yes"].value == pytest.approx(2 / 3) + assert d.per_class["no"].value == pytest.approx(0.0) + assert d.macro == pytest.approx((2 / 3 + 0.0) / 2) + assert result.score == pytest.approx(d.macro) + + def test_two_class_micro_equals_accuracy(self) -> None: + results = [ + _result("yes", "yes"), + _result("yes", "yes"), + _result("yes", "no"), + _result("no", "yes"), + ] + result = _precision(["yes", "no"], average="micro").evaluate(results) + d = _details(result) + # micro precision = sum(TP) / sum(TP + FP) + # sum(TP) = 2 (yes diag) + 0 (no diag) = 2 + # sum(FP) = 1 (yes off-diag row) + 1 (no off-diag row) = 2 + # micro = 2 / (2 + 2) = 0.5 — equals accuracy 2/4 in the 2-class case + assert d.micro == pytest.approx(0.5) + assert result.score == pytest.approx(0.5) + + def test_three_class_macro(self) -> None: + # Each class gets 2 TP, 1 FP, 1 FN — symmetric setup + pairs = [ + ("cat", "cat"), + ("cat", "cat"), + ("cat", "dog"), # FN_cat, FP_dog + ("dog", "dog"), + ("dog", "dog"), + ("dog", "bird"), # FN_dog, FP_bird + ("bird", "bird"), + ("bird", "bird"), + ("bird", "cat"), # FN_bird, FP_cat + ] + result = _precision(["cat", "dog", "bird"], average="macro").evaluate( + [_result(e, a) for e, a in pairs] + ) + d = _details(result) + # per-class precision = 2 / (2 + 1) = 2/3 for all three + for label in ("cat", "dog", "bird"): + m = d.per_class[label] + assert m.tp == 2 and m.fp == 1 and m.fn == 1 and m.tn == 5 + assert m.value == pytest.approx(2 / 3) + assert d.macro == pytest.approx(2 / 3) + assert result.score == pytest.approx(2 / 3) + + +class TestRecallEvaluator: + def 
test_two_class_macro(self) -> None: + results = [ + _result("yes", "yes"), + _result("yes", "yes"), + _result("yes", "no"), + _result("no", "yes"), + ] + result = _recall(["yes", "no"], average="macro").evaluate(results) + d = _details(result) + # recall_yes = TP / (TP + FN) = 2 / (2 + 1) = 2/3 + # recall_no = 0 / (0 + 1) = 0 + # macro = 1/3 + assert d.per_class["yes"].value == pytest.approx(2 / 3) + assert d.per_class["no"].value == pytest.approx(0.0) + assert result.score == pytest.approx(1 / 3) + + def test_recall_differs_from_precision(self) -> None: + # Asymmetric example so precision != recall. + results = [ + _result("yes", "yes"), # TP + _result("yes", "yes"), # TP + _result("no", "yes"), # FP for yes + _result("no", "yes"), # FP for yes + _result("no", "no"), # TP for no + ] + p = _details(_precision(["yes", "no"], average="macro").evaluate(results)) + r = _details(_recall(["yes", "no"], average="macro").evaluate(results)) + # precision_yes = 2/(2+2)=0.5, precision_no = 1/(1+0)=1.0 + assert p.per_class["yes"].value == pytest.approx(0.5) + assert p.per_class["no"].value == pytest.approx(1.0) + # recall_yes = 2/(2+0)=1.0, recall_no = 1/(1+2)=1/3 + assert r.per_class["yes"].value == pytest.approx(1.0) + assert r.per_class["no"].value == pytest.approx(1 / 3) + + +class TestFScoreEvaluator: + def test_f1_equals_harmonic_mean_of_p_and_r(self) -> None: + results = [ + _result("yes", "yes"), + _result("yes", "yes"), + _result("yes", "no"), + _result("no", "yes"), + ] + f = _details( + _fscore(["yes", "no"], average="macro", f_value=1.0).evaluate(results) + ) + # precision_yes = 2/3, recall_yes = 2/3 -> F1_yes = 2/3 + # precision_no = 0, recall_no = 0 -> F1_no = 0 + assert f.per_class["yes"].value == pytest.approx(2 / 3) + assert f.per_class["no"].value == pytest.approx(0.0) + assert f.macro == pytest.approx((2 / 3 + 0.0) / 2) + + def test_f_beta_emphasizes_recall_when_beta_above_one(self) -> None: + # Asymmetric setup: precision_yes = 0.5, recall_yes = 1.0. 
+ results = [ + _result("yes", "yes"), + _result("yes", "yes"), + _result("no", "yes"), + _result("no", "yes"), + _result("no", "no"), + ] + f1 = _details( + _fscore(["yes", "no"], average="macro", f_value=1.0).evaluate(results) + ) + f2 = _details( + _fscore(["yes", "no"], average="macro", f_value=2.0).evaluate(results) + ) + # F_beta with beta>1 weighs recall higher. Since recall_yes > precision_yes, + # F2_yes should be > F1_yes. + assert f2.per_class["yes"].value > f1.per_class["yes"].value + + def test_three_class_micro_pools_across_classes(self) -> None: + # Same symmetric setup as the precision macro test. + pairs = [ + ("cat", "cat"), + ("cat", "cat"), + ("cat", "dog"), + ("dog", "dog"), + ("dog", "dog"), + ("dog", "bird"), + ("bird", "bird"), + ("bird", "bird"), + ("bird", "cat"), + ] + d = _details( + _fscore(["cat", "dog", "bird"], average="micro", f_value=1.0).evaluate( + [_result(e, a) for e, a in pairs] + ) + ) + # micro precision == micro recall == 6/9 (accuracy when each off-diag + # contributes once to FP and once to FN globally). micro F1 = 6/9. 
+ assert d.micro == pytest.approx(6 / 9) + + +class TestSkippingAndEdgeCases: + def test_out_of_vocab_labels_are_skipped(self) -> None: + results = [ + _result("cat", "cat"), + _result("cat", "platypus"), # actual not in classes + _result("zebra", "dog"), # expected not in classes + ] + d = _details(_precision(["cat", "dog"]).evaluate(results)) + assert d.n_total == 3 and d.n_scored == 1 and d.n_skipped == 2 + + def test_results_without_justification_are_skipped(self) -> None: + results = [ + _result("cat", "cat"), + EvaluationResultDto(score=1.0, details="just a string"), + EvaluationResultDto(score=0.0, details={"unrelated": "shape"}), + ] + d = _details(_precision(["cat", "dog"]).evaluate(results)) + assert d.n_total == 3 and d.n_scored == 1 and d.n_skipped == 2 + + def test_case_insensitive_by_default(self) -> None: + results = [_result("Cat", "CAT"), _result("DOG", "dog")] + d = _details(_precision(["cat", "dog"]).evaluate(results)) + assert d.per_class["cat"].tp == 1 + assert d.per_class["dog"].tp == 1 + + +class TestFactory: + def test_builds_evaluator_from_dict(self) -> None: + config_data = { + "id": "precision_intent", + "name": "precision_intent", + "type": EvaluatorType.DATASET_PRECISION.value, + "sourceEvaluator": "intent_match", + "classes": ["yes", "no"], + "average": "macro", + } + evaluator = build_dataset_evaluator(config_data) + assert isinstance(evaluator, PrecisionDatasetEvaluator) + assert evaluator.source_evaluator == "intent_match" + assert evaluator.name == "precision_intent" + + def test_unknown_type_raises(self) -> None: + with pytest.raises(ValueError, match="Unknown dataset evaluator type"): + build_dataset_evaluator( + { + "id": "x", + "name": "x", + "type": "uipath-not-a-thing", + "sourceEvaluator": "intent_match", + "classes": ["yes", "no"], + } + ) + + def test_missing_type_raises(self) -> None: + with pytest.raises(ValueError, match="missing required field 'type'"): + build_dataset_evaluator( + { + "id": "x", + "name": "x", + 
"sourceEvaluator": "intent_match", + "classes": ["yes", "no"], + } + ) + + +class TestComputeDatasetEvaluatorResults: + """End-to-end: dataset evaluator picks results by source_evaluator name.""" + + def test_routes_to_correct_source_and_ignores_others(self) -> None: + eval_results = [ + UiPathEvalRunResult( + evaluation_name="dp1", + evaluation_run_results=[ + UiPathEvalRunResultDto( + evaluator_name="intent_match", + evaluator_id=str(uuid.uuid4()), + result=_result("yes", "yes"), + ), + UiPathEvalRunResultDto( + evaluator_name="some_other_evaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.5), + ), + ], + ), + UiPathEvalRunResult( + evaluation_name="dp2", + evaluation_run_results=[ + UiPathEvalRunResultDto( + evaluator_name="intent_match", + evaluator_id=str(uuid.uuid4()), + result=_result("yes", "no"), + ), + ], + ), + ] + + out = compute_dataset_evaluator_results( + eval_results, [_precision(["yes", "no"], average="macro")] + ) + assert set(out) == {"precision"} + dto = out["precision"] + assert isinstance(dto, EvaluationResultDto) + # The unrelated 0.5 score from some_other_evaluator must NOT be in the + # matrix — only the two intent_match results count. 
+ assert isinstance(dto.details, dict) + assert dto.details["n_scored"] == 2 + + def test_line_by_line_subresults_are_excluded(self) -> None: + eval_results = [ + UiPathEvalRunResult( + evaluation_name="dp1", + evaluation_run_results=[ + UiPathEvalRunResultDto( + evaluator_name="intent_match", + evaluator_id=str(uuid.uuid4()), + result=_result("yes", "yes"), + is_line_result=True, + ), + UiPathEvalRunResultDto( + evaluator_name="intent_match", + evaluator_id=str(uuid.uuid4()), + result=_result("no", "no"), + ), + ], + ), + ] + out = compute_dataset_evaluator_results( + eval_results, [_precision(["yes", "no"])] + ) + assert isinstance(out["precision"].details, dict) + assert out["precision"].details["n_scored"] == 1 + + def test_source_with_no_results_produces_zeroed_report(self) -> None: + eval_results = [ + UiPathEvalRunResult( + evaluation_name="dp1", + evaluation_run_results=[ + UiPathEvalRunResultDto( + evaluator_name="some_other_evaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=1.0), + ), + ], + ), + ] + out = compute_dataset_evaluator_results( + eval_results, [_precision(["yes", "no"])] + ) + dto = out["precision"] + assert dto.score == 0.0 + assert isinstance(dto.details, dict) + assert dto.details["n_scored"] == 0 From d6b7ab5566d07a9e34611358a4b7539912982936 Mon Sep 17 00:00:00 2001 From: ajay-kesavan Date: Wed, 20 May 2026 16:14:00 -0700 Subject: [PATCH 04/13] docs(eval): add runnable dataset evaluator demo + bump uv.lock for 2.10.69 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit examples/dataset_evaluators_demo.py walks the new dataset-level evaluators (Precision / Recall / F-score) through five scenarios that exercise the math end-to-end at the SDK layer: 1. Balanced 3-class — symmetric confusion matrix, macro == micro 2. Imbalanced 2-class — shows where macro and micro diverge 3. 
Same data, four metrics (Precision, Recall, F1, F2) — proves the F-beta knob actually moves per-class numbers 4. Out-of-vocab + malformed details — n_skipped surfaces, no silent drops 5. Realistic 4-class intent classifier — uneven per-class performance Each scenario prints the confusion matrix as a table, the per-class TP/TN/FP/FN + the metric, and a snippet of the wire JSON that AutoMapper will surface to the frontend. Run:: cd packages/uipath && uv run python examples/dataset_evaluators_demo.py uv.lock reflects the pyproject.toml version bump (2.10.68 -> 2.10.69) already in this PR. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../examples/dataset_evaluators_demo.py | 359 ++++++++++++++++++ packages/uipath/uv.lock | 4 +- 2 files changed, 361 insertions(+), 2 deletions(-) create mode 100644 packages/uipath/examples/dataset_evaluators_demo.py diff --git a/packages/uipath/examples/dataset_evaluators_demo.py b/packages/uipath/examples/dataset_evaluators_demo.py new file mode 100644 index 000000000..a8f80858d --- /dev/null +++ b/packages/uipath/examples/dataset_evaluators_demo.py @@ -0,0 +1,359 @@ +"""Runnable proof that the dataset-level evaluators work on realistic data. + +Five scenarios exercise the framework end-to-end at the SDK layer (no +worker, no backend). Each prints the headline score plus a confusion +matrix table, so the math is inspectable rather than a passing-test +binary signal. 
+ +Run:: + + cd packages/uipath + uv run python examples/dataset_evaluators_demo.py +""" + +from __future__ import annotations + +import json +from typing import Iterable + +from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification +from uipath.eval.evaluators.classification_dataset_evaluators import ( + ClassificationDetails, + FScoreDatasetEvaluator, + FScoreDatasetEvaluatorConfig, + PrecisionDatasetEvaluator, + PrecisionDatasetEvaluatorConfig, + RecallDatasetEvaluator, + RecallDatasetEvaluatorConfig, +) +from uipath.eval.models.models import EvaluationResultDto, NumericEvaluationResult + + +# ─── helpers ────────────────────────────────────────────────────────────────── + + +def make_result(expected: str, actual: str) -> EvaluationResultDto: + """Build a single per-datapoint EvaluationResultDto. + + Models what an upstream ExactMatch evaluator would produce after running + on one datapoint: score is 1.0 if the labels match, 0.0 otherwise, with + the expected/actual labels carried in the justification. 
+ """ + score = 1.0 if expected.lower() == actual.lower() else 0.0 + justification = BaseEvaluatorJustification(expected=expected, actual=actual) + return EvaluationResultDto(score=score, details=justification.model_dump()) + + +def materialize_pairs(pairs: Iterable[tuple[str, str]]) -> list[EvaluationResultDto]: + return [make_result(e, a) for e, a in pairs] + + +def print_header(title: str) -> None: + print() + print("═" * 78) + print(f" {title}") + print("═" * 78) + + +def print_confusion(details: ClassificationDetails) -> None: + """Pretty-print the confusion matrix as a table.""" + classes = details.classes + cell_width = max(7, max(len(c) for c in classes) + 1) + header = " " * cell_width + " │ " + " │ ".join(c.center(cell_width) for c in classes) + " │ ← expected" + print(header) + print("─" * len(header)) + for predicted_idx, predicted_label in enumerate(classes): + row_cells = [ + str(details.confusion_matrix[predicted_idx][expected_idx]).rjust(cell_width) + for expected_idx in range(len(classes)) + ] + print(predicted_label.ljust(cell_width) + " │ " + " │ ".join(row_cells) + " │") + print(" " * cell_width + "↑ predicted") + + +def print_per_class(details: ClassificationDetails) -> None: + """One-row-per-class table of TP/TN/FP/FN + the metric.""" + label_w = max(len("class"), max(len(c) for c in details.classes)) + metric = details.metric + header = f" {'class'.ljust(label_w)} │ TP TN FP FN support {metric}" + print(header) + print(" " + "─" * (len(header) - 2)) + for cls, m in details.per_class.items(): + print( + f" {cls.ljust(label_w)} │ " + f"{m.tp:>2} {m.tn:>2} {m.fp:>2} {m.fn:>2} {m.support:>7} " + f"{m.value:.3f}" + ) + + +def report( + title: str, + result: NumericEvaluationResult, + *, + show_json_tail: bool = False, +) -> None: + """Render one scenario's result block.""" + print_header(title) + assert isinstance(result.details, ClassificationDetails) + d = result.details + print( + f" metric = {d.metric} average = {d.average} " + f"score 
(headline) = {result.score:.4f}" + ) + print( + f" micro = {d.micro:.4f} macro = {d.macro:.4f} " + f"scored = {d.n_scored}/{d.n_total} skipped = {d.n_skipped}" + ) + print() + print_confusion(d) + print() + print_per_class(d) + if show_json_tail: + print() + print(" ── wire JSON (matches frontend zod schema) ──") + # Just show a snippet to keep output focused. + payload = d.model_dump(by_alias=True) + print( + " " + + json.dumps( + {k: payload[k] for k in ("metric", "average", "micro", "macro")}, + indent=2, + ).replace("\n", "\n ") + ) + + +# ─── scenarios ──────────────────────────────────────────────────────────────── + + +def scenario_1_balanced_three_class() -> None: + """Intent recognition over book/cancel/reschedule. Every class gets 2 right, 1 wrong.""" + pairs = [ + ("book", "book"), + ("book", "book"), + ("book", "cancel"), # FN_book, FP_cancel + ("cancel", "cancel"), + ("cancel", "cancel"), + ("cancel", "reschedule"), # FN_cancel, FP_reschedule + ("reschedule", "reschedule"), + ("reschedule", "reschedule"), + ("reschedule", "book"), # FN_reschedule, FP_book + ] + results = materialize_pairs(pairs) + evaluator = PrecisionDatasetEvaluator( + PrecisionDatasetEvaluatorConfig( + id="precision_intent", + name="precision_intent", + source_evaluator="intent_match", + classes=["book", "cancel", "reschedule"], + average="macro", + ) + ) + report( + "Scenario 1 — Balanced 3-class (intent recognition)\n" + " Each class: 2 TP, 1 FP, 1 FN. Symmetric setup → macro = micro = 2/3.", + evaluator.evaluate(results), + show_json_tail=True, + ) + + +def scenario_2_imbalanced_two_class() -> None: + """Rare-positive case — why macro vs micro matters. + + 20 datapoints. Only 4 are actually positive (the rare class). A weak + classifier could trivially get high accuracy by predicting "negative" + everywhere — micro precision masks that, macro doesn't. + """ + pairs: list[tuple[str, str]] = [] + # 16 true negatives where the classifier said "negative" (correct). 
+ pairs += [("negative", "negative")] * 13 + # 3 false positives — classifier hallucinated "positive" on actual negatives. + pairs += [("negative", "positive")] * 3 + # 2 true positives. + pairs += [("positive", "positive")] * 2 + # 2 false negatives — classifier missed real positives. + pairs += [("positive", "negative")] * 2 + + results = materialize_pairs(pairs) + classes = ["positive", "negative"] + + macro = PrecisionDatasetEvaluator( + PrecisionDatasetEvaluatorConfig( + id="p_macro", + name="precision (macro)", + source_evaluator="positive_match", + classes=classes, + average="macro", + ) + ) + micro = PrecisionDatasetEvaluator( + PrecisionDatasetEvaluatorConfig( + id="p_micro", + name="precision (micro)", + source_evaluator="positive_match", + classes=classes, + average="micro", + ) + ) + report( + "Scenario 2a — Imbalanced 2-class, MACRO precision\n" + " Rare positive class. Macro averages per-class, so the rare class\n" + " having precision = 2/(2+3) = 0.40 drags the score down.", + macro.evaluate(results), + ) + report( + "Scenario 2b — Same data, MICRO precision\n" + " Pools TP/FP across classes. 
In a 2-class case this equals accuracy.\n" + " Notice macro << micro — that's the bias you'd miss with micro alone.", + micro.evaluate(results), + ) + + +def scenario_3_precision_vs_recall_vs_f() -> None: + """Same dataset, three different metrics — show they diverge on asymmetric data.""" + pairs = [ + ("yes", "yes"), + ("yes", "yes"), + ("no", "yes"), # FP for yes + ("no", "yes"), # FP for yes + ("no", "no"), + ("no", "no"), + ("yes", "no"), # FN for yes + ] + results = materialize_pairs(pairs) + classes = ["yes", "no"] + + p = PrecisionDatasetEvaluator( + PrecisionDatasetEvaluatorConfig( + id="p", + name="precision", + source_evaluator="yes_match", + classes=classes, + average="macro", + ) + ) + r = RecallDatasetEvaluator( + RecallDatasetEvaluatorConfig( + id="r", + name="recall", + source_evaluator="yes_match", + classes=classes, + average="macro", + ) + ) + f1 = FScoreDatasetEvaluator( + FScoreDatasetEvaluatorConfig( + id="f1", + name="f1", + source_evaluator="yes_match", + classes=classes, + average="macro", + f_value=1.0, + ) + ) + f2 = FScoreDatasetEvaluator( + FScoreDatasetEvaluatorConfig( + id="f2", + name="f2", + source_evaluator="yes_match", + classes=classes, + average="macro", + f_value=2.0, + ) + ) + report( + "Scenario 3a — Precision on a recall-favourable dataset", + p.evaluate(results), + ) + report( + "Scenario 3b — Recall (same data — note 'yes' recall is 1.0)", + r.evaluate(results), + ) + report( + "Scenario 3c — F1 (harmonic mean of P and R)", + f1.evaluate(results), + ) + report( + "Scenario 3d — F2 (β=2 weighs recall higher — score moves toward recall)", + f2.evaluate(results), + ) + + +def scenario_4_skipped_datapoints() -> None: + """Show how malformed / out-of-vocab data is reported, not silently dropped.""" + results = [ + make_result("cat", "cat"), + make_result("dog", "dog"), + make_result("cat", "platypus"), # actual not in classes → skipped + make_result("zebra", "cat"), # expected not in classes → skipped + 
EvaluationResultDto(score=1.0, details="bare string — no justification"), + EvaluationResultDto(score=0.0, details={"unrelated": "shape"}), + ] + evaluator = PrecisionDatasetEvaluator( + PrecisionDatasetEvaluatorConfig( + id="precision_robustness", + name="precision_robustness", + source_evaluator="any_match", + classes=["cat", "dog"], + average="macro", + ) + ) + report( + "Scenario 4 — Skipped datapoints (out-of-vocab + malformed details)\n" + " 6 datapoints in, 2 scored, 4 skipped. Skip counts surface in the\n" + " report so you can tell whether a low score is a real signal or\n" + " just sparse data.", + evaluator.evaluate(results), + ) + + +def scenario_5_realistic_intent_classifier() -> None: + """A larger, more interesting 4-class dataset — uneven per-class performance.""" + pairs = [ + # 'book' is easy: classifier handles it well + *[("book", "book")] * 10, + ("book", "cancel"), + # 'cancel' is medium: a few errors + *[("cancel", "cancel")] * 6, + ("cancel", "book"), + ("cancel", "modify"), + # 'reschedule' is hard: classifier confuses it with 'modify' + ("reschedule", "reschedule"), + ("reschedule", "reschedule"), + ("reschedule", "modify"), + ("reschedule", "modify"), + # 'modify' is rare: only 2 cases, classifier gets one + ("modify", "modify"), + ("modify", "reschedule"), + ] + results = materialize_pairs(pairs) + classes = ["book", "cancel", "reschedule", "modify"] + macro_f1 = FScoreDatasetEvaluator( + FScoreDatasetEvaluatorConfig( + id="f1_4class", + name="f1_4class", + source_evaluator="intent_match", + classes=classes, + average="macro", + f_value=1.0, + ) + ) + report( + "Scenario 5 — Realistic 4-class intent classifier\n" + " Uneven per-class performance. 
Macro F1 surfaces 'reschedule' and\n" + " 'modify' weakness; micro F1 would have hidden it under 'book' wins.", + macro_f1.evaluate(results), + ) + + +def main() -> None: + scenario_1_balanced_three_class() + scenario_2_imbalanced_two_class() + scenario_3_precision_vs_recall_vs_f() + scenario_4_skipped_datapoints() + scenario_5_realistic_intent_classifier() + print() + print("Done. All scenarios computed from real evaluator code.") + + +if __name__ == "__main__": + main() diff --git a/packages/uipath/uv.lock b/packages/uipath/uv.lock index 41ae12119..19b0d047b 100644 --- a/packages/uipath/uv.lock +++ b/packages/uipath/uv.lock @@ -3,7 +3,7 @@ revision = 3 requires-python = ">=3.11" [options] -exclude-newer = "2026-05-17T17:25:34.9197064Z" +exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values. exclude-newer-span = "P2D" [options.exclude-newer-package] @@ -2552,7 +2552,7 @@ wheels = [ [[package]] name = "uipath" -version = "2.10.68" +version = "2.10.69" source = { editable = "." } dependencies = [ { name = "applicationinsights" }, From fb091e46c686da88958aa002cbfdb34527fe08ab Mon Sep 17 00:00:00 2001 From: ajay-kesavan Date: Thu, 18 Jun 2026 21:26:19 -0700 Subject: [PATCH 05/13] refactor(eval): embed aggregator specs in per-datapoint evaluator configs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pivot dataset evaluators from a separate hierarchy with source_evaluator pointers to an embedded aggregator-spec design: each per-datapoint classification evaluator's config carries a self-contained list of aggregators (precision / recall / fscore), each with its own classes, averaging, and f_value. No properties are shared up to the evaluator level — aggregators are fully self-describing. - Drop source_evaluator pointer from BaseDatasetEvaluatorConfig. - Add discriminated AggregatorSpec union (precision/recall/fscore). 
- Add aggregators field to Binary/Multiclass classification configs. - Refactor build_dataset_evaluator + compute_dataset_evaluator_results to consume aggregator specs from per-datapoint configs directly. - Drop EvaluationSet.dataset_evaluator_refs (no separate list). Co-Authored-By: Claude Opus 4.7 --- .../examples/dataset_evaluators_demo.py | 189 ++++------ packages/uipath/src/uipath/_cli/cli_eval.py | 7 - .../eval/evaluators/_aggregator_specs.py | 53 +++ .../eval/evaluators/base_dataset_evaluator.py | 67 ++-- .../binary_classification_evaluator.py | 7 + .../classification_dataset_evaluators.py | 102 ++---- .../evaluators/dataset_evaluator_factory.py | 67 ++-- .../multiclass_classification_evaluator.py | 7 + packages/uipath/src/uipath/eval/helpers.py | 88 ----- .../src/uipath/eval/models/evaluation_set.py | 3 - .../uipath/src/uipath/eval/runtime/context.py | 2 - .../uipath/src/uipath/eval/runtime/runtime.py | 63 ++-- .../test_dataset_classification_evaluators.py | 332 +++++++++++------- 13 files changed, 460 insertions(+), 527 deletions(-) create mode 100644 packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py diff --git a/packages/uipath/examples/dataset_evaluators_demo.py b/packages/uipath/examples/dataset_evaluators_demo.py index a8f80858d..2d13f3572 100644 --- a/packages/uipath/examples/dataset_evaluators_demo.py +++ b/packages/uipath/examples/dataset_evaluators_demo.py @@ -16,28 +16,27 @@ import json from typing import Iterable +from uipath.eval.evaluators._aggregator_specs import ( + FScoreAggregatorSpec, + PrecisionAggregatorSpec, + RecallAggregatorSpec, +) from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification from uipath.eval.evaluators.classification_dataset_evaluators import ( ClassificationDetails, - FScoreDatasetEvaluator, - FScoreDatasetEvaluatorConfig, - PrecisionDatasetEvaluator, - PrecisionDatasetEvaluatorConfig, - RecallDatasetEvaluator, - RecallDatasetEvaluatorConfig, ) +from 
uipath.eval.evaluators.dataset_evaluator_factory import build_dataset_evaluator from uipath.eval.models.models import EvaluationResultDto, NumericEvaluationResult - # ─── helpers ────────────────────────────────────────────────────────────────── def make_result(expected: str, actual: str) -> EvaluationResultDto: """Build a single per-datapoint EvaluationResultDto. - Models what an upstream ExactMatch evaluator would produce after running - on one datapoint: score is 1.0 if the labels match, 0.0 otherwise, with - the expected/actual labels carried in the justification. + Models what an upstream classification evaluator would produce after running + on one datapoint: score is 1.0 if the labels match, 0.0 otherwise, with the + expected/actual labels carried in the justification. """ score = 1.0 if expected.lower() == actual.lower() else 0.0 justification = BaseEvaluatorJustification(expected=expected, actual=actual) @@ -45,10 +44,12 @@ def make_result(expected: str, actual: str) -> EvaluationResultDto: def materialize_pairs(pairs: Iterable[tuple[str, str]]) -> list[EvaluationResultDto]: + """Build a list of EvaluationResultDto from (expected, actual) pairs.""" return [make_result(e, a) for e, a in pairs] def print_header(title: str) -> None: + """Print a section header banner.""" print() print("═" * 78) print(f" {title}") @@ -59,7 +60,12 @@ def print_confusion(details: ClassificationDetails) -> None: """Pretty-print the confusion matrix as a table.""" classes = details.classes cell_width = max(7, max(len(c) for c in classes) + 1) - header = " " * cell_width + " │ " + " │ ".join(c.center(cell_width) for c in classes) + " │ ← expected" + header = ( + " " * cell_width + + " │ " + + " │ ".join(c.center(cell_width) for c in classes) + + " │ ← expected" + ) print(header) print("─" * len(header)) for predicted_idx, predicted_label in enumerate(classes): @@ -111,7 +117,6 @@ def report( if show_json_tail: print() print(" ── wire JSON (matches frontend zod schema) ──") - # Just 
show a snippet to keep output focused. payload = d.model_dump(by_alias=True) print( " " @@ -130,69 +135,44 @@ def scenario_1_balanced_three_class() -> None: pairs = [ ("book", "book"), ("book", "book"), - ("book", "cancel"), # FN_book, FP_cancel + ("book", "cancel"), ("cancel", "cancel"), ("cancel", "cancel"), - ("cancel", "reschedule"), # FN_cancel, FP_reschedule + ("cancel", "reschedule"), ("reschedule", "reschedule"), ("reschedule", "reschedule"), - ("reschedule", "book"), # FN_reschedule, FP_book + ("reschedule", "book"), ] - results = materialize_pairs(pairs) - evaluator = PrecisionDatasetEvaluator( - PrecisionDatasetEvaluatorConfig( - id="precision_intent", - name="precision_intent", - source_evaluator="intent_match", - classes=["book", "cancel", "reschedule"], - average="macro", - ) + spec = PrecisionAggregatorSpec( + classes=["book", "cancel", "reschedule"], averaging="macro" ) + evaluator = build_dataset_evaluator(spec, source_evaluator="intent_match") report( "Scenario 1 — Balanced 3-class (intent recognition)\n" " Each class: 2 TP, 1 FP, 1 FN. Symmetric setup → macro = micro = 2/3.", - evaluator.evaluate(results), + evaluator.evaluate(materialize_pairs(pairs)), show_json_tail=True, ) def scenario_2_imbalanced_two_class() -> None: - """Rare-positive case — why macro vs micro matters. - - 20 datapoints. Only 4 are actually positive (the rare class). A weak - classifier could trivially get high accuracy by predicting "negative" - everywhere — micro precision masks that, macro doesn't. - """ + """Rare-positive case — why macro vs micro matters.""" pairs: list[tuple[str, str]] = [] - # 16 true negatives where the classifier said "negative" (correct). pairs += [("negative", "negative")] * 13 - # 3 false positives — classifier hallucinated "positive" on actual negatives. pairs += [("negative", "positive")] * 3 - # 2 true positives. pairs += [("positive", "positive")] * 2 - # 2 false negatives — classifier missed real positives. 
pairs += [("positive", "negative")] * 2 results = materialize_pairs(pairs) classes = ["positive", "negative"] - macro = PrecisionDatasetEvaluator( - PrecisionDatasetEvaluatorConfig( - id="p_macro", - name="precision (macro)", - source_evaluator="positive_match", - classes=classes, - average="macro", - ) + macro = build_dataset_evaluator( + PrecisionAggregatorSpec(classes=classes, averaging="macro"), + source_evaluator="positive_match", ) - micro = PrecisionDatasetEvaluator( - PrecisionDatasetEvaluatorConfig( - id="p_micro", - name="precision (micro)", - source_evaluator="positive_match", - classes=classes, - average="micro", - ) + micro = build_dataset_evaluator( + PrecisionAggregatorSpec(classes=classes, averaging="micro"), + source_evaluator="positive_match", ) report( "Scenario 2a — Imbalanced 2-class, MACRO precision\n" @@ -202,8 +182,7 @@ def scenario_2_imbalanced_two_class() -> None: ) report( "Scenario 2b — Same data, MICRO precision\n" - " Pools TP/FP across classes. In a 2-class case this equals accuracy.\n" - " Notice macro << micro — that's the bias you'd miss with micro alone.", + " Pools TP/FP across classes. 
In a 2-class case this equals accuracy.", micro.evaluate(results), ) @@ -213,69 +192,35 @@ def scenario_3_precision_vs_recall_vs_f() -> None: pairs = [ ("yes", "yes"), ("yes", "yes"), - ("no", "yes"), # FP for yes - ("no", "yes"), # FP for yes + ("no", "yes"), + ("no", "yes"), ("no", "no"), ("no", "no"), - ("yes", "no"), # FN for yes + ("yes", "no"), ] results = materialize_pairs(pairs) classes = ["yes", "no"] - p = PrecisionDatasetEvaluator( - PrecisionDatasetEvaluatorConfig( - id="p", - name="precision", + evaluators = { + "Scenario 3a — Precision on a recall-favourable dataset": build_dataset_evaluator( + PrecisionAggregatorSpec(classes=classes, averaging="macro"), source_evaluator="yes_match", - classes=classes, - average="macro", - ) - ) - r = RecallDatasetEvaluator( - RecallDatasetEvaluatorConfig( - id="r", - name="recall", + ), + "Scenario 3b — Recall (same data — note 'yes' recall is 1.0)": build_dataset_evaluator( + RecallAggregatorSpec(classes=classes, averaging="macro"), source_evaluator="yes_match", - classes=classes, - average="macro", - ) - ) - f1 = FScoreDatasetEvaluator( - FScoreDatasetEvaluatorConfig( - id="f1", - name="f1", + ), + "Scenario 3c — F1 (harmonic mean of P and R)": build_dataset_evaluator( + FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=1.0), source_evaluator="yes_match", - classes=classes, - average="macro", - f_value=1.0, - ) - ) - f2 = FScoreDatasetEvaluator( - FScoreDatasetEvaluatorConfig( - id="f2", - name="f2", + ), + "Scenario 3d — F2 (β=2 weighs recall higher — score moves toward recall)": build_dataset_evaluator( + FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=2.0), source_evaluator="yes_match", - classes=classes, - average="macro", - f_value=2.0, - ) - ) - report( - "Scenario 3a — Precision on a recall-favourable dataset", - p.evaluate(results), - ) - report( - "Scenario 3b — Recall (same data — note 'yes' recall is 1.0)", - r.evaluate(results), - ) - report( - "Scenario 3c — F1 (harmonic 
mean of P and R)", - f1.evaluate(results), - ) - report( - "Scenario 3d — F2 (β=2 weighs recall higher — score moves toward recall)", - f2.evaluate(results), - ) + ), + } + for title, evaluator in evaluators.items(): + report(title, evaluator.evaluate(results)) def scenario_4_skipped_datapoints() -> None: @@ -283,19 +228,14 @@ def scenario_4_skipped_datapoints() -> None: results = [ make_result("cat", "cat"), make_result("dog", "dog"), - make_result("cat", "platypus"), # actual not in classes → skipped - make_result("zebra", "cat"), # expected not in classes → skipped + make_result("cat", "platypus"), + make_result("zebra", "cat"), EvaluationResultDto(score=1.0, details="bare string — no justification"), EvaluationResultDto(score=0.0, details={"unrelated": "shape"}), ] - evaluator = PrecisionDatasetEvaluator( - PrecisionDatasetEvaluatorConfig( - id="precision_robustness", - name="precision_robustness", - source_evaluator="any_match", - classes=["cat", "dog"], - average="macro", - ) + evaluator = build_dataset_evaluator( + PrecisionAggregatorSpec(classes=["cat", "dog"], averaging="macro"), + source_evaluator="any_match", ) report( "Scenario 4 — Skipped datapoints (out-of-vocab + malformed details)\n" @@ -309,33 +249,23 @@ def scenario_4_skipped_datapoints() -> None: def scenario_5_realistic_intent_classifier() -> None: """A larger, more interesting 4-class dataset — uneven per-class performance.""" pairs = [ - # 'book' is easy: classifier handles it well *[("book", "book")] * 10, ("book", "cancel"), - # 'cancel' is medium: a few errors *[("cancel", "cancel")] * 6, ("cancel", "book"), ("cancel", "modify"), - # 'reschedule' is hard: classifier confuses it with 'modify' ("reschedule", "reschedule"), ("reschedule", "reschedule"), ("reschedule", "modify"), ("reschedule", "modify"), - # 'modify' is rare: only 2 cases, classifier gets one ("modify", "modify"), ("modify", "reschedule"), ] results = materialize_pairs(pairs) classes = ["book", "cancel", "reschedule", 
"modify"] - macro_f1 = FScoreDatasetEvaluator( - FScoreDatasetEvaluatorConfig( - id="f1_4class", - name="f1_4class", - source_evaluator="intent_match", - classes=classes, - average="macro", - f_value=1.0, - ) + macro_f1 = build_dataset_evaluator( + FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=1.0), + source_evaluator="intent_match", ) report( "Scenario 5 — Realistic 4-class intent classifier\n" @@ -346,6 +276,7 @@ def scenario_5_realistic_intent_classifier() -> None: def main() -> None: + """Run every scenario sequentially.""" scenario_1_balanced_three_class() scenario_2_imbalanced_two_class() scenario_3_precision_vs_recall_vs_f() diff --git a/packages/uipath/src/uipath/_cli/cli_eval.py b/packages/uipath/src/uipath/_cli/cli_eval.py index 2e35db849..e101717d6 100644 --- a/packages/uipath/src/uipath/_cli/cli_eval.py +++ b/packages/uipath/src/uipath/_cli/cli_eval.py @@ -412,13 +412,6 @@ async def execute_eval(): get_agent_model(eval_context.runtime_schema), ) - eval_context.dataset_evaluators = ( - await EvalHelpers.load_dataset_evaluators( - resolved_eval_set_path, - eval_context.evaluation_set, - ) - ) - # Runtime is not required anymore. await runtime.dispose() diff --git a/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py b/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py new file mode 100644 index 000000000..fde129506 --- /dev/null +++ b/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py @@ -0,0 +1,53 @@ +"""Aggregator specs embedded in per-datapoint classification evaluator configs. + +Each aggregator is a self-contained run-level metric (precision / recall / +f-score) attached to a classification evaluator. Specs do not share any +properties — each variant declares its own ``classes``, ``averaging``, and +(for fscore) ``f_value`` independently. This keeps each aggregator's contract +explicit at the JSON level: nothing is hoisted up to the evaluator and silently +applied to siblings. 
+""" + +from __future__ import annotations + +from typing import Annotated, Literal, Union + +from pydantic import BaseModel, ConfigDict, Field +from pydantic.alias_generators import to_camel + + +class PrecisionAggregatorSpec(BaseModel): + """Run-level precision aggregator (multiclass, micro or macro averaged).""" + + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + + type: Literal["precision"] = "precision" + classes: list[str] = Field(..., min_length=1) + averaging: Literal["macro", "micro"] + + +class RecallAggregatorSpec(BaseModel): + """Run-level recall aggregator (multiclass, micro or macro averaged).""" + + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + + type: Literal["recall"] = "recall" + classes: list[str] = Field(..., min_length=1) + averaging: Literal["macro", "micro"] + + +class FScoreAggregatorSpec(BaseModel): + """Run-level F-beta aggregator (multiclass, micro or macro averaged).""" + + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + + type: Literal["fscore"] = "fscore" + classes: list[str] = Field(..., min_length=1) + averaging: Literal["macro", "micro"] + f_value: float = Field(default=1.0, gt=0) + + +AggregatorSpec = Annotated[ + Union[PrecisionAggregatorSpec, RecallAggregatorSpec, FScoreAggregatorSpec], + Field(discriminator="type"), +] diff --git a/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py index ae818a421..dcb33cc78 100644 --- a/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py +++ b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py @@ -5,9 +5,15 @@ EvaluationResultDto values from one named source evaluator and emits a single EvaluationResult that summarizes the dataset. +Unlike the earlier pointer-style design, dataset evaluators no longer carry +their own JSON config or a ``source_evaluator`` field. 
They are constructed by +the factory directly from an :class:`AggregatorSpec` embedded in a per-datapoint +classification evaluator's config, together with the source evaluator's name +which is supplied externally by the runtime when walking those configs. + Concretely distinct from GenericBaseEvaluator: different evaluate() signature, -different lifecycle. Kept as a parallel hierarchy rather than a subclass so -the runtime cannot accidentally dispatch a dataset evaluator through the +different lifecycle. Kept as a parallel hierarchy rather than a subclass so the +runtime cannot accidentally dispatch a dataset evaluator through the per-datapoint loop. """ @@ -16,59 +22,44 @@ from abc import ABC, abstractmethod from typing import Generic, TypeVar -from pydantic import BaseModel, ConfigDict, Field -from pydantic.alias_generators import to_camel - from ..models.models import EvaluationResult, EvaluationResultDto +from ._aggregator_specs import AggregatorSpec - -class BaseDatasetEvaluatorConfig(BaseModel): - """Configuration shared by all dataset-level evaluators.""" - - model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) - - id: str - name: str - type: str - source_evaluator: str = Field( - ..., - description=( - "Name of the per-datapoint evaluator whose EvaluationResultDto values " - "this dataset evaluator consumes." - ), - ) - - -ConfigT = TypeVar("ConfigT", bound=BaseDatasetEvaluatorConfig) +SpecT = TypeVar("SpecT", bound="AggregatorSpec") -class BaseDatasetEvaluator(ABC, Generic[ConfigT]): +class BaseDatasetEvaluator(ABC, Generic[SpecT]): """Abstract base for dataset-level evaluators. - Subclasses implement ``evaluate`` over the per-datapoint EvaluationResultDto - values produced by ``config.source_evaluator``. + Constructed from an :class:`AggregatorSpec` and the name of the source + per-datapoint evaluator whose results this aggregator consumes. 
The + dataset evaluator's "name" used for result keying is derived from + ``"{source_evaluator}.{spec.type}"`` so two aggregators on the same source + don't collide. """ - config: ConfigT + spec: SpecT + _source_evaluator: str - def __init__(self, config: ConfigT) -> None: - """Store the evaluator's configuration.""" - self.config = config - - @property - def name(self) -> str: - """Logical name of this evaluator instance (used as result-dict key).""" - return self.config.name + def __init__(self, spec: SpecT, source_evaluator: str) -> None: + """Store the aggregator spec and the source evaluator name.""" + self.spec = spec + self._source_evaluator = source_evaluator @property def source_evaluator(self) -> str: """Name of the upstream evaluator whose results this one consumes.""" - return self.config.source_evaluator + return self._source_evaluator + + @property + def name(self) -> str: + """Stable key for this dataset evaluator's result in the output map.""" + return f"{self._source_evaluator}.{self.spec.type}" @classmethod @abstractmethod def get_evaluator_id(cls) -> str: - """Stable identifier matching the ``type`` discriminator on configs.""" + """Stable identifier matching the ``type`` discriminator on specs.""" @abstractmethod def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult: diff --git a/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py index d56509228..0a65c2c64 100644 --- a/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py +++ b/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py @@ -19,6 +19,7 @@ UiPathEvaluationError, UiPathEvaluationErrorCategory, ) +from ._aggregator_specs import AggregatorSpec from .base_evaluator import BaseEvaluationCriteria, BaseEvaluatorJustification from .output_evaluator import ( BaseOutputEvaluator, @@ -41,6 +42,12 @@ class 
BinaryClassificationEvaluatorConfig( positive_class: str metric_type: Literal["precision", "recall", "f-score"] = "precision" f_value: float = 1.0 + # Optional run-level aggregators (precision / recall / fscore). Each is a + # self-contained spec carrying its own ``classes``, ``averaging``, and + # (for fscore) ``f_value``. The dataset-evaluator runtime walks this list + # after all per-datapoint evaluators complete and emits one structured + # result per aggregator keyed by ``{evaluator_name}.{aggregator.type}``. + aggregators: list[AggregatorSpec] | None = None class BinaryClassificationEvaluator( diff --git a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py index 272541e21..b15020c25 100644 --- a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py +++ b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py @@ -3,15 +3,14 @@ All three share the same internal machinery — a k x k confusion matrix built from each per-datapoint result's BaseEvaluatorJustification (expected, actual) strings. They differ only in the final formula and (for F-score) the beta -parameter. The headline ``score`` is the micro or macro average per config; -``details`` carries the full per-class breakdown plus the confusion matrix. +parameter. The headline ``score`` is the micro or macro average per the +embedded :class:`AggregatorSpec`; ``details`` carries the full per-class +breakdown plus the confusion matrix. 
""" from __future__ import annotations -from typing import Literal - -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict from pydantic.alias_generators import to_camel from ..models.models import ( @@ -20,7 +19,12 @@ EvaluatorType, NumericEvaluationResult, ) -from .base_dataset_evaluator import BaseDatasetEvaluator, BaseDatasetEvaluatorConfig +from ._aggregator_specs import ( + FScoreAggregatorSpec, + PrecisionAggregatorSpec, + RecallAggregatorSpec, +) +from .base_dataset_evaluator import BaseDatasetEvaluator from .base_evaluator import BaseEvaluatorJustification @@ -99,19 +103,15 @@ def counts_for(self, class_index: int) -> tuple[int, int, int, int]: def _build_confusion( results: list[EvaluationResultDto], classes: list[str], - case_sensitive: bool, ) -> _ConfusionData: """Build a confusion matrix from per-datapoint results. Results without a parseable justification are counted in ``n_skipped`` and omitted from the matrix. Pairs whose expected or actual label isn't in - ``classes`` are also skipped. + ``classes`` are also skipped. Labels are normalized to lowercase so a + classifier returning "Book" vs configured "book" still matches. """ - - def norm(label: str) -> str: - return label if case_sensitive else label.lower() - - canonical_classes = [norm(c) for c in classes] + canonical_classes = [c.lower() for c in classes] index_of = {c: i for i, c in enumerate(canonical_classes)} k = len(canonical_classes) matrix = [[0] * k for _ in range(k)] @@ -125,8 +125,8 @@ def norm(label: str) -> str: if j is None: n_skipped += 1 continue - exp = norm(j[0]) - act = norm(j[1]) + exp = j[0].lower() + act = j[1].lower() if exp not in index_of or act not in index_of: n_skipped += 1 continue @@ -168,11 +168,7 @@ def _build_details( average: str, per_class_fn, ) -> tuple[ClassificationDetails, float]: - """Compute per-class values, micro, macro, and pick the headline. - - Returns (details, headline_score). 
``headline_score`` is the micro or macro - average per the evaluator's ``average`` setting. - """ + """Compute per-class values, micro, macro, and pick the headline.""" per_class: dict[str, PerClassMetrics] = {} total_tp = 0 total_fp = 0 @@ -214,98 +210,58 @@ def _build_details( return details, headline -# ─── configs ────────────────────────────────────────────────────────────────── - - -class _BaseClassificationConfig(BaseDatasetEvaluatorConfig): - """Shared config for the three classification evaluators.""" - - classes: list[str] = Field( - ..., - min_length=1, - description="Class labels expected in the upstream evaluator's justifications.", - ) - average: Literal["micro", "macro"] = "macro" - case_sensitive: bool = False - - -class PrecisionDatasetEvaluatorConfig(_BaseClassificationConfig): - """Configuration for the dataset-level precision evaluator.""" - - type: str = EvaluatorType.DATASET_PRECISION.value - - -class RecallDatasetEvaluatorConfig(_BaseClassificationConfig): - """Configuration for the dataset-level recall evaluator.""" - - type: str = EvaluatorType.DATASET_RECALL.value - - -class FScoreDatasetEvaluatorConfig(_BaseClassificationConfig): - """Configuration for the dataset-level F-score evaluator.""" - - type: str = EvaluatorType.DATASET_F_SCORE.value - f_value: float = Field(default=1.0, gt=0, description="Beta value for F_beta.") - - # ─── evaluators ─────────────────────────────────────────────────────────────── -class PrecisionDatasetEvaluator(BaseDatasetEvaluator[PrecisionDatasetEvaluatorConfig]): +class PrecisionDatasetEvaluator(BaseDatasetEvaluator[PrecisionAggregatorSpec]): """Dataset-level precision evaluator (multiclass, micro or macro averaged).""" @classmethod def get_evaluator_id(cls) -> str: - """Identifier matching the type discriminator on configs.""" + """Identifier matching the type discriminator on specs.""" return EvaluatorType.DATASET_PRECISION.value def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult: 
"""Compute the precision report and return the headline as score.""" - confusion = _build_confusion( - results, self.config.classes, self.config.case_sensitive - ) + confusion = _build_confusion(results, self.spec.classes) details, headline = _build_details( - confusion, "precision", self.config.average, _precision_of + confusion, "precision", self.spec.averaging, _precision_of ) return NumericEvaluationResult(score=headline, details=details) -class RecallDatasetEvaluator(BaseDatasetEvaluator[RecallDatasetEvaluatorConfig]): +class RecallDatasetEvaluator(BaseDatasetEvaluator[RecallAggregatorSpec]): """Dataset-level recall evaluator (multiclass, micro or macro averaged).""" @classmethod def get_evaluator_id(cls) -> str: - """Identifier matching the type discriminator on configs.""" + """Identifier matching the type discriminator on specs.""" return EvaluatorType.DATASET_RECALL.value def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult: """Compute the recall report and return the headline as score.""" - confusion = _build_confusion( - results, self.config.classes, self.config.case_sensitive - ) + confusion = _build_confusion(results, self.spec.classes) details, headline = _build_details( - confusion, "recall", self.config.average, _recall_of + confusion, "recall", self.spec.averaging, _recall_of ) return NumericEvaluationResult(score=headline, details=details) -class FScoreDatasetEvaluator(BaseDatasetEvaluator[FScoreDatasetEvaluatorConfig]): +class FScoreDatasetEvaluator(BaseDatasetEvaluator[FScoreAggregatorSpec]): """Dataset-level F-beta evaluator (multiclass, micro or macro averaged).""" @classmethod def get_evaluator_id(cls) -> str: - """Identifier matching the type discriminator on configs.""" + """Identifier matching the type discriminator on specs.""" return EvaluatorType.DATASET_F_SCORE.value def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult: """Compute the F-beta report and return the headline as score.""" - 
confusion = _build_confusion( - results, self.config.classes, self.config.case_sensitive - ) + confusion = _build_confusion(results, self.spec.classes) details, headline = _build_details( confusion, "f_score", - self.config.average, - _f_score_of(self.config.f_value), + self.spec.averaging, + _f_score_of(self.spec.f_value), ) return NumericEvaluationResult(score=headline, details=details) diff --git a/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py index 8ba0dbe62..d597b9085 100644 --- a/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py +++ b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py @@ -1,52 +1,61 @@ -"""Factory that instantiates dataset-level evaluators from configuration.""" +"""Factory that instantiates dataset-level evaluators from aggregator specs. + +Dataset evaluators are now built from a self-contained :class:`AggregatorSpec` +embedded in a per-datapoint classification evaluator's config, plus the source +evaluator's name (supplied by the runtime when walking those configs). The +factory inspects the spec's ``type`` discriminator and returns the matching +evaluator instance. 
+""" from __future__ import annotations from typing import Any -from ..models.models import EvaluatorType +from ._aggregator_specs import ( + AggregatorSpec, + FScoreAggregatorSpec, + PrecisionAggregatorSpec, + RecallAggregatorSpec, +) from .base_dataset_evaluator import BaseDatasetEvaluator from .classification_dataset_evaluators import ( FScoreDatasetEvaluator, - FScoreDatasetEvaluatorConfig, PrecisionDatasetEvaluator, - PrecisionDatasetEvaluatorConfig, RecallDatasetEvaluator, - RecallDatasetEvaluatorConfig, ) _EVALUATOR_REGISTRY: dict[str, type[BaseDatasetEvaluator[Any]]] = { - EvaluatorType.DATASET_PRECISION.value: PrecisionDatasetEvaluator, - EvaluatorType.DATASET_RECALL.value: RecallDatasetEvaluator, - EvaluatorType.DATASET_F_SCORE.value: FScoreDatasetEvaluator, -} - -_CONFIG_REGISTRY: dict[str, type[Any]] = { - EvaluatorType.DATASET_PRECISION.value: PrecisionDatasetEvaluatorConfig, - EvaluatorType.DATASET_RECALL.value: RecallDatasetEvaluatorConfig, - EvaluatorType.DATASET_F_SCORE.value: FScoreDatasetEvaluatorConfig, + "precision": PrecisionDatasetEvaluator, + "recall": RecallDatasetEvaluator, + "fscore": FScoreDatasetEvaluator, } def build_dataset_evaluator( - config_data: dict[str, Any], + spec: AggregatorSpec, + source_evaluator: str, ) -> BaseDatasetEvaluator[Any]: - """Build a dataset evaluator instance from a parsed JSON config dict. + """Build a dataset evaluator instance from an aggregator spec. + + Args: + spec: A validated :class:`AggregatorSpec` (precision / recall / fscore). + source_evaluator: Name of the per-datapoint evaluator whose results + this aggregator consumes. Raises: - ValueError: If ``type`` is missing or unknown. + ValueError: If ``spec.type`` doesn't match any known aggregator. 
""" - evaluator_type = config_data.get("type") - if not evaluator_type: - raise ValueError("Dataset evaluator config is missing required field 'type'") - - config_cls = _CONFIG_REGISTRY.get(evaluator_type) - evaluator_cls = _EVALUATOR_REGISTRY.get(evaluator_type) - if config_cls is None or evaluator_cls is None: + evaluator_cls = _EVALUATOR_REGISTRY.get(spec.type) + if evaluator_cls is None: known = sorted(_EVALUATOR_REGISTRY.keys()) - raise ValueError( - f"Unknown dataset evaluator type '{evaluator_type}'. Known types: {known}" - ) + raise ValueError(f"Unknown aggregator type '{spec.type}'. Known types: {known}") + return evaluator_cls(spec, source_evaluator) + - config = config_cls.model_validate(config_data) - return evaluator_cls(config) +__all__ = [ + "AggregatorSpec", + "PrecisionAggregatorSpec", + "RecallAggregatorSpec", + "FScoreAggregatorSpec", + "build_dataset_evaluator", +] diff --git a/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py index 69790c3aa..842d13174 100644 --- a/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py +++ b/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py @@ -20,6 +20,7 @@ UiPathEvaluationError, UiPathEvaluationErrorCategory, ) +from ._aggregator_specs import AggregatorSpec from .base_evaluator import BaseEvaluationCriteria, BaseEvaluatorJustification from .output_evaluator import ( BaseOutputEvaluator, @@ -43,6 +44,12 @@ class MulticlassClassificationEvaluatorConfig( metric_type: Literal["precision", "recall", "f-score"] = "f-score" averaging: Literal["micro", "macro"] = "macro" f_value: float = 1.0 + # Optional run-level aggregators (precision / recall / fscore). Each is a + # self-contained spec carrying its own ``classes``, ``averaging``, and + # (for fscore) ``f_value``. 
The dataset-evaluator runtime walks this list + # after all per-datapoint evaluators complete and emits one structured + # result per aggregator keyed by ``{evaluator_name}.{aggregator.type}``. + aggregators: list[AggregatorSpec] | None = None class MulticlassClassificationEvaluator( diff --git a/packages/uipath/src/uipath/eval/helpers.py b/packages/uipath/src/uipath/eval/helpers.py index fbe210a93..8405e4a7a 100644 --- a/packages/uipath/src/uipath/eval/helpers.py +++ b/packages/uipath/src/uipath/eval/helpers.py @@ -9,9 +9,7 @@ from uipath.runtime.schema import UiPathRuntimeSchema -from .evaluators.base_dataset_evaluator import BaseDatasetEvaluator from .evaluators.base_evaluator import GenericBaseEvaluator -from .evaluators.dataset_evaluator_factory import build_dataset_evaluator from .evaluators.evaluator_factory import EvaluatorFactory from .mocks._types import InputMockingStrategy, LLMMockingStrategy from .models._conversational_utils import UiPathLegacyEvalChatMessagesMapper @@ -282,92 +280,6 @@ async def load_evaluators( return evaluators - @staticmethod - async def load_dataset_evaluators( - eval_set_path: str, - evaluation_set: EvaluationSet, - ) -> list[BaseDatasetEvaluator[Any]]: - """Load dataset-level evaluators referenced by the evaluation set. - - Dataset evaluator config JSON files are expected to live under - ``/../dataset_evaluators/``, mirroring the evaluators - layout. Each config is matched to a reference by its top-level ``id``. - - Validates that every dataset evaluator's ``source_evaluator`` is one of - the per-datapoint evaluators declared on the eval set; raises if not. 
- """ - if evaluation_set is None: - raise ValueError("eval_set cannot be None") - - dataset_ref_ids = { - ref.ref for ref in evaluation_set.dataset_evaluator_refs - } - if not dataset_ref_ids: - return [] - - dataset_dir = Path(eval_set_path).parent.parent / "dataset_evaluators" - if not dataset_dir.exists(): - raise ValueError( - f"Dataset evaluators directory not found at '{dataset_dir}', " - f"but evaluation set references dataset evaluators: " - f"{sorted(dataset_ref_ids)}" - ) - - # Build the set of per-datapoint evaluator names so we can validate - # source_evaluator references up front. - if evaluation_set.evaluator_configs: - known_evaluator_names = { - ref.ref for ref in evaluation_set.evaluator_configs - } - else: - known_evaluator_names = set(evaluation_set.evaluator_refs) - - dataset_evaluators: list[BaseDatasetEvaluator[Any]] = [] - found_ids: set[str] = set() - - for file in dataset_dir.glob("*.json"): - try: - with open(file, "r", encoding="utf-8") as f: - data = json.load(f) - except json.JSONDecodeError as e: - raise ValueError( - f"Invalid JSON in dataset evaluator file '{file}': {str(e)}." - ) from e - - evaluator_id = data.get("id") - if evaluator_id not in dataset_ref_ids: - continue - - try: - evaluator = build_dataset_evaluator(data) - except Exception as e: - raise ValueError( - f"Failed to create dataset evaluator from file '{file}': " - f"{str(e)}." - ) from e - - if ( - known_evaluator_names - and evaluator.source_evaluator not in known_evaluator_names - ): - raise ValueError( - f"Dataset evaluator '{evaluator.name}' references " - f"source_evaluator='{evaluator.source_evaluator}' which is " - f"not declared in this evaluation set. 
Known evaluators: " - f"{sorted(known_evaluator_names)}" - ) - - dataset_evaluators.append(evaluator) - found_ids.add(evaluator_id) - - missing = dataset_ref_ids - found_ids - if missing: - raise ValueError( - f"Could not find the following dataset evaluators: {missing}" - ) - - return dataset_evaluators - def get_agent_model(schema: UiPathRuntimeSchema) -> str | None: """Get agent model from the runtime schema metadata. diff --git a/packages/uipath/src/uipath/eval/models/evaluation_set.py b/packages/uipath/src/uipath/eval/models/evaluation_set.py index 74c822595..c80da8e14 100644 --- a/packages/uipath/src/uipath/eval/models/evaluation_set.py +++ b/packages/uipath/src/uipath/eval/models/evaluation_set.py @@ -173,9 +173,6 @@ class EvaluationSet(BaseModel): evaluator_configs: list[EvaluatorReference] = Field( default_factory=list, alias="evaluatorConfigs" ) - dataset_evaluator_refs: list[EvaluatorReference] = Field( - default_factory=list, alias="datasetEvaluatorRefs" - ) evaluations: list[EvaluationItem] = Field(default_factory=list) model_settings: list[EvaluationSetModelSettings] = Field( default_factory=list, alias="modelSettings" diff --git a/packages/uipath/src/uipath/eval/runtime/context.py b/packages/uipath/src/uipath/eval/runtime/context.py index f3b713320..b8224718c 100644 --- a/packages/uipath/src/uipath/eval/runtime/context.py +++ b/packages/uipath/src/uipath/eval/runtime/context.py @@ -4,7 +4,6 @@ from uipath.runtime.schema import UiPathRuntimeSchema -from ..evaluators.base_dataset_evaluator import BaseDatasetEvaluator from ..evaluators.base_evaluator import GenericBaseEvaluator from ..models.evaluation_set import EvaluationSet @@ -28,4 +27,3 @@ class UiPathEvalContext: input_overrides: dict[str, Any] | None = None resume: bool = False job_id: str | None = None - dataset_evaluators: list[BaseDatasetEvaluator[Any]] | None = None diff --git a/packages/uipath/src/uipath/eval/runtime/runtime.py b/packages/uipath/src/uipath/eval/runtime/runtime.py index 
5cadcc527..c64f8f158 100644 --- a/packages/uipath/src/uipath/eval/runtime/runtime.py +++ b/packages/uipath/src/uipath/eval/runtime/runtime.py @@ -45,8 +45,8 @@ from uipath.runtime.schema import UiPathRuntimeSchema from .._execution_context import ExecutionSpanCollector -from ..evaluators.base_dataset_evaluator import BaseDatasetEvaluator from ..evaluators.base_evaluator import GenericBaseEvaluator +from ..evaluators.dataset_evaluator_factory import build_dataset_evaluator from ..evaluators.output_evaluator import OutputEvaluationCriteria from ..helpers import get_agent_model from ..mocks._cache_manager import CacheManager @@ -205,19 +205,24 @@ def compute_evaluator_scores( def compute_dataset_evaluator_results( evaluation_set_results: list[UiPathEvalRunResult], - dataset_evaluators: Iterable[BaseDatasetEvaluator[Any]], + evaluators: Iterable[GenericBaseEvaluator[Any, Any, Any]], ) -> dict[str, EvaluationResultDto]: - """Run each dataset evaluator over its source evaluator's per-datapoint results. + """Run any dataset-level aggregators embedded in per-datapoint evaluator configs. + + Walks ``evaluators`` looking for any whose config carries an ``aggregators`` + list (currently only Binary/Multiclass classification). For each aggregator + spec, builds the corresponding dataset evaluator via the factory and runs it + over the per-datapoint results that came from that source evaluator. Args: evaluation_set_results: Per-datapoint results from the run. - dataset_evaluators: Dataset-level evaluator instances. Each is routed to - the per-datapoint results from ``evaluator.source_evaluator``. + evaluators: Per-datapoint evaluator instances that ran during this eval + set. Their configs may carry ``aggregators`` lists. Returns: - Dict mapping dataset evaluator name to its serialized EvaluationResultDto. - Dataset evaluators whose source produced no results are still invoked - with an empty list so they can emit a zeroed result. 
+ Dict mapping ``"{evaluator_name}.{aggregator_type}"`` to the run-level + EvaluationResultDto. Aggregators whose source produced no results are + still invoked with an empty list so they emit a zeroed result. """ results_by_evaluator: defaultdict[str, list[EvaluationResultDto]] = defaultdict( list @@ -231,12 +236,21 @@ def compute_dataset_evaluator_results( ) dataset_results: dict[str, EvaluationResultDto] = {} - for evaluator in dataset_evaluators: - source = evaluator.source_evaluator - evaluation_result = evaluator.evaluate(results_by_evaluator.get(source, [])) - dataset_results[evaluator.name] = EvaluationResultDto.from_evaluation_result( - evaluation_result - ) + for evaluator in evaluators: + evaluator_config = getattr(evaluator, "evaluator_config", None) + if evaluator_config is None: + continue + aggregators = getattr(evaluator_config, "aggregators", None) + if not aggregators: + continue + source_name = evaluator_config.name + source_results = results_by_evaluator.get(source_name, []) + for spec in aggregators: + dataset_evaluator = build_dataset_evaluator(spec, source_name) + evaluation_result = dataset_evaluator.evaluate(source_results) + dataset_results[dataset_evaluator.name] = ( + EvaluationResultDto.from_evaluation_result(evaluation_result) + ) return dataset_results @@ -419,17 +433,18 @@ async def execute(self) -> UiPathRuntimeResult: evaluators, ) - # Run any dataset-level evaluators configured on the eval - # set. Each consumes the per-datapoint results from one - # named source evaluator and emits a single run-level - # EvaluationResultDto stored on UiPathEvalOutput. - if self.context.dataset_evaluators: - results.dataset_evaluator_results = ( - compute_dataset_evaluator_results( - results.evaluation_set_results, - self.context.dataset_evaluators, - ) + # Run any dataset-level aggregators embedded in per-datapoint + # classification evaluator configs (the ``aggregators`` list). 
+ # Each aggregator consumes per-datapoint results from its + # parent evaluator and emits one run-level EvaluationResultDto + # keyed ``{evaluator_name}.{aggregator_type}`` on + # UiPathEvalOutput.dataset_evaluator_results. + results.dataset_evaluator_results = ( + compute_dataset_evaluator_results( + results.evaluation_set_results, + evaluators, ) + ) # Configure span with output and metadata await configure_eval_set_run_span( diff --git a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py index 08d81818d..53e1e9855 100644 --- a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py +++ b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py @@ -1,28 +1,34 @@ """Tests for dataset-level classification evaluators (Precision, Recall, FScore). Covers the math (2-class, 3-class, micro vs macro, F-beta), edge cases -(empty input, out-of-vocab labels, malformed details), and runtime-level -routing where compute_dataset_evaluator_results selects results by name. +(empty input, out-of-vocab labels, malformed details), factory dispatch, and +runtime-level routing where compute_dataset_evaluator_results walks +per-datapoint evaluator configs' embedded ``aggregators`` lists. 
""" import uuid import pytest +from pydantic import BaseModel +from uipath.eval.evaluators._aggregator_specs import ( + FScoreAggregatorSpec, + PrecisionAggregatorSpec, + RecallAggregatorSpec, +) from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification from uipath.eval.evaluators.classification_dataset_evaluators import ( ClassificationDetails, FScoreDatasetEvaluator, - FScoreDatasetEvaluatorConfig, PrecisionDatasetEvaluator, - PrecisionDatasetEvaluatorConfig, RecallDatasetEvaluator, - RecallDatasetEvaluatorConfig, ) from uipath.eval.evaluators.dataset_evaluator_factory import build_dataset_evaluator +from uipath.eval.evaluators.multiclass_classification_evaluator import ( + MulticlassClassificationEvaluator, +) from uipath.eval.models.models import ( EvaluationResultDto, - EvaluatorType, NumericEvaluationResult, ) from uipath.eval.runtime._types import ( @@ -45,51 +51,54 @@ def _result( ) -def _precision(classes: list[str], average: str = "macro") -> PrecisionDatasetEvaluator: - return PrecisionDatasetEvaluator( - PrecisionDatasetEvaluatorConfig( - id="p1", - name="precision", - source_evaluator="intent_match", - classes=classes, - average=average, # type: ignore[arg-type] - ) - ) +def _precision( + classes: list[str], averaging: str = "macro" +) -> PrecisionDatasetEvaluator: + spec = PrecisionAggregatorSpec(classes=classes, averaging=averaging) # type: ignore[arg-type] + return PrecisionDatasetEvaluator(spec, source_evaluator="intent_match") -def _recall(classes: list[str], average: str = "macro") -> RecallDatasetEvaluator: - return RecallDatasetEvaluator( - RecallDatasetEvaluatorConfig( - id="r1", - name="recall", - source_evaluator="intent_match", - classes=classes, - average=average, # type: ignore[arg-type] - ) - ) +def _recall(classes: list[str], averaging: str = "macro") -> RecallDatasetEvaluator: + spec = RecallAggregatorSpec(classes=classes, averaging=averaging) # type: ignore[arg-type] + return RecallDatasetEvaluator(spec, 
source_evaluator="intent_match") def _fscore( - classes: list[str], average: str = "macro", f_value: float = 1.0 + classes: list[str], averaging: str = "macro", f_value: float = 1.0 ) -> FScoreDatasetEvaluator: - return FScoreDatasetEvaluator( - FScoreDatasetEvaluatorConfig( - id="f1", - name="fscore", - source_evaluator="intent_match", - classes=classes, - average=average, # type: ignore[arg-type] - f_value=f_value, - ) + spec = FScoreAggregatorSpec( + classes=classes, + averaging=averaging, # type: ignore[arg-type] + f_value=f_value, ) + return FScoreDatasetEvaluator(spec, source_evaluator="intent_match") -def _details(result: NumericEvaluationResult) -> ClassificationDetails: +def _details(result: object) -> ClassificationDetails: """Type-narrowing helper for asserting on details.""" + assert isinstance(result, NumericEvaluationResult) assert isinstance(result.details, ClassificationDetails) return result.details +def _multiclass_evaluator( + name: str, + classes: list[str], + aggregators: list[BaseModel], +) -> MulticlassClassificationEvaluator: + """Build a per-datapoint multiclass evaluator with embedded aggregators.""" + return MulticlassClassificationEvaluator.model_validate( + { + "id": str(uuid.uuid4()), + "evaluatorConfig": { + "name": name, + "classes": classes, + "aggregators": [spec.model_dump(by_alias=True) for spec in aggregators], + }, + } + ) + + class TestPrecisionEvaluator: def test_empty_input_returns_zeroed_result(self) -> None: result = _precision(["cat", "dog"]).evaluate([]) @@ -102,14 +111,13 @@ def test_empty_input_returns_zeroed_result(self) -> None: assert d.per_class["cat"].tn == 0 def test_two_class_macro(self) -> None: - # 4 datapoints: 2 TP_yes, 1 FN_yes (predicted no), 1 FP_yes (predicted yes when expected no). 
results = [ _result("yes", "yes"), _result("yes", "yes"), - _result("yes", "no"), # FN for yes, FP for no - _result("no", "yes"), # FP for yes, FN for no + _result("yes", "no"), + _result("no", "yes"), ] - result = _precision(["yes", "no"], average="macro").evaluate(results) + result = _precision(["yes", "no"], averaging="macro").evaluate(results) d = _details(result) # precision_yes = 2 / (2 + 1) = 2/3 # precision_no = 0 / (0 + 1) = 0 @@ -126,33 +134,27 @@ def test_two_class_micro_equals_accuracy(self) -> None: _result("yes", "no"), _result("no", "yes"), ] - result = _precision(["yes", "no"], average="micro").evaluate(results) + result = _precision(["yes", "no"], averaging="micro").evaluate(results) d = _details(result) - # micro precision = sum(TP) / sum(TP + FP) - # sum(TP) = 2 (yes diag) + 0 (no diag) = 2 - # sum(FP) = 1 (yes off-diag row) + 1 (no off-diag row) = 2 - # micro = 2 / (2 + 2) = 0.5 — equals accuracy 2/4 in the 2-class case assert d.micro == pytest.approx(0.5) assert result.score == pytest.approx(0.5) def test_three_class_macro(self) -> None: - # Each class gets 2 TP, 1 FP, 1 FN — symmetric setup pairs = [ ("cat", "cat"), ("cat", "cat"), - ("cat", "dog"), # FN_cat, FP_dog + ("cat", "dog"), ("dog", "dog"), ("dog", "dog"), - ("dog", "bird"), # FN_dog, FP_bird + ("dog", "bird"), ("bird", "bird"), ("bird", "bird"), - ("bird", "cat"), # FN_bird, FP_cat + ("bird", "cat"), ] - result = _precision(["cat", "dog", "bird"], average="macro").evaluate( + result = _precision(["cat", "dog", "bird"], averaging="macro").evaluate( [_result(e, a) for e, a in pairs] ) d = _details(result) - # per-class precision = 2 / (2 + 1) = 2/3 for all three for label in ("cat", "dog", "bird"): m = d.per_class[label] assert m.tp == 2 and m.fp == 1 and m.fn == 1 and m.tn == 5 @@ -169,30 +171,24 @@ def test_two_class_macro(self) -> None: _result("yes", "no"), _result("no", "yes"), ] - result = _recall(["yes", "no"], average="macro").evaluate(results) + result = _recall(["yes", "no"], 
averaging="macro").evaluate(results) d = _details(result) - # recall_yes = TP / (TP + FN) = 2 / (2 + 1) = 2/3 - # recall_no = 0 / (0 + 1) = 0 - # macro = 1/3 assert d.per_class["yes"].value == pytest.approx(2 / 3) assert d.per_class["no"].value == pytest.approx(0.0) assert result.score == pytest.approx(1 / 3) def test_recall_differs_from_precision(self) -> None: - # Asymmetric example so precision != recall. results = [ - _result("yes", "yes"), # TP - _result("yes", "yes"), # TP - _result("no", "yes"), # FP for yes - _result("no", "yes"), # FP for yes - _result("no", "no"), # TP for no + _result("yes", "yes"), + _result("yes", "yes"), + _result("no", "yes"), + _result("no", "yes"), + _result("no", "no"), ] - p = _details(_precision(["yes", "no"], average="macro").evaluate(results)) - r = _details(_recall(["yes", "no"], average="macro").evaluate(results)) - # precision_yes = 2/(2+2)=0.5, precision_no = 1/(1+0)=1.0 + p = _details(_precision(["yes", "no"], averaging="macro").evaluate(results)) + r = _details(_recall(["yes", "no"], averaging="macro").evaluate(results)) assert p.per_class["yes"].value == pytest.approx(0.5) assert p.per_class["no"].value == pytest.approx(1.0) - # recall_yes = 2/(2+0)=1.0, recall_no = 1/(1+2)=1/3 assert r.per_class["yes"].value == pytest.approx(1.0) assert r.per_class["no"].value == pytest.approx(1 / 3) @@ -206,16 +202,13 @@ def test_f1_equals_harmonic_mean_of_p_and_r(self) -> None: _result("no", "yes"), ] f = _details( - _fscore(["yes", "no"], average="macro", f_value=1.0).evaluate(results) + _fscore(["yes", "no"], averaging="macro", f_value=1.0).evaluate(results) ) - # precision_yes = 2/3, recall_yes = 2/3 -> F1_yes = 2/3 - # precision_no = 0, recall_no = 0 -> F1_no = 0 assert f.per_class["yes"].value == pytest.approx(2 / 3) assert f.per_class["no"].value == pytest.approx(0.0) assert f.macro == pytest.approx((2 / 3 + 0.0) / 2) def test_f_beta_emphasizes_recall_when_beta_above_one(self) -> None: - # Asymmetric setup: precision_yes = 0.5, 
recall_yes = 1.0. results = [ _result("yes", "yes"), _result("yes", "yes"), @@ -224,17 +217,14 @@ def test_f_beta_emphasizes_recall_when_beta_above_one(self) -> None: _result("no", "no"), ] f1 = _details( - _fscore(["yes", "no"], average="macro", f_value=1.0).evaluate(results) + _fscore(["yes", "no"], averaging="macro", f_value=1.0).evaluate(results) ) f2 = _details( - _fscore(["yes", "no"], average="macro", f_value=2.0).evaluate(results) + _fscore(["yes", "no"], averaging="macro", f_value=2.0).evaluate(results) ) - # F_beta with beta>1 weighs recall higher. Since recall_yes > precision_yes, - # F2_yes should be > F1_yes. assert f2.per_class["yes"].value > f1.per_class["yes"].value def test_three_class_micro_pools_across_classes(self) -> None: - # Same symmetric setup as the precision macro test. pairs = [ ("cat", "cat"), ("cat", "cat"), @@ -247,12 +237,10 @@ def test_three_class_micro_pools_across_classes(self) -> None: ("bird", "cat"), ] d = _details( - _fscore(["cat", "dog", "bird"], average="micro", f_value=1.0).evaluate( + _fscore(["cat", "dog", "bird"], averaging="micro", f_value=1.0).evaluate( [_result(e, a) for e, a in pairs] ) ) - # micro precision == micro recall == 6/9 (accuracy when each off-diag - # contributes once to FP and once to FN globally). micro F1 = 6/9. 
assert d.micro == pytest.approx(6 / 9) @@ -260,8 +248,8 @@ class TestSkippingAndEdgeCases: def test_out_of_vocab_labels_are_skipped(self) -> None: results = [ _result("cat", "cat"), - _result("cat", "platypus"), # actual not in classes - _result("zebra", "dog"), # expected not in classes + _result("cat", "platypus"), + _result("zebra", "dog"), ] d = _details(_precision(["cat", "dog"]).evaluate(results)) assert d.n_total == 3 and d.n_scored == 1 and d.n_skipped == 2 @@ -275,7 +263,7 @@ def test_results_without_justification_are_skipped(self) -> None: d = _details(_precision(["cat", "dog"]).evaluate(results)) assert d.n_total == 3 and d.n_scored == 1 and d.n_skipped == 2 - def test_case_insensitive_by_default(self) -> None: + def test_case_insensitive(self) -> None: results = [_result("Cat", "CAT"), _result("DOG", "dog")] d = _details(_precision(["cat", "dog"]).evaluate(results)) assert d.per_class["cat"].tp == 1 @@ -283,48 +271,97 @@ def test_case_insensitive_by_default(self) -> None: class TestFactory: - def test_builds_evaluator_from_dict(self) -> None: - config_data = { - "id": "precision_intent", - "name": "precision_intent", - "type": EvaluatorType.DATASET_PRECISION.value, - "sourceEvaluator": "intent_match", - "classes": ["yes", "no"], - "average": "macro", - } - evaluator = build_dataset_evaluator(config_data) + """The factory now takes an AggregatorSpec instance + source name, not a dict.""" + + def test_builds_precision_from_spec(self) -> None: + spec = PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro") + evaluator = build_dataset_evaluator(spec, "intent_match") assert isinstance(evaluator, PrecisionDatasetEvaluator) assert evaluator.source_evaluator == "intent_match" - assert evaluator.name == "precision_intent" - - def test_unknown_type_raises(self) -> None: - with pytest.raises(ValueError, match="Unknown dataset evaluator type"): - build_dataset_evaluator( - { - "id": "x", - "name": "x", - "type": "uipath-not-a-thing", - "sourceEvaluator": 
"intent_match", - "classes": ["yes", "no"], - } - ) + assert evaluator.name == "intent_match.precision" - def test_missing_type_raises(self) -> None: - with pytest.raises(ValueError, match="missing required field 'type'"): - build_dataset_evaluator( - { - "id": "x", - "name": "x", - "sourceEvaluator": "intent_match", - "classes": ["yes", "no"], - } - ) + def test_builds_recall_from_spec(self) -> None: + spec = RecallAggregatorSpec(classes=["yes", "no"], averaging="micro") + evaluator = build_dataset_evaluator(spec, "intent_match") + assert isinstance(evaluator, RecallDatasetEvaluator) + assert evaluator.name == "intent_match.recall" + + def test_builds_fscore_from_spec(self) -> None: + spec = FScoreAggregatorSpec( + classes=["yes", "no"], averaging="macro", f_value=2.0 + ) + evaluator = build_dataset_evaluator(spec, "intent_match") + assert isinstance(evaluator, FScoreDatasetEvaluator) + assert evaluator.spec.f_value == 2.0 + + +class TestAggregatorSpecJsonRoundTrip: + """Pin the wire shape sent to the C# side.""" + + def test_precision_uses_self_contained_fields(self) -> None: + spec = PrecisionAggregatorSpec.model_validate( + { + "type": "precision", + "classes": ["book", "cancel", "reschedule"], + "averaging": "macro", + } + ) + dumped = spec.model_dump(by_alias=True) + assert dumped == { + "type": "precision", + "classes": ["book", "cancel", "reschedule"], + "averaging": "macro", + } + + def test_fscore_uses_camelcase_fvalue_on_wire(self) -> None: + spec = FScoreAggregatorSpec.model_validate( + { + "type": "fscore", + "classes": ["yes", "no"], + "averaging": "macro", + "fValue": 1.5, + } + ) + assert spec.f_value == 1.5 + dumped = spec.model_dump(by_alias=True) + assert dumped["fValue"] == 1.5 + assert "f_value" not in dumped + + def test_multiclass_evaluator_round_trips_aggregators(self) -> None: + """Per-datapoint evaluator config carries aggregators[]; survives dump+load.""" + ev = _multiclass_evaluator( + "intent_classifier", + classes=["book", "cancel", 
"reschedule"], + aggregators=[ + PrecisionAggregatorSpec( + classes=["book", "cancel", "reschedule"], averaging="macro" + ), + FScoreAggregatorSpec( + classes=["book", "cancel", "reschedule"], + averaging="macro", + f_value=1.0, + ), + ], + ) + assert ev.evaluator_config.aggregators is not None + assert len(ev.evaluator_config.aggregators) == 2 + assert ev.evaluator_config.aggregators[0].type == "precision" + assert ev.evaluator_config.aggregators[1].type == "fscore" class TestComputeDatasetEvaluatorResults: - """End-to-end: dataset evaluator picks results by source_evaluator name.""" + """End-to-end: runtime walks evaluator configs' aggregators[].""" + + def test_walks_aggregators_on_classification_evaluator(self) -> None: + evaluator = _multiclass_evaluator( + "intent_match", + classes=["yes", "no"], + aggregators=[ + PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro"), + RecallAggregatorSpec(classes=["yes", "no"], averaging="macro"), + ], + ) - def test_routes_to_correct_source_and_ignores_others(self) -> None: eval_results = [ UiPathEvalRunResult( evaluation_name="dp1", @@ -353,18 +390,42 @@ def test_routes_to_correct_source_and_ignores_others(self) -> None: ), ] - out = compute_dataset_evaluator_results( - eval_results, [_precision(["yes", "no"], average="macro")] + out = compute_dataset_evaluator_results(eval_results, [evaluator]) + # Two aggregators on intent_match → two keys, prefixed by source name. + assert set(out) == {"intent_match.precision", "intent_match.recall"} + precision_dto = out["intent_match.precision"] + assert isinstance(precision_dto, EvaluationResultDto) + assert isinstance(precision_dto.details, dict) + # The unrelated 0.5 score from some_other_evaluator must NOT be in the matrix. 
+ assert precision_dto.details["n_scored"] == 2 + + def test_evaluator_without_aggregators_is_skipped(self) -> None: + evaluator = _multiclass_evaluator( + "intent_match", classes=["yes", "no"], aggregators=[] ) - assert set(out) == {"precision"} - dto = out["precision"] - assert isinstance(dto, EvaluationResultDto) - # The unrelated 0.5 score from some_other_evaluator must NOT be in the - # matrix — only the two intent_match results count. - assert isinstance(dto.details, dict) - assert dto.details["n_scored"] == 2 + eval_results = [ + UiPathEvalRunResult( + evaluation_name="dp1", + evaluation_run_results=[ + UiPathEvalRunResultDto( + evaluator_name="intent_match", + evaluator_id=str(uuid.uuid4()), + result=_result("yes", "yes"), + ), + ], + ), + ] + out = compute_dataset_evaluator_results(eval_results, [evaluator]) + assert out == {} def test_line_by_line_subresults_are_excluded(self) -> None: + evaluator = _multiclass_evaluator( + "intent_match", + classes=["yes", "no"], + aggregators=[ + PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro"), + ], + ) eval_results = [ UiPathEvalRunResult( evaluation_name="dp1", @@ -383,13 +444,18 @@ def test_line_by_line_subresults_are_excluded(self) -> None: ], ), ] - out = compute_dataset_evaluator_results( - eval_results, [_precision(["yes", "no"])] - ) - assert isinstance(out["precision"].details, dict) - assert out["precision"].details["n_scored"] == 1 + out = compute_dataset_evaluator_results(eval_results, [evaluator]) + assert isinstance(out["intent_match.precision"].details, dict) + assert out["intent_match.precision"].details["n_scored"] == 1 def test_source_with_no_results_produces_zeroed_report(self) -> None: + evaluator = _multiclass_evaluator( + "intent_match", + classes=["yes", "no"], + aggregators=[ + PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro"), + ], + ) eval_results = [ UiPathEvalRunResult( evaluation_name="dp1", @@ -402,10 +468,8 @@ def 
test_source_with_no_results_produces_zeroed_report(self) -> None: ], ), ] - out = compute_dataset_evaluator_results( - eval_results, [_precision(["yes", "no"])] - ) - dto = out["precision"] + out = compute_dataset_evaluator_results(eval_results, [evaluator]) + dto = out["intent_match.precision"] assert dto.score == 0.0 assert isinstance(dto.details, dict) assert dto.details["n_scored"] == 0 From 77fcc109777dd2ba943e4ff3c2d3745dbed7dc21 Mon Sep 17 00:00:00 2001 From: ajay-kesavan Date: Thu, 18 Jun 2026 21:27:47 -0700 Subject: [PATCH 06/13] feat(eval): wire sample classification evaluators to embedded aggregators Update binary_classification_agent and multiclass_classification_simple sample evaluator JSONs to include the new aggregators[] field. Each aggregator carries its own classes, averaging, and (for fscore) fValue. Update the e2e test to also assert the dataset-level results land in UiPathEvalOutput.dataset_evaluator_results, keyed "{evaluator_name}.{aggregator_type}". Co-Authored-By: Claude Opus 4.7 --- .../evaluators/binary-classification.json | 22 +++++++++++++++++-- .../evaluators/multiclass-classification.json | 22 +++++++++++++++++-- .../eval/test_classification_samples_e2e.py | 21 ++++++++++++++++++ 3 files changed, 61 insertions(+), 4 deletions(-) diff --git a/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json b/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json index 21f7d6850..d2cc64b71 100644 --- a/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json +++ b/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json @@ -1,7 +1,7 @@ { "version": "1.0", "id": "BinarySpamPrecision", - "description": "Precision on the 'spam' positive class", + "description": "Precision on the 'spam' positive class, plus run-level aggregators", "evaluatorTypeId": 
"uipath-binary-classification", "evaluatorConfig": { "name": "BinarySpamPrecision", @@ -11,6 +11,24 @@ "fValue": 1.0, "defaultEvaluationCriteria": { "expectedClass": "ham" - } + }, + "aggregators": [ + { + "type": "precision", + "classes": ["spam", "ham"], + "averaging": "macro" + }, + { + "type": "recall", + "classes": ["spam", "ham"], + "averaging": "macro" + }, + { + "type": "fscore", + "classes": ["spam", "ham"], + "averaging": "macro", + "fValue": 1.0 + } + ] } } diff --git a/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json b/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json index 859a18562..871afbc21 100644 --- a/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json +++ b/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json @@ -1,7 +1,7 @@ { "version": "1.0", "id": "EmailMulticlassFScore", - "description": "Macro-averaged F1 across payments / support / spam", + "description": "Macro-averaged F1 across payments / support / spam, plus run-level aggregators", "evaluatorTypeId": "uipath-multiclass-classification", "evaluatorConfig": { "name": "EmailMulticlassFScore", @@ -12,6 +12,24 @@ "fValue": 1.0, "defaultEvaluationCriteria": { "expectedClass": "support" - } + }, + "aggregators": [ + { + "type": "precision", + "classes": ["payments", "support", "spam"], + "averaging": "macro" + }, + { + "type": "recall", + "classes": ["payments", "support", "spam"], + "averaging": "macro" + }, + { + "type": "fscore", + "classes": ["payments", "support", "spam"], + "averaging": "macro", + "fValue": 1.0 + } + ] } } diff --git a/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py b/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py index 202363221..f2bdfa3cb 100644 --- 
a/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py +++ b/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py @@ -170,6 +170,15 @@ async def test_binary_classification_sample_end_to_end(): # Precision = TP / (TP + FP) = 2 / (2 + 1) = 0.6666... assert averages["BinarySpamPrecision"] == pytest.approx(2 / 3, rel=1e-6) + # Dataset-level aggregators embedded on the evaluator config also fire. + # Each result keyed by "{evaluator_name}.{aggregator_type}". + keys = set(output.dataset_evaluator_results) + assert keys == { + "BinarySpamPrecision.precision", + "BinarySpamPrecision.recall", + "BinarySpamPrecision.fscore", + } + async def test_multiclass_classification_sample_end_to_end(): """Multiclass router: 6/7 correct, macro F1 = (0.8 + 0.8 + 1.0) / 3 = 0.8666...""" @@ -191,3 +200,15 @@ async def test_multiclass_classification_sample_end_to_end(): # payments F1=0.8 (P=2/3, R=1), support F1=0.8 (P=1, R=2/3), spam F1=1.0 # macro = mean = 2.6 / 3 assert averages["EmailMulticlassFScore"] == pytest.approx(2.6 / 3, rel=1e-6) + + # Three embedded aggregators ran in addition to reduce_scores. + keys = set(output.dataset_evaluator_results) + assert keys == { + "EmailMulticlassFScore.precision", + "EmailMulticlassFScore.recall", + "EmailMulticlassFScore.fscore", + } + # The macro F1 computed by the embedded fscore aggregator should match + # reduce_scores' result (both walk the same confusion matrix). + fscore_result = output.dataset_evaluator_results["EmailMulticlassFScore.fscore"] + assert fscore_result.score == pytest.approx(2.6 / 3, rel=1e-6) From c0436a3da061146b61b117dbe885606b4fd52fef Mon Sep 17 00:00:00 2001 From: ajay-kesavan Date: Thu, 18 Jun 2026 21:49:44 -0700 Subject: [PATCH 07/13] refactor(eval): apply ponytail-review cleanup - Collapse Precision/Recall/FScore into one ClassificationDatasetEvaluator switching on spec.type; factory becomes a one-liner. 
- Inline _precision_of/_recall_of/_f_score_of and the one-use _ConfusionData helpers; switch _ConfusionData to @dataclass(slots=True). - Drop dead get_evaluator_id() abstract + 3 overrides + matching EvaluatorType enum entries (factory dispatches on spec.type). - Pull repeated model_config into a private _AggregatorSpecBase. - Drop registry + impossible-case ValueError in dataset_evaluator_factory (pydantic discriminator catches unknown types). - Have _coerce_justification return the typed justification object. - Drop the _source_evaluator private/property pair on BaseDatasetEvaluator. No behavior change. Co-Authored-By: Claude Opus 4.7 --- .../eval/evaluators/_aggregator_specs.py | 16 +- .../eval/evaluators/base_dataset_evaluator.py | 16 +- .../classification_dataset_evaluators.py | 227 ++++++------------ .../evaluators/dataset_evaluator_factory.py | 50 +--- .../uipath/src/uipath/eval/models/models.py | 3 - .../uipath/src/uipath/eval/runtime/runtime.py | 10 +- .../test_dataset_classification_evaluators.py | 27 ++- 7 files changed, 116 insertions(+), 233 deletions(-) diff --git a/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py b/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py index fde129506..6c0b2b880 100644 --- a/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py +++ b/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py @@ -16,31 +16,31 @@ from pydantic.alias_generators import to_camel -class PrecisionAggregatorSpec(BaseModel): - """Run-level precision aggregator (multiclass, micro or macro averaged).""" +class _AggregatorSpecBase(BaseModel): + """Shared pydantic config for every aggregator variant.""" model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + +class PrecisionAggregatorSpec(_AggregatorSpecBase): + """Run-level precision aggregator (multiclass, micro or macro averaged).""" + type: Literal["precision"] = "precision" classes: list[str] = Field(..., min_length=1) averaging: 
Literal["macro", "micro"] -class RecallAggregatorSpec(BaseModel): +class RecallAggregatorSpec(_AggregatorSpecBase): """Run-level recall aggregator (multiclass, micro or macro averaged).""" - model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) - type: Literal["recall"] = "recall" classes: list[str] = Field(..., min_length=1) averaging: Literal["macro", "micro"] -class FScoreAggregatorSpec(BaseModel): +class FScoreAggregatorSpec(_AggregatorSpecBase): """Run-level F-beta aggregator (multiclass, micro or macro averaged).""" - model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) - type: Literal["fscore"] = "fscore" classes: list[str] = Field(..., min_length=1) averaging: Literal["macro", "micro"] diff --git a/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py index dcb33cc78..c00eb666a 100644 --- a/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py +++ b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py @@ -39,27 +39,17 @@ class BaseDatasetEvaluator(ABC, Generic[SpecT]): """ spec: SpecT - _source_evaluator: str + source_evaluator: str def __init__(self, spec: SpecT, source_evaluator: str) -> None: """Store the aggregator spec and the source evaluator name.""" self.spec = spec - self._source_evaluator = source_evaluator - - @property - def source_evaluator(self) -> str: - """Name of the upstream evaluator whose results this one consumes.""" - return self._source_evaluator + self.source_evaluator = source_evaluator @property def name(self) -> str: """Stable key for this dataset evaluator's result in the output map.""" - return f"{self._source_evaluator}.{self.spec.type}" - - @classmethod - @abstractmethod - def get_evaluator_id(cls) -> str: - """Stable identifier matching the ``type`` discriminator on specs.""" + return f"{self.source_evaluator}.{self.spec.type}" @abstractmethod def evaluate(self, 
results: list[EvaluationResultDto]) -> EvaluationResult: diff --git a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py index b15020c25..ef6063b4c 100644 --- a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py +++ b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py @@ -10,34 +10,30 @@ from __future__ import annotations +from dataclasses import dataclass + from pydantic import BaseModel, ConfigDict from pydantic.alias_generators import to_camel from ..models.models import ( EvaluationResult, EvaluationResultDto, - EvaluatorType, NumericEvaluationResult, ) -from ._aggregator_specs import ( - FScoreAggregatorSpec, - PrecisionAggregatorSpec, - RecallAggregatorSpec, -) +from ._aggregator_specs import AggregatorSpec, FScoreAggregatorSpec from .base_dataset_evaluator import BaseDatasetEvaluator from .base_evaluator import BaseEvaluatorJustification -def _coerce_justification(details: object) -> tuple[str, str] | None: - """Extract (expected, actual) from an EvaluationResultDto.details payload.""" +def _coerce_justification(details: object) -> BaseEvaluatorJustification | None: + """Extract the BaseEvaluatorJustification from an EvaluationResultDto.details payload.""" if isinstance(details, BaseEvaluatorJustification): - return details.expected, details.actual + return details if isinstance(details, dict): try: - j = BaseEvaluatorJustification.model_validate(details) + return BaseEvaluatorJustification.model_validate(details) except Exception: return None - return j.expected, j.actual return None @@ -71,33 +67,15 @@ class ClassificationDetails(BaseModel): n_skipped: int +@dataclass(slots=True) class _ConfusionData: """Internal: confusion matrix and per-class counts derived from results.""" - __slots__ = ("classes", "matrix", "n_total", "n_scored", "n_skipped") - - def __init__( - self, - classes: 
list[str], - matrix: list[list[int]], - n_total: int, - n_scored: int, - n_skipped: int, - ) -> None: - self.classes = classes - self.matrix = matrix - self.n_total = n_total - self.n_scored = n_scored - self.n_skipped = n_skipped - - def counts_for(self, class_index: int) -> tuple[int, int, int, int]: - """Return (tp, fp, fn, tn) for a class index.""" - k = len(self.classes) - tp = self.matrix[class_index][class_index] - fp = sum(self.matrix[class_index][j] for j in range(k)) - tp - fn = sum(self.matrix[j][class_index] for j in range(k)) - tp - tn = self.n_scored - tp - fp - fn - return tp, fp, fn, tn + classes: list[str] + matrix: list[list[int]] + n_total: int + n_scored: int + n_skipped: int def _build_confusion( @@ -125,8 +103,8 @@ def _build_confusion( if j is None: n_skipped += 1 continue - exp = j[0].lower() - act = j[1].lower() + exp = j.expected.lower() + act = j.actual.lower() if exp not in index_of or act not in index_of: n_skipped += 1 continue @@ -142,126 +120,77 @@ def _build_confusion( ) -def _precision_of(tp: int, fp: int, _fn: int, _tn: int) -> float: - return tp / (tp + fp) if (tp + fp) > 0 else 0.0 - - -def _recall_of(tp: int, _fp: int, fn: int, _tn: int) -> float: - return tp / (tp + fn) if (tp + fn) > 0 else 0.0 - - -def _f_score_of(beta: float): - beta_sq = beta * beta - - def compute(tp: int, fp: int, fn: int, _tn: int) -> float: - p = tp / (tp + fp) if (tp + fp) > 0 else 0.0 - r = tp / (tp + fn) if (tp + fn) > 0 else 0.0 - denom = beta_sq * p + r - return (1 + beta_sq) * p * r / denom if denom > 0 else 0.0 - - return compute - - -def _build_details( - confusion: _ConfusionData, - metric_name: str, - average: str, - per_class_fn, -) -> tuple[ClassificationDetails, float]: - """Compute per-class values, micro, macro, and pick the headline.""" - per_class: dict[str, PerClassMetrics] = {} - total_tp = 0 - total_fp = 0 - total_fn = 0 - - for c, label in enumerate(confusion.classes): - tp, fp, fn, tn = confusion.counts_for(c) - total_tp += tp - 
total_fp += fp - total_fn += fn - per_class[label] = PerClassMetrics( - tp=tp, - tn=tn, - fp=fp, - fn=fn, - support=tp + fn, - value=per_class_fn(tp, fp, fn, tn), - ) - - micro = per_class_fn(total_tp, total_fp, total_fn, 0) - - k = len(confusion.classes) - macro = sum(per_class[c].value for c in confusion.classes) / k if k > 0 else 0.0 - - details = ClassificationDetails( - metric=metric_name, - average=average, - classes=confusion.classes, - confusion_matrix=confusion.matrix, - per_class=per_class, - micro=micro, - macro=macro, - n_total=confusion.n_total, - n_scored=confusion.n_scored, - n_skipped=confusion.n_skipped, - ) - - headline = micro if average == "micro" else macro - return details, headline - - -# ─── evaluators ─────────────────────────────────────────────────────────────── +_METRIC_NAME = {"precision": "precision", "recall": "recall", "fscore": "f_score"} -class PrecisionDatasetEvaluator(BaseDatasetEvaluator[PrecisionAggregatorSpec]): - """Dataset-level precision evaluator (multiclass, micro or macro averaged).""" +class ClassificationDatasetEvaluator(BaseDatasetEvaluator[AggregatorSpec]): + """One implementation for all three classification aggregators. - @classmethod - def get_evaluator_id(cls) -> str: - """Identifier matching the type discriminator on specs.""" - return EvaluatorType.DATASET_PRECISION.value + Dispatches on ``self.spec.type`` to pick the per-class metric formula: + precision, recall, or F-beta. The math (confusion-matrix build, per-class + counts, micro/macro averaging) is identical across the three. 
+ """ def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult: - """Compute the precision report and return the headline as score.""" + """Compute the configured metric report and return the headline as score.""" confusion = _build_confusion(results, self.spec.classes) - details, headline = _build_details( - confusion, "precision", self.spec.averaging, _precision_of + beta_sq = ( + self.spec.f_value * self.spec.f_value + if isinstance(self.spec, FScoreAggregatorSpec) + else 0.0 ) - return NumericEvaluationResult(score=headline, details=details) - - -class RecallDatasetEvaluator(BaseDatasetEvaluator[RecallAggregatorSpec]): - """Dataset-level recall evaluator (multiclass, micro or macro averaged).""" - - @classmethod - def get_evaluator_id(cls) -> str: - """Identifier matching the type discriminator on specs.""" - return EvaluatorType.DATASET_RECALL.value - - def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult: - """Compute the recall report and return the headline as score.""" - confusion = _build_confusion(results, self.spec.classes) - details, headline = _build_details( - confusion, "recall", self.spec.averaging, _recall_of + metric_type = self.spec.type + + per_class: dict[str, PerClassMetrics] = {} + total_tp = 0 + total_fp = 0 + total_fn = 0 + k = len(confusion.classes) + + for c, label in enumerate(confusion.classes): + tp = confusion.matrix[c][c] + fp = sum(confusion.matrix[c][j] for j in range(k)) - tp + fn = sum(confusion.matrix[j][c] for j in range(k)) - tp + tn = confusion.n_scored - tp - fp - fn + total_tp += tp + total_fp += fp + total_fn += fn + per_class[label] = PerClassMetrics( + tp=tp, + tn=tn, + fp=fp, + fn=fn, + support=tp + fn, + value=_metric(metric_type, tp, fp, fn, beta_sq), + ) + + micro = _metric(metric_type, total_tp, total_fp, total_fn, beta_sq) + macro = sum(per_class[c].value for c in confusion.classes) / k + + details = ClassificationDetails( + metric=_METRIC_NAME[metric_type], + 
average=self.spec.averaging, + classes=confusion.classes, + confusion_matrix=confusion.matrix, + per_class=per_class, + micro=micro, + macro=macro, + n_total=confusion.n_total, + n_scored=confusion.n_scored, + n_skipped=confusion.n_skipped, ) - return NumericEvaluationResult(score=headline, details=details) + headline = micro if self.spec.averaging == "micro" else macro + return NumericEvaluationResult(score=headline, details=details) -class FScoreDatasetEvaluator(BaseDatasetEvaluator[FScoreAggregatorSpec]): - """Dataset-level F-beta evaluator (multiclass, micro or macro averaged).""" - - @classmethod - def get_evaluator_id(cls) -> str: - """Identifier matching the type discriminator on specs.""" - return EvaluatorType.DATASET_F_SCORE.value - def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult: - """Compute the F-beta report and return the headline as score.""" - confusion = _build_confusion(results, self.spec.classes) - details, headline = _build_details( - confusion, - "f_score", - self.spec.averaging, - _f_score_of(self.spec.f_value), - ) - return NumericEvaluationResult(score=headline, details=details) +def _metric(metric_type: str, tp: int, fp: int, fn: int, beta_sq: float) -> float: + """One formula switch covering precision / recall / F-beta.""" + if metric_type == "precision": + return tp / (tp + fp) if (tp + fp) > 0 else 0.0 + if metric_type == "recall": + return tp / (tp + fn) if (tp + fn) > 0 else 0.0 + p = tp / (tp + fp) if (tp + fp) > 0 else 0.0 + r = tp / (tp + fn) if (tp + fn) > 0 else 0.0 + denom = beta_sq * p + r + return (1 + beta_sq) * p * r / denom if denom > 0 else 0.0 diff --git a/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py index d597b9085..9cd895ad2 100644 --- a/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py +++ b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py @@ -1,61 
+1,27 @@ """Factory that instantiates dataset-level evaluators from aggregator specs. -Dataset evaluators are now built from a self-contained :class:`AggregatorSpec` +Dataset evaluators are built from a self-contained :class:`AggregatorSpec` embedded in a per-datapoint classification evaluator's config, plus the source -evaluator's name (supplied by the runtime when walking those configs). The -factory inspects the spec's ``type`` discriminator and returns the matching -evaluator instance. +evaluator's name (supplied by the runtime when walking those configs). All +three aggregator types share a single :class:`ClassificationDatasetEvaluator` +implementation that dispatches on ``spec.type`` internally. """ from __future__ import annotations -from typing import Any - -from ._aggregator_specs import ( - AggregatorSpec, - FScoreAggregatorSpec, - PrecisionAggregatorSpec, - RecallAggregatorSpec, -) -from .base_dataset_evaluator import BaseDatasetEvaluator -from .classification_dataset_evaluators import ( - FScoreDatasetEvaluator, - PrecisionDatasetEvaluator, - RecallDatasetEvaluator, -) - -_EVALUATOR_REGISTRY: dict[str, type[BaseDatasetEvaluator[Any]]] = { - "precision": PrecisionDatasetEvaluator, - "recall": RecallDatasetEvaluator, - "fscore": FScoreDatasetEvaluator, -} +from ._aggregator_specs import AggregatorSpec +from .classification_dataset_evaluators import ClassificationDatasetEvaluator def build_dataset_evaluator( spec: AggregatorSpec, source_evaluator: str, -) -> BaseDatasetEvaluator[Any]: +) -> ClassificationDatasetEvaluator: """Build a dataset evaluator instance from an aggregator spec. Args: spec: A validated :class:`AggregatorSpec` (precision / recall / fscore). source_evaluator: Name of the per-datapoint evaluator whose results this aggregator consumes. - - Raises: - ValueError: If ``spec.type`` doesn't match any known aggregator. 
""" - evaluator_cls = _EVALUATOR_REGISTRY.get(spec.type) - if evaluator_cls is None: - known = sorted(_EVALUATOR_REGISTRY.keys()) - raise ValueError(f"Unknown aggregator type '{spec.type}'. Known types: {known}") - return evaluator_cls(spec, source_evaluator) - - -__all__ = [ - "AggregatorSpec", - "PrecisionAggregatorSpec", - "RecallAggregatorSpec", - "FScoreAggregatorSpec", - "build_dataset_evaluator", -] + return ClassificationDatasetEvaluator(spec, source_evaluator) diff --git a/packages/uipath/src/uipath/eval/models/models.py b/packages/uipath/src/uipath/eval/models/models.py index 8945137e7..14c130c92 100644 --- a/packages/uipath/src/uipath/eval/models/models.py +++ b/packages/uipath/src/uipath/eval/models/models.py @@ -300,9 +300,6 @@ class EvaluatorType(str, Enum): TOOL_CALL_OUTPUT = "uipath-tool-call-output" BINARY_CLASSIFICATION = "uipath-binary-classification" MULTICLASS_CLASSIFICATION = "uipath-multiclass-classification" - DATASET_PRECISION = "uipath-dataset-precision" - DATASET_RECALL = "uipath-dataset-recall" - DATASET_F_SCORE = "uipath-dataset-f-score" class ToolCall(BaseModel): diff --git a/packages/uipath/src/uipath/eval/runtime/runtime.py b/packages/uipath/src/uipath/eval/runtime/runtime.py index c64f8f158..89f8f6c29 100644 --- a/packages/uipath/src/uipath/eval/runtime/runtime.py +++ b/packages/uipath/src/uipath/eval/runtime/runtime.py @@ -237,13 +237,11 @@ def compute_dataset_evaluator_results( dataset_results: dict[str, EvaluationResultDto] = {} for evaluator in evaluators: - evaluator_config = getattr(evaluator, "evaluator_config", None) - if evaluator_config is None: + config = getattr(evaluator, "evaluator_config", None) + aggregators = getattr(config, "aggregators", None) + if config is None or not aggregators: continue - aggregators = getattr(evaluator_config, "aggregators", None) - if not aggregators: - continue - source_name = evaluator_config.name + source_name = config.name source_results = results_by_evaluator.get(source_name, []) for 
spec in aggregators: dataset_evaluator = build_dataset_evaluator(spec, source_name) diff --git a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py index 53e1e9855..29343b170 100644 --- a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py +++ b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py @@ -18,10 +18,8 @@ ) from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification from uipath.eval.evaluators.classification_dataset_evaluators import ( + ClassificationDatasetEvaluator, ClassificationDetails, - FScoreDatasetEvaluator, - PrecisionDatasetEvaluator, - RecallDatasetEvaluator, ) from uipath.eval.evaluators.dataset_evaluator_factory import build_dataset_evaluator from uipath.eval.evaluators.multiclass_classification_evaluator import ( @@ -53,25 +51,27 @@ def _result( def _precision( classes: list[str], averaging: str = "macro" -) -> PrecisionDatasetEvaluator: +) -> ClassificationDatasetEvaluator: spec = PrecisionAggregatorSpec(classes=classes, averaging=averaging) # type: ignore[arg-type] - return PrecisionDatasetEvaluator(spec, source_evaluator="intent_match") + return ClassificationDatasetEvaluator(spec, source_evaluator="intent_match") -def _recall(classes: list[str], averaging: str = "macro") -> RecallDatasetEvaluator: +def _recall( + classes: list[str], averaging: str = "macro" +) -> ClassificationDatasetEvaluator: spec = RecallAggregatorSpec(classes=classes, averaging=averaging) # type: ignore[arg-type] - return RecallDatasetEvaluator(spec, source_evaluator="intent_match") + return ClassificationDatasetEvaluator(spec, source_evaluator="intent_match") def _fscore( classes: list[str], averaging: str = "macro", f_value: float = 1.0 -) -> FScoreDatasetEvaluator: +) -> ClassificationDatasetEvaluator: spec = FScoreAggregatorSpec( classes=classes, averaging=averaging, # type: ignore[arg-type] 
f_value=f_value, ) - return FScoreDatasetEvaluator(spec, source_evaluator="intent_match") + return ClassificationDatasetEvaluator(spec, source_evaluator="intent_match") def _details(result: object) -> ClassificationDetails: @@ -276,14 +276,16 @@ class TestFactory: def test_builds_precision_from_spec(self) -> None: spec = PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro") evaluator = build_dataset_evaluator(spec, "intent_match") - assert isinstance(evaluator, PrecisionDatasetEvaluator) + assert isinstance(evaluator, ClassificationDatasetEvaluator) + assert evaluator.spec.type == "precision" assert evaluator.source_evaluator == "intent_match" assert evaluator.name == "intent_match.precision" def test_builds_recall_from_spec(self) -> None: spec = RecallAggregatorSpec(classes=["yes", "no"], averaging="micro") evaluator = build_dataset_evaluator(spec, "intent_match") - assert isinstance(evaluator, RecallDatasetEvaluator) + assert isinstance(evaluator, ClassificationDatasetEvaluator) + assert evaluator.spec.type == "recall" assert evaluator.name == "intent_match.recall" def test_builds_fscore_from_spec(self) -> None: @@ -291,7 +293,8 @@ def test_builds_fscore_from_spec(self) -> None: classes=["yes", "no"], averaging="macro", f_value=2.0 ) evaluator = build_dataset_evaluator(spec, "intent_match") - assert isinstance(evaluator, FScoreDatasetEvaluator) + assert isinstance(evaluator, ClassificationDatasetEvaluator) + assert isinstance(evaluator.spec, FScoreAggregatorSpec) assert evaluator.spec.f_value == 2.0 From 50c64f4862c57834437b1dba59266106f29e3b66 Mon Sep 17 00:00:00 2001 From: ajay-kesavan Date: Thu, 18 Jun 2026 21:53:00 -0700 Subject: [PATCH 08/13] refactor(eval): apply ponytail-review cleanup (justification + demo) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add BaseEvaluatorJustification.try_from classmethod and collapse the three duplicate "instance | dict | other" coercion blocks in 
classification_dataset_evaluators, binary_classification_evaluator, and multiclass_classification_evaluator down to one line each. - Replace the 80-line ASCII confusion-matrix pretty-printer in dataset_evaluators_demo with the structured JSON wire shape — the thing readers actually want to inspect. Deferred from this PR: dropping reduce_scores / _micro_metric / _macro_metric on Binary/Multiclass evaluators, and the matching metric_type/averaging/f_value config fields. The runtime calls GenericBaseEvaluator.reduce_scores per-evaluator to compute the top-level evaluator score; the dataset evaluator framework adds {source}.{type}-keyed metrics in addition to that score, it doesn't replace it. Removing them would break the existing per-evaluator headline. Worth a follow-up that either makes reduce_scores delegate to the dataset evaluator framework or formally splits the two paths. No behavior change. Co-Authored-By: Claude Opus 4.7 --- .../examples/dataset_evaluators_demo.py | 68 ++----------------- .../uipath/eval/evaluators/base_evaluator.py | 19 ++++++ .../binary_classification_evaluator.py | 10 +-- .../classification_dataset_evaluators.py | 10 +-- .../multiclass_classification_evaluator.py | 10 +-- 5 files changed, 30 insertions(+), 87 deletions(-) diff --git a/packages/uipath/examples/dataset_evaluators_demo.py b/packages/uipath/examples/dataset_evaluators_demo.py index 2d13f3572..6d887f3dd 100644 --- a/packages/uipath/examples/dataset_evaluators_demo.py +++ b/packages/uipath/examples/dataset_evaluators_demo.py @@ -13,7 +13,6 @@ from __future__ import annotations -import json from typing import Iterable from uipath.eval.evaluators._aggregator_specs import ( @@ -56,75 +55,18 @@ def print_header(title: str) -> None: print("═" * 78) -def print_confusion(details: ClassificationDetails) -> None: - """Pretty-print the confusion matrix as a table.""" - classes = details.classes - cell_width = max(7, max(len(c) for c in classes) + 1) - header = ( - " " * cell_width - + " 
│ " - + " │ ".join(c.center(cell_width) for c in classes) - + " │ ← expected" - ) - print(header) - print("─" * len(header)) - for predicted_idx, predicted_label in enumerate(classes): - row_cells = [ - str(details.confusion_matrix[predicted_idx][expected_idx]).rjust(cell_width) - for expected_idx in range(len(classes)) - ] - print(predicted_label.ljust(cell_width) + " │ " + " │ ".join(row_cells) + " │") - print(" " * cell_width + "↑ predicted") - - -def print_per_class(details: ClassificationDetails) -> None: - """One-row-per-class table of TP/TN/FP/FN + the metric.""" - label_w = max(len("class"), max(len(c) for c in details.classes)) - metric = details.metric - header = f" {'class'.ljust(label_w)} │ TP TN FP FN support {metric}" - print(header) - print(" " + "─" * (len(header) - 2)) - for cls, m in details.per_class.items(): - print( - f" {cls.ljust(label_w)} │ " - f"{m.tp:>2} {m.tn:>2} {m.fp:>2} {m.fn:>2} {m.support:>7} " - f"{m.value:.3f}" - ) - - def report( title: str, result: NumericEvaluationResult, *, - show_json_tail: bool = False, + show_json_tail: bool = False, # kept for call-site compat; payload is always emitted ) -> None: - """Render one scenario's result block.""" + """Render one scenario's result block as JSON — the actual wire shape.""" + _ = show_json_tail print_header(title) assert isinstance(result.details, ClassificationDetails) - d = result.details - print( - f" metric = {d.metric} average = {d.average} " - f"score (headline) = {result.score:.4f}" - ) - print( - f" micro = {d.micro:.4f} macro = {d.macro:.4f} " - f"scored = {d.n_scored}/{d.n_total} skipped = {d.n_skipped}" - ) - print() - print_confusion(d) - print() - print_per_class(d) - if show_json_tail: - print() - print(" ── wire JSON (matches frontend zod schema) ──") - payload = d.model_dump(by_alias=True) - print( - " " - + json.dumps( - {k: payload[k] for k in ("metric", "average", "micro", "macro")}, - indent=2, - ).replace("\n", "\n ") - ) + print(f" headline score = 
{result.score:.4f}") + print(result.details.model_dump_json(indent=2, by_alias=True)) # ─── scenarios ──────────────────────────────────────────────────────────────── diff --git a/packages/uipath/src/uipath/eval/evaluators/base_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/base_evaluator.py index 73fac46c6..285a022f4 100644 --- a/packages/uipath/src/uipath/eval/evaluators/base_evaluator.py +++ b/packages/uipath/src/uipath/eval/evaluators/base_evaluator.py @@ -47,6 +47,25 @@ class BaseEvaluatorJustification(BaseModel): expected: str actual: str + @classmethod + def try_from(cls, details: object) -> "BaseEvaluatorJustification | None": + """Coerce a free-form details payload into a justification, or return None. + + Accepts either an existing instance or a dict that ``model_validate`` can + parse. Anything else (str, None, malformed dict) yields ``None``. Used by + the classification evaluators + dataset evaluator framework to walk + per-datapoint results without each site re-implementing the same + isinstance/try/except dance. 
+ """ + if isinstance(details, cls): + return details + if isinstance(details, dict): + try: + return cls.model_validate(details) + except Exception: + return None + return None + # Additional type variables for Config and Justification # Note: C must be BaseEvaluatorConfig[T] to ensure type consistency diff --git a/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py index 0a65c2c64..c3f394d96 100644 --- a/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py +++ b/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py @@ -105,14 +105,8 @@ def reduce_scores(self, results: list[EvaluationResultDto]) -> float: tp = fp = fn = 0 for r in results: - if isinstance(r.details, BaseEvaluatorJustification): - details = r.details - elif isinstance(r.details, dict): - try: - details = BaseEvaluatorJustification.model_validate(r.details) - except Exception: - continue - else: + details = BaseEvaluatorJustification.try_from(r.details) + if details is None: continue pred = details.actual exp = details.expected diff --git a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py index ef6063b4c..f64ebcd63 100644 --- a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py +++ b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py @@ -27,14 +27,7 @@ def _coerce_justification(details: object) -> BaseEvaluatorJustification | None: """Extract the BaseEvaluatorJustification from an EvaluationResultDto.details payload.""" - if isinstance(details, BaseEvaluatorJustification): - return details - if isinstance(details, dict): - try: - return BaseEvaluatorJustification.model_validate(details) - except Exception: - return None - return None + return 
BaseEvaluatorJustification.try_from(details) class PerClassMetrics(BaseModel): @@ -165,6 +158,7 @@ def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult: ) micro = _metric(metric_type, total_tp, total_fp, total_fn, beta_sq) + # AggregatorSpec.classes has min_length=1, so k >= 1 always. macro = sum(per_class[c].value for c in confusion.classes) / k details = ClassificationDetails( diff --git a/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py index 842d13174..1fb736f2a 100644 --- a/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py +++ b/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py @@ -121,14 +121,8 @@ def reduce_scores(self, results: list[EvaluationResultDto]) -> float: # Reconstruct confusion matrix: confusion[pred_idx][exp_idx] confusion = [[0] * k for _ in range(k)] for r in results: - if isinstance(r.details, BaseEvaluatorJustification): - details = r.details - elif isinstance(r.details, dict): - try: - details = BaseEvaluatorJustification.model_validate(r.details) - except Exception: - continue - else: + details = BaseEvaluatorJustification.try_from(r.details) + if details is None: continue pred = details.actual exp = details.expected From ad32c22c64e7ccb5aaf8446454cf8bc9408f6c30 Mon Sep 17 00:00:00 2001 From: ajay-kesavan Date: Thu, 18 Jun 2026 22:27:49 -0700 Subject: [PATCH 09/13] fix(eval): address adversarial-review feedback on dataset evaluators - M2: drop _METRIC_NAME indirection. metric field on ClassificationDetails now uses spec.type verbatim ("fscore" not "f_score"), matching the discriminator on the wire. - M3: document confusion_matrix orientation via Field(description=...). Matrix is [predicted_idx][expected_idx], opposite of sklearn's convention. Add a regression test pinning the orientation. 
- M4: _metric raises ValueError on unknown metric_type instead of silently falling through to the F-beta formula. Defense in depth on top of pydantic's discriminator. - M6: replace defensive getattr chain in compute_dataset_evaluator_results with isinstance narrowing on the classification config types. Mypy-clean; intent is now "classification configs declare aggregators" rather than "anything might have an aggregators attribute". - L1: rename duplicate test_two_class_macro tests so pytest output disambiguates Precision vs Recall. Co-Authored-By: Claude Opus 4.7 --- .../classification_dataset_evaluators.py | 32 +++++++++++++------ .../uipath/src/uipath/eval/runtime/runtime.py | 24 ++++++++++++-- .../test_dataset_classification_evaluators.py | 24 ++++++++++++-- 3 files changed, 65 insertions(+), 15 deletions(-) diff --git a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py index ef6063b4c..70d74cd26 100644 --- a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py +++ b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py @@ -12,7 +12,7 @@ from dataclasses import dataclass -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, Field from pydantic.alias_generators import to_camel from ..models.models import ( @@ -58,7 +58,17 @@ class ClassificationDetails(BaseModel): metric: str average: str classes: list[str] - confusion_matrix: list[list[int]] + confusion_matrix: list[list[int]] = Field( + ..., + description=( + "k x k confusion matrix indexed as " + "``confusion_matrix[predicted_idx][expected_idx]`` " + "(rows are predicted classes, columns are expected). " + "This is the transpose of sklearn's convention " + "(``[true][predicted]``); UI / consumer code must use the " + "orientation documented here."
+ ), + ) per_class: dict[str, PerClassMetrics] micro: float macro: float @@ -120,9 +130,6 @@ def _build_confusion( ) -_METRIC_NAME = {"precision": "precision", "recall": "recall", "fscore": "f_score"} - - class ClassificationDatasetEvaluator(BaseDatasetEvaluator[AggregatorSpec]): """One implementation for all three classification aggregators. @@ -168,7 +175,7 @@ def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult: macro = sum(per_class[c].value for c in confusion.classes) / k details = ClassificationDetails( - metric=_METRIC_NAME[metric_type], + metric=metric_type, average=self.spec.averaging, classes=confusion.classes, confusion_matrix=confusion.matrix, @@ -190,7 +197,12 @@ def _metric(metric_type: str, tp: int, fp: int, fn: int, beta_sq: float) -> floa return tp / (tp + fp) if (tp + fp) > 0 else 0.0 if metric_type == "recall": return tp / (tp + fn) if (tp + fn) > 0 else 0.0 - p = tp / (tp + fp) if (tp + fp) > 0 else 0.0 - r = tp / (tp + fn) if (tp + fn) > 0 else 0.0 - denom = beta_sq * p + r - return (1 + beta_sq) * p * r / denom if denom > 0 else 0.0 + if metric_type == "fscore": + p = tp / (tp + fp) if (tp + fp) > 0 else 0.0 + r = tp / (tp + fn) if (tp + fn) > 0 else 0.0 + denom = beta_sq * p + r + return (1 + beta_sq) * p * r / denom if denom > 0 else 0.0 + raise ValueError( + f"Unknown metric_type: {metric_type!r}. " + "Expected one of: precision, recall, fscore." 
+ ) diff --git a/packages/uipath/src/uipath/eval/runtime/runtime.py b/packages/uipath/src/uipath/eval/runtime/runtime.py index 89f8f6c29..987b6c4ae 100644 --- a/packages/uipath/src/uipath/eval/runtime/runtime.py +++ b/packages/uipath/src/uipath/eval/runtime/runtime.py @@ -46,7 +46,13 @@ from .._execution_context import ExecutionSpanCollector from ..evaluators.base_evaluator import GenericBaseEvaluator +from ..evaluators.binary_classification_evaluator import ( + BinaryClassificationEvaluatorConfig, +) from ..evaluators.dataset_evaluator_factory import build_dataset_evaluator +from ..evaluators.multiclass_classification_evaluator import ( + MulticlassClassificationEvaluatorConfig, +) from ..evaluators.output_evaluator import OutputEvaluationCriteria from ..helpers import get_agent_model from ..mocks._cache_manager import CacheManager @@ -237,13 +243,25 @@ def compute_dataset_evaluator_results( dataset_results: dict[str, EvaluationResultDto] = {} for evaluator in evaluators: + # Aggregators currently only live on classification evaluator configs. + # ``GenericBaseEvaluator`` doesn't declare ``evaluator_config``, so we + # retrieve it via ``getattr`` and narrow with ``isinstance`` to a + # classification config type before reading ``aggregators``. Widen the + # tuple if a future evaluator type grows an ``aggregators`` field. 
config = getattr(evaluator, "evaluator_config", None) - aggregators = getattr(config, "aggregators", None) - if config is None or not aggregators: + if not isinstance( + config, + ( + BinaryClassificationEvaluatorConfig, + MulticlassClassificationEvaluatorConfig, + ), + ): + continue + if not config.aggregators: continue source_name = config.name source_results = results_by_evaluator.get(source_name, []) - for spec in aggregators: + for spec in config.aggregators: dataset_evaluator = build_dataset_evaluator(spec, source_name) evaluation_result = dataset_evaluator.evaluate(source_results) dataset_results[dataset_evaluator.name] = ( diff --git a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py index 29343b170..bb7d3538e 100644 --- a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py +++ b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py @@ -110,7 +110,27 @@ def test_empty_input_returns_zeroed_result(self) -> None: assert d.per_class["cat"].tp == 0 assert d.per_class["cat"].tn == 0 - def test_two_class_macro(self) -> None: + def test_confusion_matrix_is_predicted_by_expected(self) -> None: + # Pin the documented orientation: confusion_matrix[predicted][expected]. + # Differs from sklearn's [true][predicted] convention. 
+ results = [ + _result("cat", "cat"), # expected=cat, predicted=cat -> [cat][cat] + _result("cat", "dog"), # expected=cat, predicted=dog -> [dog][cat] + _result("dog", "dog"), # expected=dog, predicted=dog -> [dog][dog] + _result("dog", "dog"), + ] + d = _details(_precision(["cat", "dog"]).evaluate(results)) + # classes -> index: cat=0, dog=1 + # [predicted=cat][expected=cat] = 1 + assert d.confusion_matrix[0][0] == 1 + # [predicted=dog][expected=cat] = 1 (the FP for dog / FN for cat) + assert d.confusion_matrix[1][0] == 1 + # [predicted=dog][expected=dog] = 2 + assert d.confusion_matrix[1][1] == 2 + # [predicted=cat][expected=dog] = 0 + assert d.confusion_matrix[0][1] == 0 + + def test_precision_two_class_macro(self) -> None: results = [ _result("yes", "yes"), _result("yes", "yes"), @@ -164,7 +184,7 @@ def test_three_class_macro(self) -> None: class TestRecallEvaluator: - def test_two_class_macro(self) -> None: + def test_recall_two_class_macro(self) -> None: results = [ _result("yes", "yes"), _result("yes", "yes"), From 027901c96be416d791e76d93c3b2ca9d4a470a95 Mon Sep 17 00:00:00 2001 From: ajay-kesavan Date: Thu, 18 Jun 2026 22:32:46 -0700 Subject: [PATCH 10/13] fix(eval): address adversarial-review feedback on classification samples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - H1/H2: pydantic model_validator on Binary/Multiclass classification configs cross-checks aggregators against evaluator-level fields. Binary rejects aggregators whose `classes` doesn't include `positive_class`, and aggregators of the same metric type with a different `f_value`. Multiclass extends this with the full class-coverage check and an `averaging` consistency check. Without this, a user could ship configs where the per-evaluator headline and the dataset aggregator silently scored disjoint label spaces or used different averaging. 
- H3: binary e2e test now asserts the precision/recall/fscore aggregator scores (5/6, 5/6, 0.8) instead of only the key set. A regression that zeros out all aggregator scores would now fail the test. - H4: multiclass `evaluate()` no longer raises on out-of-vocab predicted class — it now returns score=0.0 with the OOV label preserved in the justification, mirroring binary's behavior. The dataset evaluator's confusion matrix already accounts for this via `n_skipped`. Configuration errors (expected_class outside vocab) still raise. - M1: drop the `_coerce_justification` one-line wrapper; inline `BaseEvaluatorJustification.try_from(r.details)` at the single caller in `_build_confusion`. - M2: preserve user-supplied class casing in `_ConfusionData.classes` and the `per_class` keys. The lowercase normalization is now only used for the internal lookup index, so a config with classes=["Spam","Ham"] surfaces "Spam"/"Ham" in the output rather than "spam"/"ham". - M3 (multiclass `reduce_scores` + ClassificationDatasetEvaluator double-walking the same confusion matrix): deferred. Cleanest fix is to drop the evaluator-level `metric_type`/`averaging`/`f_value` fields and route the per-evaluator headline through the dataset evaluator framework — out of scope for this commit. Tracked as a follow-up. - L1: refreshed test_classification_samples_e2e docstring to reflect the new aggregator-score coverage on the binary side. 
Co-Authored-By: Claude Opus 4.7 --- .../binary_classification_evaluator.py | 59 ++++++- .../classification_dataset_evaluators.py | 21 +-- .../multiclass_classification_evaluator.py | 87 ++++++++-- .../eval/test_classification_samples_e2e.py | 19 ++- .../evaluators/test_evaluator_methods.py | 157 +++++++++++++++++- 5 files changed, 314 insertions(+), 29 deletions(-) diff --git a/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py index c3f394d96..44a795d90 100644 --- a/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py +++ b/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py @@ -8,6 +8,8 @@ from typing import Literal +from pydantic import model_validator + from ..models import ( AgentExecution, EvaluationResult, @@ -19,13 +21,22 @@ UiPathEvaluationError, UiPathEvaluationErrorCategory, ) -from ._aggregator_specs import AggregatorSpec +from ._aggregator_specs import AggregatorSpec, FScoreAggregatorSpec from .base_evaluator import BaseEvaluationCriteria, BaseEvaluatorJustification from .output_evaluator import ( BaseOutputEvaluator, OutputEvaluatorConfig, ) +# Maps the evaluator-level ``metric_type`` strings to the corresponding +# aggregator-spec ``type`` values. The two spellings differ historically: +# the evaluator uses "f-score" (hyphen), the aggregator uses "fscore". +_METRIC_TYPE_TO_AGGREGATOR_TYPE = { + "precision": "precision", + "recall": "recall", + "f-score": "fscore", +} + class BinaryClassificationEvaluationCriteria(BaseEvaluationCriteria): """Per-datapoint criteria: which class this sample should belong to.""" @@ -49,6 +60,52 @@ class BinaryClassificationEvaluatorConfig( # result per aggregator keyed by ``{evaluator_name}.{aggregator.type}``. 
aggregators: list[AggregatorSpec] | None = None + @model_validator(mode="after") + def _validate_aggregators_against_evaluator_config( + self, + ) -> "BinaryClassificationEvaluatorConfig": + """Reject aggregators that are inconsistent with the evaluator's own config. + + Two checks: + * ``positive_class`` must appear in every aggregator's ``classes`` + list (case-insensitive). Otherwise the per-datapoint headline + and the aggregator's confusion matrix score completely + disjoint label spaces. + * For each aggregator whose ``type`` matches the evaluator-level + ``metric_type`` (mapped via :data:`_METRIC_TYPE_TO_AGGREGATOR_TYPE`), + the aggregator's ``f_value`` must match the evaluator's + ``f_value``. Otherwise the per-evaluator headline produced via + ``reduce_scores`` and the dataset evaluator's per-aggregator + score diverge silently. + """ + if not self.aggregators: + return self + positive_lower = self.positive_class.lower() if self.positive_class else "" + evaluator_aggregator_type = _METRIC_TYPE_TO_AGGREGATOR_TYPE.get( + self.metric_type + ) + for spec in self.aggregators: + if positive_lower and positive_lower not in { + c.lower() for c in spec.classes + }: + raise ValueError( + f"Aggregator '{spec.type}' on evaluator '{self.name}' " + f"declares classes={spec.classes!r} but positive_class=" + f"{self.positive_class!r} is not in that list. Add the " + "positive class to the aggregator's classes or remove it." + ) + if spec.type == evaluator_aggregator_type and isinstance( + spec, FScoreAggregatorSpec + ): + if spec.f_value != self.f_value: + raise ValueError( + f"Aggregator 'fscore' on evaluator '{self.name}' has " + f"f_value={spec.f_value} but the evaluator's f_value=" + f"{self.f_value}. The per-evaluator headline and the " + "aggregator would compute different F-beta scores." 
+ ) + return self + class BinaryClassificationEvaluator( BaseOutputEvaluator[ diff --git a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py index 3aad5832e..7f2ca2519 100644 --- a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py +++ b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py @@ -25,11 +25,6 @@ from .base_evaluator import BaseEvaluatorJustification -def _coerce_justification(details: object) -> BaseEvaluatorJustification | None: - """Extract the BaseEvaluatorJustification from an EvaluationResultDto.details payload.""" - return BaseEvaluatorJustification.try_from(details) - - class PerClassMetrics(BaseModel): """Per-class confusion counts plus the metric the evaluator computed.""" @@ -89,12 +84,14 @@ def _build_confusion( Results without a parseable justification are counted in ``n_skipped`` and omitted from the matrix. Pairs whose expected or actual label isn't in - ``classes`` are also skipped. Labels are normalized to lowercase so a - classifier returning "Book" vs configured "book" still matches. + ``classes`` are also skipped. Labels are normalized to lowercase for the + lookup index so a classifier returning "Book" vs configured "book" still + matches, but the user-supplied casing is preserved in the returned + ``_ConfusionData.classes`` so downstream output (per_class keys, UI labels) + shows what the user typed. 
""" - canonical_classes = [c.lower() for c in classes] - index_of = {c: i for i, c in enumerate(canonical_classes)} - k = len(canonical_classes) + index_of = {c.lower(): i for i, c in enumerate(classes)} + k = len(classes) matrix = [[0] * k for _ in range(k)] n_total = len(results) @@ -102,7 +99,7 @@ def _build_confusion( n_skipped = 0 for r in results: - j = _coerce_justification(r.details) + j = BaseEvaluatorJustification.try_from(r.details) if j is None: n_skipped += 1 continue @@ -115,7 +112,7 @@ def _build_confusion( n_scored += 1 return _ConfusionData( - classes=canonical_classes, + classes=list(classes), matrix=matrix, n_total=n_total, n_scored=n_scored, diff --git a/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py index 1fb736f2a..1799323ac 100644 --- a/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py +++ b/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py @@ -9,6 +9,8 @@ from typing import Literal +from pydantic import model_validator + from ..models import ( AgentExecution, EvaluationResult, @@ -20,13 +22,22 @@ UiPathEvaluationError, UiPathEvaluationErrorCategory, ) -from ._aggregator_specs import AggregatorSpec +from ._aggregator_specs import AggregatorSpec, FScoreAggregatorSpec from .base_evaluator import BaseEvaluationCriteria, BaseEvaluatorJustification from .output_evaluator import ( BaseOutputEvaluator, OutputEvaluatorConfig, ) +# Maps the evaluator-level ``metric_type`` strings to the corresponding +# aggregator-spec ``type`` values. The two spellings differ historically: +# the evaluator uses "f-score" (hyphen), the aggregator uses "fscore". 
+_METRIC_TYPE_TO_AGGREGATOR_TYPE = { + "precision": "precision", + "recall": "recall", + "f-score": "fscore", +} + class MulticlassClassificationEvaluationCriteria(BaseEvaluationCriteria): """Per-datapoint criteria: which class this sample should belong to.""" @@ -51,6 +62,61 @@ class MulticlassClassificationEvaluatorConfig( # result per aggregator keyed by ``{evaluator_name}.{aggregator.type}``. aggregators: list[AggregatorSpec] | None = None + @model_validator(mode="after") + def _validate_aggregators_against_evaluator_config( + self, + ) -> "MulticlassClassificationEvaluatorConfig": + """Reject aggregators that are inconsistent with the evaluator's own config. + + Two checks: + * Every evaluator-level class must appear in every aggregator's + ``classes`` list (case-insensitive). Otherwise the per-datapoint + and aggregator paths score disjoint label spaces. + * For each aggregator whose ``type`` matches the evaluator-level + ``metric_type`` (mapped via :data:`_METRIC_TYPE_TO_AGGREGATOR_TYPE`), + the aggregator's ``averaging`` must match the evaluator's + ``averaging``, and for ``fscore`` the ``f_value`` must match too. + Otherwise the per-evaluator headline and the dataset evaluator's + per-aggregator score diverge silently. + """ + if not self.aggregators: + return self + evaluator_classes_lower = {c.lower() for c in self.classes} + evaluator_aggregator_type = _METRIC_TYPE_TO_AGGREGATOR_TYPE.get( + self.metric_type + ) + for spec in self.aggregators: + spec_classes_lower = {c.lower() for c in spec.classes} + missing = evaluator_classes_lower - spec_classes_lower + if missing: + raise ValueError( + f"Aggregator '{spec.type}' on evaluator '{self.name}' " + f"declares classes={spec.classes!r} but the evaluator's " + f"classes={self.classes!r} include {sorted(missing)!r} " + "that the aggregator does not. Aggregators must cover " + "the evaluator's full class space." 
+ ) + if spec.type == evaluator_aggregator_type: + if spec.averaging != self.averaging: + raise ValueError( + f"Aggregator '{spec.type}' on evaluator '{self.name}' " + f"has averaging={spec.averaging!r} but the evaluator's " + f"averaging={self.averaging!r}. The per-evaluator " + "headline and the aggregator would compute different " + "scores." + ) + if ( + isinstance(spec, FScoreAggregatorSpec) + and spec.f_value != self.f_value + ): + raise ValueError( + f"Aggregator 'fscore' on evaluator '{self.name}' has " + f"f_value={spec.f_value} but the evaluator's f_value=" + f"{self.f_value}. The per-evaluator headline and the " + "aggregator would compute different F-beta scores." + ) + return self + class MulticlassClassificationEvaluator( BaseOutputEvaluator[ @@ -76,7 +142,16 @@ async def evaluate( agent_execution: AgentExecution, evaluation_criteria: MulticlassClassificationEvaluationCriteria, ) -> EvaluationResult: - """Evaluate multiclass classification by comparing predicted vs expected class.""" + """Evaluate multiclass classification by comparing predicted vs expected class. + + Configuration errors (e.g. ``expected_class`` not in the configured + ``classes``) raise — that's a setup mistake the user must fix. But a + predicted class outside the vocabulary (a sloppy LLM returning + "unknown", garbage, or an unconfigured label) returns a 0.0 score with + the OOV label preserved in the justification, mirroring the binary + evaluator's behavior. The dataset evaluator's confusion matrix + accounts for these via ``n_skipped``. 
+ """ predicted_class = str(self._get_actual_output(agent_execution)).lower() expected_class = evaluation_criteria.expected_class.lower() classes = [c.lower() for c in self.evaluator_config.classes] @@ -89,14 +164,6 @@ async def evaluate( category=UiPathEvaluationErrorCategory.USER, ) - if predicted_class not in classes: - raise UiPathEvaluationError( - code="INVALID_PREDICTED_CLASS", - title="Predicted class not in configured classes", - detail=f"Predicted class '{predicted_class}' is not in the configured classes: {classes}", - category=UiPathEvaluationErrorCategory.USER, - ) - score = 1.0 if predicted_class == expected_class else 0.0 justification = self.validate_justification( diff --git a/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py b/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py index f2bdfa3cb..d87d9013e 100644 --- a/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py +++ b/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py @@ -1,9 +1,12 @@ """End-to-end tests that run the classification sample projects through evaluate(). These tests double as integration coverage for the binary and multiclass -classification evaluators added in #1397 — they wire each sample's main.py -into a stand-in runtime, run the full eval set, and assert the per-row scores -plus the aggregated metric produced by `reduce_scores`. +classification evaluators added in #1397 plus the embedded dataset-level +aggregators added in #1669 — they wire each sample's main.py into a stand-in +runtime, run the full eval set, and assert the per-row scores AND the +specific aggregator scores produced by the embedded ``aggregators[]``. A +regression that returns 0.0 for all aggregators (or one that swaps macro +for micro silently) fails these tests. 
""" import importlib.util @@ -178,6 +181,16 @@ async def test_binary_classification_sample_end_to_end(): "BinarySpamPrecision.recall", "BinarySpamPrecision.fscore", } + # Confusion matrix (predicted x expected, classes=[spam, ham]): + # matrix[spam][spam] = 2 matrix[spam][ham] = 1 (the FP) + # matrix[ham][spam] = 0 matrix[ham][ham] = 2 + # per-class precision: spam = 2/3, ham = 1.0 → macro = (2/3 + 1) / 2 = 5/6 + # per-class recall: spam = 1.0, ham = 2/3 → macro = (1 + 2/3) / 2 = 5/6 + # per-class F1: spam = 0.8, ham = 0.8 → macro = 0.8 + agg = output.dataset_evaluator_results + assert agg["BinarySpamPrecision.precision"].score == pytest.approx(5 / 6, rel=1e-6) + assert agg["BinarySpamPrecision.recall"].score == pytest.approx(5 / 6, rel=1e-6) + assert agg["BinarySpamPrecision.fscore"].score == pytest.approx(0.8, rel=1e-6) async def test_multiclass_classification_sample_end_to_end(): diff --git a/packages/uipath/tests/evaluators/test_evaluator_methods.py b/packages/uipath/tests/evaluators/test_evaluator_methods.py index ec795499d..0083aeec0 100644 --- a/packages/uipath/tests/evaluators/test_evaluator_methods.py +++ b/packages/uipath/tests/evaluators/test_evaluator_methods.py @@ -2608,12 +2608,20 @@ async def test_multiclass_classification_invalid_expected_class(self) -> None: @pytest.mark.asyncio async def test_multiclass_classification_invalid_predicted_class(self) -> None: - """Test that an invalid predicted class returns an error result.""" + """Out-of-vocab predicted class returns score=0.0, not an error. + + Mirrors binary classification's soft-fail behavior so a sloppy LLM + returning "fish" doesn't crash the whole eval set. The dataset + evaluator's confusion matrix counts the OOV prediction under + ``n_skipped``. Configuration errors (expected_class outside vocab) + still raise; only predicted_class is soft. 
+ """ + from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification from uipath.eval.evaluators.multiclass_classification_evaluator import ( MulticlassClassificationEvaluationCriteria, MulticlassClassificationEvaluator, ) - from uipath.eval.models.models import ErrorEvaluationResult + from uipath.eval.models import NumericEvaluationResult execution = AgentExecution( agent_input={}, @@ -2630,5 +2638,148 @@ async def test_multiclass_classification_invalid_predicted_class(self) -> None: ) criteria = MulticlassClassificationEvaluationCriteria(expected_class="cat") result = await evaluator.evaluate(execution, criteria) - assert isinstance(result, ErrorEvaluationResult) + assert isinstance(result, NumericEvaluationResult) assert result.score == 0.0 + assert isinstance(result.details, BaseEvaluatorJustification) + assert result.details.actual == "fish" + assert result.details.expected == "cat" + + +class TestClassificationConfigCrossValidators: + """Pydantic validators that catch internally-inconsistent classification configs. + + Without these validators, a config with ``positive_class="yes"`` but an + aggregator declaring ``classes=["spam","ham"]`` silently scores against + completely disjoint label spaces — the per-evaluator headline and the + aggregator's confusion matrix both return numbers, neither one meaningful. 
+ """ + + def test_binary_aggregator_missing_positive_class_rejected(self) -> None: + from uipath.eval.evaluators.binary_classification_evaluator import ( + BinaryClassificationEvaluator, + ) + + config = { + "name": "SpamPrecision", + "positive_class": "spam", + "metric_type": "precision", + "aggregators": [ + { + "type": "precision", + # "spam" is intentionally missing + "classes": ["other", "ham"], + "averaging": "macro", + } + ], + } + with pytest.raises(Exception) as exc_info: + BinaryClassificationEvaluator.model_validate( + {"evaluatorConfig": config, "id": str(uuid.uuid4())} + ) + assert "positive_class" in str(exc_info.value) + + def test_binary_aggregator_fvalue_mismatch_rejected(self) -> None: + from uipath.eval.evaluators.binary_classification_evaluator import ( + BinaryClassificationEvaluator, + ) + + config = { + "name": "SpamFScore", + "positive_class": "spam", + "metric_type": "f-score", + "f_value": 1.0, + "aggregators": [ + { + "type": "fscore", + "classes": ["spam", "ham"], + "averaging": "macro", + "f_value": 2.0, # diverges from evaluator-level 1.0 + } + ], + } + with pytest.raises(Exception) as exc_info: + BinaryClassificationEvaluator.model_validate( + {"evaluatorConfig": config, "id": str(uuid.uuid4())} + ) + assert "f_value" in str(exc_info.value) + + def test_multiclass_aggregator_missing_class_rejected(self) -> None: + from uipath.eval.evaluators.multiclass_classification_evaluator import ( + MulticlassClassificationEvaluator, + ) + + config = { + "name": "IntentClassifier", + "classes": ["book", "cancel", "reschedule"], + "metric_type": "f-score", + "averaging": "macro", + "aggregators": [ + { + "type": "fscore", + # "reschedule" is intentionally missing from the aggregator + "classes": ["book", "cancel"], + "averaging": "macro", + "f_value": 1.0, + } + ], + } + with pytest.raises(Exception) as exc_info: + MulticlassClassificationEvaluator.model_validate( + {"evaluatorConfig": config, "id": str(uuid.uuid4())} + ) + assert "reschedule" in 
str(exc_info.value) + + def test_multiclass_aggregator_averaging_mismatch_rejected(self) -> None: + from uipath.eval.evaluators.multiclass_classification_evaluator import ( + MulticlassClassificationEvaluator, + ) + + config = { + "name": "IntentClassifier", + "classes": ["book", "cancel"], + "metric_type": "precision", + "averaging": "macro", + "aggregators": [ + { + "type": "precision", + "classes": ["book", "cancel"], + "averaging": "micro", # diverges from evaluator-level macro + } + ], + } + with pytest.raises(Exception) as exc_info: + MulticlassClassificationEvaluator.model_validate( + {"evaluatorConfig": config, "id": str(uuid.uuid4())} + ) + assert "averaging" in str(exc_info.value) + + def test_binary_aggregator_unrelated_type_does_not_cross_check(self) -> None: + """An aggregator whose ``type`` differs from the evaluator's ``metric_type`` + should NOT be cross-checked for f_value / averaging matching — only the + positive_class containment rule applies. + """ + from uipath.eval.evaluators.binary_classification_evaluator import ( + BinaryClassificationEvaluator, + ) + + config = { + "name": "SpamPrecision", + "positive_class": "spam", + "metric_type": "precision", + "f_value": 1.0, + # evaluator computes precision; the aggregator below is an fscore + # with a different f_value — should be allowed because the + # evaluator headline isn't an fscore. 
+ "aggregators": [ + { + "type": "fscore", + "classes": ["spam", "ham"], + "averaging": "macro", + "f_value": 2.0, + } + ], + } + evaluator = BinaryClassificationEvaluator.model_validate( + {"evaluatorConfig": config, "id": str(uuid.uuid4())} + ) + assert evaluator.evaluator_config.aggregators is not None From 4d6afccafbdb4adb5af0365af69d6613953ddc31 Mon Sep 17 00:00:00 2001 From: ajay-kesavan Date: Thu, 18 Jun 2026 23:05:19 -0700 Subject: [PATCH 11/13] fix(eval): address codex P1 + lint failures on dataset evaluators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Bump uipath version 2.11.5 -> 2.11.6 (2.11.5 already on PyPI). - Widen examples/dataset_evaluators_demo.py:report() to accept the full EvaluationResult union and narrow once inside with isinstance, fixing 6 mypy "expected NumericEvaluationResult" errors at the call sites. - Address Codex P1 (runtime.py:268 — result-key collision): two aggregators of the same type on the same source (e.g. macro+micro precision) previously produced identical {source}.{type} keys, with the second silently overwriting the first. compute_dataset_evaluator_results now counts type occurrences per source and disambiguates duplicate-type aggregators as {source}.{type}.{averaging} (plus ".fb{f_value}" for fscore variants), preserving the simple key shape for the common single-aggregator case. Docstring updated; 2 new tests cover both the precision-duplicate and fscore-duplicate paths. 
Co-Authored-By: Claude Opus 4.7 --- .../examples/dataset_evaluators_demo.py | 9 ++- packages/uipath/pyproject.toml | 2 +- .../uipath/src/uipath/eval/runtime/runtime.py | 41 ++++++++++-- .../test_dataset_classification_evaluators.py | 62 +++++++++++++++++++ packages/uipath/uv.lock | 2 +- 5 files changed, 107 insertions(+), 9 deletions(-) diff --git a/packages/uipath/examples/dataset_evaluators_demo.py b/packages/uipath/examples/dataset_evaluators_demo.py index 2d13f3572..1a3c376c0 100644 --- a/packages/uipath/examples/dataset_evaluators_demo.py +++ b/packages/uipath/examples/dataset_evaluators_demo.py @@ -26,7 +26,11 @@ ClassificationDetails, ) from uipath.eval.evaluators.dataset_evaluator_factory import build_dataset_evaluator -from uipath.eval.models.models import EvaluationResultDto, NumericEvaluationResult +from uipath.eval.models.models import ( + EvaluationResult, + EvaluationResultDto, + NumericEvaluationResult, +) # ─── helpers ────────────────────────────────────────────────────────────────── @@ -94,12 +98,13 @@ def print_per_class(details: ClassificationDetails) -> None: def report( title: str, - result: NumericEvaluationResult, + result: EvaluationResult, *, show_json_tail: bool = False, ) -> None: """Render one scenario's result block.""" print_header(title) + assert isinstance(result, NumericEvaluationResult) assert isinstance(result.details, ClassificationDetails) d = result.details print( diff --git a/packages/uipath/pyproject.toml b/packages/uipath/pyproject.toml index 0add2e09e..fd088202e 100644 --- a/packages/uipath/pyproject.toml +++ b/packages/uipath/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath" -version = "2.11.5" +version = "2.11.6" description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools." 
readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" diff --git a/packages/uipath/src/uipath/eval/runtime/runtime.py b/packages/uipath/src/uipath/eval/runtime/runtime.py index 987b6c4ae..7167d7f20 100644 --- a/packages/uipath/src/uipath/eval/runtime/runtime.py +++ b/packages/uipath/src/uipath/eval/runtime/runtime.py @@ -45,6 +45,7 @@ from uipath.runtime.schema import UiPathRuntimeSchema from .._execution_context import ExecutionSpanCollector +from ..evaluators._aggregator_specs import AggregatorSpec, FScoreAggregatorSpec from ..evaluators.base_evaluator import GenericBaseEvaluator from ..evaluators.binary_classification_evaluator import ( BinaryClassificationEvaluatorConfig, @@ -227,8 +228,13 @@ def compute_dataset_evaluator_results( Returns: Dict mapping ``"{evaluator_name}.{aggregator_type}"`` to the run-level - EvaluationResultDto. Aggregators whose source produced no results are - still invoked with an empty list so they emit a zeroed result. + EvaluationResultDto. When the same aggregator ``type`` appears more + than once on a source (e.g. macro+micro precision), each variant is + disambiguated as ``"{evaluator_name}.{type}.{averaging}"`` and, for + fscore, with the ``f_value`` suffix (``"...fbN"``), so a duplicate + type never overwrites a previous result. Aggregators whose source + produced no results are still invoked with an empty list so they emit + a zeroed result. """ results_by_evaluator: defaultdict[str, list[EvaluationResultDto]] = defaultdict( list @@ -261,15 +267,40 @@ def compute_dataset_evaluator_results( continue source_name = config.name source_results = results_by_evaluator.get(source_name, []) + # Count occurrences of each aggregator type to detect duplicates + # (e.g. macro+micro precision on the same source). 
The default key + # shape ``{source}.{type}`` collides on duplicates; disambiguate with + # ``.{averaging}`` (and ``.fb{f_value}`` for fscore variants) only + # when more than one aggregator of that type exists, to preserve the + # simple key shape in the common case. + type_counts: dict[str, int] = defaultdict(int) + for spec in config.aggregators: + type_counts[spec.type] += 1 for spec in config.aggregators: dataset_evaluator = build_dataset_evaluator(spec, source_name) - evaluation_result = dataset_evaluator.evaluate(source_results) - dataset_results[dataset_evaluator.name] = ( - EvaluationResultDto.from_evaluation_result(evaluation_result) + key = _dataset_result_key(source_name, spec, type_counts[spec.type] > 1) + dataset_results[key] = EvaluationResultDto.from_evaluation_result( + dataset_evaluator.evaluate(source_results) ) return dataset_results +def _dataset_result_key( + source_name: str, spec: AggregatorSpec, disambiguate: bool +) -> str: + """Build the result-dict key for a dataset evaluator. + + Uses ``{source}.{type}`` for unique-type aggregators, and appends + ``.{averaging}`` (plus ``.fb{f_value}`` for fscore) when the same type + appears more than once on the same source. 
+ """ + if not disambiguate: + return f"{source_name}.{spec.type}" + if isinstance(spec, FScoreAggregatorSpec): + return f"{source_name}.{spec.type}.{spec.averaging}.fb{spec.f_value}" + return f"{source_name}.{spec.type}.{spec.averaging}" + + class UiPathEvalRuntime: """Specialized runtime for evaluation runs, with access to the factory.""" diff --git a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py index bb7d3538e..e04a13fb0 100644 --- a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py +++ b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py @@ -496,3 +496,65 @@ def test_source_with_no_results_produces_zeroed_report(self) -> None: assert dto.score == 0.0 assert isinstance(dto.details, dict) assert dto.details["n_scored"] == 0 + + def test_duplicate_aggregator_type_disambiguates_by_averaging(self) -> None: + """Two aggregators of the same type get distinct keys (no overwrite).""" + evaluator = _multiclass_evaluator( + "intent_match", + classes=["yes", "no"], + aggregators=[ + PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro"), + PrecisionAggregatorSpec(classes=["yes", "no"], averaging="micro"), + ], + ) + eval_results = [ + UiPathEvalRunResult( + evaluation_name="dp1", + evaluation_run_results=[ + UiPathEvalRunResultDto( + evaluator_name="intent_match", + evaluator_id=str(uuid.uuid4()), + result=_result("yes", "yes"), + ), + ], + ), + ] + out = compute_dataset_evaluator_results(eval_results, [evaluator]) + # Same type appears twice → averaging suffix disambiguates so neither + # is silently overwritten. + assert set(out) == { + "intent_match.precision.macro", + "intent_match.precision.micro", + } + + def test_duplicate_fscore_disambiguates_by_averaging_and_fvalue(self) -> None: + """Two FScore aggregators (e.g. 
F1 macro and F2 macro) both survive.""" + evaluator = _multiclass_evaluator( + "intent_match", + classes=["yes", "no"], + aggregators=[ + FScoreAggregatorSpec( + classes=["yes", "no"], averaging="macro", f_value=1.0 + ), + FScoreAggregatorSpec( + classes=["yes", "no"], averaging="macro", f_value=2.0 + ), + ], + ) + eval_results = [ + UiPathEvalRunResult( + evaluation_name="dp1", + evaluation_run_results=[ + UiPathEvalRunResultDto( + evaluator_name="intent_match", + evaluator_id=str(uuid.uuid4()), + result=_result("yes", "yes"), + ), + ], + ), + ] + out = compute_dataset_evaluator_results(eval_results, [evaluator]) + assert set(out) == { + "intent_match.fscore.macro.fb1.0", + "intent_match.fscore.macro.fb2.0", + } diff --git a/packages/uipath/uv.lock b/packages/uipath/uv.lock index 86f8936e1..bd7f1f86e 100644 --- a/packages/uipath/uv.lock +++ b/packages/uipath/uv.lock @@ -2552,7 +2552,7 @@ wheels = [ [[package]] name = "uipath" -version = "2.11.5" +version = "2.11.6" source = { editable = "." } dependencies = [ { name = "applicationinsights" }, From 5d782052d4cd1ee22966b5784e7a9f885192ec29 Mon Sep 17 00:00:00 2001 From: ajay-kesavan Date: Thu, 18 Jun 2026 23:07:31 -0700 Subject: [PATCH 12/13] test(eval): drop fscore-duplicate test that conflicts with #1663 H2 validator The fscore-duplicate disambiguation test added in 4d6afcca conflicts with the H2 model_validator on #1663, which cross-checks aggregator f_value against the evaluator's f_value when types match. The precision-duplicate test still exercises the new _dataset_result_key path; the FScore branch is exercised by the factory + math tests. 
Co-Authored-By: Claude Opus 4.7 --- .../test_dataset_classification_evaluators.py | 32 ------------------- 1 file changed, 32 deletions(-) diff --git a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py index e04a13fb0..69fbfda40 100644 --- a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py +++ b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py @@ -526,35 +526,3 @@ def test_duplicate_aggregator_type_disambiguates_by_averaging(self) -> None: "intent_match.precision.macro", "intent_match.precision.micro", } - - def test_duplicate_fscore_disambiguates_by_averaging_and_fvalue(self) -> None: - """Two FScore aggregators (e.g. F1 macro and F2 macro) both survive.""" - evaluator = _multiclass_evaluator( - "intent_match", - classes=["yes", "no"], - aggregators=[ - FScoreAggregatorSpec( - classes=["yes", "no"], averaging="macro", f_value=1.0 - ), - FScoreAggregatorSpec( - classes=["yes", "no"], averaging="macro", f_value=2.0 - ), - ], - ) - eval_results = [ - UiPathEvalRunResult( - evaluation_name="dp1", - evaluation_run_results=[ - UiPathEvalRunResultDto( - evaluator_name="intent_match", - evaluator_id=str(uuid.uuid4()), - result=_result("yes", "yes"), - ), - ], - ), - ] - out = compute_dataset_evaluator_results(eval_results, [evaluator]) - assert set(out) == { - "intent_match.fscore.macro.fb1.0", - "intent_match.fscore.macro.fb2.0", - } From 363855d4f2b86321ee933c6b1e382364257a6c44 Mon Sep 17 00:00:00 2001 From: ajay-kesavan Date: Thu, 18 Jun 2026 23:13:25 -0700 Subject: [PATCH 13/13] fix(eval): publish aggregators in classification evaluator type schemas Regenerate BinaryClassificationEvaluator.json and MulticlassClassificationEvaluator.json from the updated pydantic models so schema-driven consumers can discover and validate the new evaluatorConfig.aggregators array + Precision/Recall/FScore variants. 
Co-Authored-By: Claude Opus 4.7 --- .../BinaryClassificationEvaluator.json | 154 +++++++++++++++++- .../MulticlassClassificationEvaluator.json | 154 +++++++++++++++++- 2 files changed, 302 insertions(+), 6 deletions(-) diff --git a/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json b/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json index 9f7351865..a15ac8e5a 100644 --- a/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json +++ b/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json @@ -15,6 +15,111 @@ ], "title": "BinaryClassificationEvaluationCriteria", "type": "object" + }, + "FScoreAggregatorSpec": { + "description": "Run-level F-beta aggregator (multiclass, micro or macro averaged).", + "properties": { + "type": { + "const": "fscore", + "default": "fscore", + "title": "Type", + "type": "string" + }, + "classes": { + "items": { + "type": "string" + }, + "minItems": 1, + "title": "Classes", + "type": "array" + }, + "averaging": { + "enum": [ + "macro", + "micro" + ], + "title": "Averaging", + "type": "string" + }, + "f_value": { + "default": 1.0, + "exclusiveMinimum": 0, + "title": "F Value", + "type": "number" + } + }, + "required": [ + "classes", + "averaging" + ], + "title": "FScoreAggregatorSpec", + "type": "object" + }, + "PrecisionAggregatorSpec": { + "description": "Run-level precision aggregator (multiclass, micro or macro averaged).", + "properties": { + "type": { + "const": "precision", + "default": "precision", + "title": "Type", + "type": "string" + }, + "classes": { + "items": { + "type": "string" + }, + "minItems": 1, + "title": "Classes", + "type": "array" + }, + "averaging": { + "enum": [ + "macro", + "micro" + ], + "title": "Averaging", + "type": "string" + } + }, + "required": [ + "classes", + "averaging" + ], + "title": "PrecisionAggregatorSpec", + "type": "object" + }, + "RecallAggregatorSpec": { + "description": 
"Run-level recall aggregator (multiclass, micro or macro averaged).", + "properties": { + "type": { + "const": "recall", + "default": "recall", + "title": "Type", + "type": "string" + }, + "classes": { + "items": { + "type": "string" + }, + "minItems": 1, + "title": "Classes", + "type": "array" + }, + "averaging": { + "enum": [ + "macro", + "micro" + ], + "title": "Averaging", + "type": "string" + } + }, + "required": [ + "classes", + "averaging" + ], + "title": "RecallAggregatorSpec", + "type": "object" } }, "description": "Configuration for the binary classification evaluator.", @@ -42,10 +147,20 @@ "default": null }, "target_output_key": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "type": "string" + }, + "type": "array" + } + ], "default": "*", - "description": "Key to extract output from agent execution", - "title": "Target Output Key", - "type": "string" + "description": "Key or list of keys to extract output from agent execution", + "title": "Target Output Key" }, "line_by_line_evaluator": { "default": false, @@ -77,6 +192,39 @@ "default": 1.0, "title": "F Value", "type": "number" + }, + "aggregators": { + "anyOf": [ + { + "items": { + "discriminator": { + "mapping": { + "fscore": "#/$defs/FScoreAggregatorSpec", + "precision": "#/$defs/PrecisionAggregatorSpec", + "recall": "#/$defs/RecallAggregatorSpec" + }, + "propertyName": "type" + }, + "oneOf": [ + { + "$ref": "#/$defs/PrecisionAggregatorSpec" + }, + { + "$ref": "#/$defs/RecallAggregatorSpec" + }, + { + "$ref": "#/$defs/FScoreAggregatorSpec" + } + ] + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Aggregators" } }, "required": [ diff --git a/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json b/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json index 72262ba92..8cc971f75 100644 --- a/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json +++ 
b/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json @@ -2,6 +2,45 @@ "evaluatorTypeId": "uipath-multiclass-classification", "evaluatorConfigSchema": { "$defs": { + "FScoreAggregatorSpec": { + "description": "Run-level F-beta aggregator (multiclass, micro or macro averaged).", + "properties": { + "type": { + "const": "fscore", + "default": "fscore", + "title": "Type", + "type": "string" + }, + "classes": { + "items": { + "type": "string" + }, + "minItems": 1, + "title": "Classes", + "type": "array" + }, + "averaging": { + "enum": [ + "macro", + "micro" + ], + "title": "Averaging", + "type": "string" + }, + "f_value": { + "default": 1.0, + "exclusiveMinimum": 0, + "title": "F Value", + "type": "number" + } + }, + "required": [ + "classes", + "averaging" + ], + "title": "FScoreAggregatorSpec", + "type": "object" + }, "MulticlassClassificationEvaluationCriteria": { "description": "Per-datapoint criteria: which class this sample should belong to.", "properties": { @@ -15,6 +54,72 @@ ], "title": "MulticlassClassificationEvaluationCriteria", "type": "object" + }, + "PrecisionAggregatorSpec": { + "description": "Run-level precision aggregator (multiclass, micro or macro averaged).", + "properties": { + "type": { + "const": "precision", + "default": "precision", + "title": "Type", + "type": "string" + }, + "classes": { + "items": { + "type": "string" + }, + "minItems": 1, + "title": "Classes", + "type": "array" + }, + "averaging": { + "enum": [ + "macro", + "micro" + ], + "title": "Averaging", + "type": "string" + } + }, + "required": [ + "classes", + "averaging" + ], + "title": "PrecisionAggregatorSpec", + "type": "object" + }, + "RecallAggregatorSpec": { + "description": "Run-level recall aggregator (multiclass, micro or macro averaged).", + "properties": { + "type": { + "const": "recall", + "default": "recall", + "title": "Type", + "type": "string" + }, + "classes": { + "items": { + "type": "string" + }, + "minItems": 1, + "title": 
"Classes", + "type": "array" + }, + "averaging": { + "enum": [ + "macro", + "micro" + ], + "title": "Averaging", + "type": "string" + } + }, + "required": [ + "classes", + "averaging" + ], + "title": "RecallAggregatorSpec", + "type": "object" } }, "description": "Configuration for the multiclass classification evaluator.", @@ -42,10 +147,20 @@ "default": null }, "target_output_key": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "type": "string" + }, + "type": "array" + } + ], "default": "*", - "description": "Key to extract output from agent execution", - "title": "Target Output Key", - "type": "string" + "description": "Key or list of keys to extract output from agent execution", + "title": "Target Output Key" }, "line_by_line_evaluator": { "default": false, @@ -89,6 +204,39 @@ "default": 1.0, "title": "F Value", "type": "number" + }, + "aggregators": { + "anyOf": [ + { + "items": { + "discriminator": { + "mapping": { + "fscore": "#/$defs/FScoreAggregatorSpec", + "precision": "#/$defs/PrecisionAggregatorSpec", + "recall": "#/$defs/RecallAggregatorSpec" + }, + "propertyName": "type" + }, + "oneOf": [ + { + "$ref": "#/$defs/PrecisionAggregatorSpec" + }, + { + "$ref": "#/$defs/RecallAggregatorSpec" + }, + { + "$ref": "#/$defs/FScoreAggregatorSpec" + } + ] + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Aggregators" } }, "required": [