From 6b11767d30fb08969146d4bb58ac8570cc20c34f Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Tue, 19 May 2026 17:54:34 -0700
Subject: [PATCH 01/13] feat(eval): add evaluator type schemas for
 classification evaluators

Generates BinaryClassificationEvaluator.json and MulticlassClassificationEvaluator.json
from the new evaluators added in #1397 so external tooling (Flow UI evaluator
picker, `uip maestro flow eval`) can read the config / criteria / justification
schemas.

Files produced by `python -m uipath.eval.evaluators_types.generate_types`,
restricted to the two new evaluator types. A companion PR refreshes the other
11 stale schemas in evaluators_types/.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../BinaryClassificationEvaluator.json        | 121 ++++++++++++++++
 .../MulticlassClassificationEvaluator.json    | 133 ++++++++++++++++++
 2 files changed, 254 insertions(+)
 create mode 100644 packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json
 create mode 100644 packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json

diff --git a/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json b/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json
new file mode 100644
index 000000000..9f7351865
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json
@@ -0,0 +1,121 @@
+{
+  "evaluatorTypeId": "uipath-binary-classification",
+  "evaluatorConfigSchema": {
+    "$defs": {
+      "BinaryClassificationEvaluationCriteria": {
+        "description": "Per-datapoint criteria: which class this sample should belong to.",
+        "properties": {
+          "expected_class": {
+            "title": "Expected Class",
+            "type": "string"
+          }
+        },
+        "required": [
+          "expected_class"
+        ],
+        "title": "BinaryClassificationEvaluationCriteria",
+        "type": "object"
+      }
+    },
+    "description": "Configuration for the binary classification evaluator.",
+    "properties": {
+      "name": {
+        "default": "BinaryClassificationEvaluator",
+        "title": "Name",
+        "type": "string"
+      },
+      "description": {
+        "default": "",
+        "description": "The description of the evaluator",
+        "title": "Description",
+        "type": "string"
+      },
+      "default_evaluation_criteria": {
+        "anyOf": [
+          {
+            "$ref": "#/$defs/BinaryClassificationEvaluationCriteria"
+          },
+          {
+            "type": "null"
+          }
+        ],
+        "default": null
+      },
+      "target_output_key": {
+        "default": "*",
+        "description": "Key to extract output from agent execution",
+        "title": "Target Output Key",
+        "type": "string"
+      },
+      "line_by_line_evaluator": {
+        "default": false,
+        "description": "If True, split output by delimiter and evaluate each line separately",
+        "title": "Line By Line Evaluator",
+        "type": "boolean"
+      },
+      "line_delimiter": {
+        "default": "\n",
+        "description": "Delimiter to split output when line_by_line_evaluator is True",
+        "title": "Line Delimiter",
+        "type": "string"
+      },
+      "positive_class": {
+        "title": "Positive Class",
+        "type": "string"
+      },
+      "metric_type": {
+        "default": "precision",
+        "enum": [
+          "precision",
+          "recall",
+          "f-score"
+        ],
+        "title": "Metric Type",
+        "type": "string"
+      },
+      "f_value": {
+        "default": 1.0,
+        "title": "F Value",
+        "type": "number"
+      }
+    },
+    "required": [
+      "positive_class"
+    ],
+    "title": "BinaryClassificationEvaluatorConfig",
+    "type": "object"
+  },
+  "evaluationCriteriaSchema": {
+    "description": "Per-datapoint criteria: which class this sample should belong to.",
+    "properties": {
+      "expected_class": {
+        "title": "Expected Class",
+        "type": "string"
+      }
+    },
+    "required": [
+      "expected_class"
+    ],
+    "title": "BinaryClassificationEvaluationCriteria",
+    "type": "object"
+  },
+  "justificationSchema": {
+    "description": "Base class for all evaluator justifications.",
+    "properties": {
+      "expected": {
+        "title": "Expected",
+        "type": "string"
+      },
+      "actual": {
+        "title": "Actual",
+        "type": "string"
+      }
+    },
+    "required": [
+      "expected",
+      "actual"
+    ],
+    "title": "BaseEvaluatorJustification",
+    "type": "object"
+  }
+}
\ No newline at end of file
diff --git a/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json b/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json
new file mode 100644
index 000000000..72262ba92
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json
@@ -0,0 +1,133 @@
+{
+  "evaluatorTypeId": "uipath-multiclass-classification",
+  "evaluatorConfigSchema": {
+    "$defs": {
+      "MulticlassClassificationEvaluationCriteria": {
+        "description": "Per-datapoint criteria: which class this sample should belong to.",
+        "properties": {
+          "expected_class": {
+            "title": "Expected Class",
+            "type": "string"
+          }
+        },
+        "required": [
+          "expected_class"
+        ],
+        "title": "MulticlassClassificationEvaluationCriteria",
+        "type": "object"
+      }
+    },
+    "description": "Configuration for the multiclass classification evaluator.",
+    "properties": {
+      "name": {
+        "default": "MulticlassClassificationEvaluator",
+        "title": "Name",
+        "type": "string"
+      },
+      "description": {
+        "default": "",
+        "description": "The description of the evaluator",
+        "title": "Description",
+        "type": "string"
+      },
+      "default_evaluation_criteria": {
+        "anyOf": [
+          {
+            "$ref": "#/$defs/MulticlassClassificationEvaluationCriteria"
+          },
+          {
+            "type": "null"
+          }
+        ],
+        "default": null
+      },
+      "target_output_key": {
+        "default": "*",
+        "description": "Key to extract output from agent execution",
+        "title": "Target Output Key",
+        "type": "string"
+      },
+      "line_by_line_evaluator": {
+        "default": false,
+        "description": "If True, split output by delimiter and evaluate each line separately",
+        "title": "Line By Line Evaluator",
+        "type": "boolean"
+      },
+      "line_delimiter": {
+        "default": "\n",
+        "description": "Delimiter to split output when line_by_line_evaluator is True",
+        "title": "Line Delimiter",
+        "type": "string"
+      },
+      "classes": {
+        "items": {
+          "type": "string"
+        },
+        "title": "Classes",
+        "type": "array"
+      },
+      "metric_type": {
+        "default": "f-score",
+        "enum": [
+          "precision",
+          "recall",
+          "f-score"
+        ],
+        "title": "Metric Type",
+        "type": "string"
+      },
+      "averaging": {
+        "default": "macro",
+        "enum": [
+          "micro",
+          "macro"
+        ],
+        "title": "Averaging",
+        "type": "string"
+      },
+      "f_value": {
+        "default": 1.0,
+        "title": "F Value",
+        "type": "number"
+      }
+    },
+    "required": [
+      "classes"
+    ],
+    "title": "MulticlassClassificationEvaluatorConfig",
+    "type": "object"
+  },
+  "evaluationCriteriaSchema": {
+    "description": "Per-datapoint criteria: which class this sample should belong to.",
+    "properties": {
+      "expected_class": {
+        "title": "Expected Class",
+        "type": "string"
+      }
+    },
+    "required": [
+      "expected_class"
+    ],
+    "title": "MulticlassClassificationEvaluationCriteria",
+    "type": "object"
+  },
+  "justificationSchema": {
+    "description": "Base class for all evaluator justifications.",
+    "properties": {
+      "expected": {
+        "title": "Expected",
+        "type": "string"
+      },
+      "actual": {
+        "title": "Actual",
+        "type": "string"
+      }
+    },
+    "required": [
+      "expected",
+      "actual"
+    ],
+    "title": "BaseEvaluatorJustification",
+    "type": "object"
+  }
+}
\ No newline at end of file

From 037b60cdb6e721c494b2b4fd173e6bf1bdb450ed Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Tue, 19 May 2026 18:27:58 -0700
Subject: [PATCH 02/13] test(eval): add e2e tests + sample projects for
 classification evaluators
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds two sample projects under packages/uipath/samples/ that double as
end-to-end test fixtures for the binary and multiclass classification
evaluators added in #1397:

- binary_classification_agent — rule-based spam/ham classifier wired up
  to the binary classification evaluator with metric_type=precision.
  Eval set is designed so 4/5 datapoints pass but precision is 2/3
  because of one deliberate false positive.
- multiclass_classification_simple — rule-based 3-class router (payments
  / support / spam) wired up to the multiclass classification evaluator
  with macro-averaged F1. Eval set forces a misroute that hurts both
  payments precision and support recall, giving macro F1 = 26/30 (~0.867).

Adds tests/cli/eval/test_classification_samples_e2e.py which loads each
sample's eval-sets/default.json, wires its main.py into a stand-in runtime,
calls evaluate(), and asserts both the per-row scores and the aggregated
metric produced by reduce_scores. Locks in the dataset-level math, not just
per-row correct/incorrect.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../binary_classification_agent/bindings.json |   4 +
 .../evaluations/eval-sets/default.json        |  63 ++++++
 .../evaluators/binary-classification.json     |  16 ++
 .../binary_classification_agent/main.py       |  39 ++++
 .../pyproject.toml                            |   9 +
 .../binary_classification_agent/uipath.json   |   5 +
 .../bindings.json                             |   4 +
 .../evaluations/eval-sets/default.json        |  85 ++++++++
 .../evaluators/multiclass-classification.json |  17 ++
 .../multiclass_classification_simple/main.py  |  51 +++++
 .../pyproject.toml                            |   9 +
 .../uipath.json                               |   5 +
 .../eval/test_classification_samples_e2e.py   | 193 ++++++++++++++++++
 13 files changed, 500 insertions(+)
 create mode 100644 packages/uipath/samples/binary_classification_agent/bindings.json
 create mode 100644 packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json
 create mode 100644 packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json
 create mode 100644 packages/uipath/samples/binary_classification_agent/main.py
 create mode 100644 packages/uipath/samples/binary_classification_agent/pyproject.toml
 create mode 100644 packages/uipath/samples/binary_classification_agent/uipath.json
 create mode 100644 packages/uipath/samples/multiclass_classification_simple/bindings.json
 create mode 100644 packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json
 create mode 100644 packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json
 create mode 100644 packages/uipath/samples/multiclass_classification_simple/main.py
 create mode 100644 packages/uipath/samples/multiclass_classification_simple/pyproject.toml
 create mode 100644 packages/uipath/samples/multiclass_classification_simple/uipath.json
 create mode 100644 packages/uipath/tests/cli/eval/test_classification_samples_e2e.py

diff --git a/packages/uipath/samples/binary_classification_agent/bindings.json b/packages/uipath/samples/binary_classification_agent/bindings.json
new file mode 100644
index 000000000..5e9beeb01
--- /dev/null
+++ b/packages/uipath/samples/binary_classification_agent/bindings.json
@@ -0,0 +1,4 @@
+{
+  "version": "2.0",
+  "resources": []
+}
diff --git a/packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json b/packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json
new file mode 100644
index 000000000..f47cd25b8
--- /dev/null
+++ b/packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json
@@ -0,0 +1,63 @@
+{
+  "version": "1.0",
+  "id": "SpamBinaryEval",
+  "name": "Binary spam classifier — precision",
+  "evaluatorRefs": ["BinarySpamPrecision"],
+  "evaluations": [
+    {
+      "id": "spam-prize",
+      "name": "Spam: prize giveaway",
+      "inputs": {
+        "email_subject": "You won a FREE iPhone!!!",
+        "email_body": "Congratulations! Click here to claim your prize now."
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "spam" }
+      }
+    },
+    {
+      "id": "spam-promo",
+      "name": "Spam: unsolicited promo",
+      "inputs": {
+        "email_subject": "Winner of the monthly drawing",
+        "email_body": "You've been selected. Click here to redeem."
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "spam" }
+      }
+    },
+    {
+      "id": "ham-invoice",
+      "name": "Ham: legitimate invoice",
+      "inputs": {
+        "email_subject": "Your March invoice is ready",
+        "email_body": "Your monthly invoice of $45.99 is attached. Payment is due March 15."
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "ham" }
+      }
+    },
+    {
+      "id": "ham-meeting",
+      "name": "Ham: meeting request",
+      "inputs": {
+        "email_subject": "Sync on Q2 planning",
+        "email_body": "Can we meet Wednesday at 2pm to align on next quarter's roadmap?"
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "ham" }
+      }
+    },
+    {
+      "id": "ham-mislabeled",
+      "name": "Ham mislabeled as spam (forces a false positive)",
+      "inputs": {
+        "email_subject": "Free coffee in the break room!!!",
+        "email_body": "Just a heads up — the new espresso machine is set up."
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "ham" }
+      }
+    }
+  ]
+}
diff --git a/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json b/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json
new file mode 100644
index 000000000..21f7d6850
--- /dev/null
+++ b/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json
@@ -0,0 +1,16 @@
+{
+  "version": "1.0",
+  "id": "BinarySpamPrecision",
+  "description": "Precision on the 'spam' positive class",
+  "evaluatorTypeId": "uipath-binary-classification",
+  "evaluatorConfig": {
+    "name": "BinarySpamPrecision",
+    "targetOutputKey": "category",
+    "positiveClass": "spam",
+    "metricType": "precision",
+    "fValue": 1.0,
+    "defaultEvaluationCriteria": {
+      "expectedClass": "ham"
+    }
+  }
+}
diff --git a/packages/uipath/samples/binary_classification_agent/main.py b/packages/uipath/samples/binary_classification_agent/main.py
new file mode 100644
index 000000000..1df5dea15
--- /dev/null
+++ b/packages/uipath/samples/binary_classification_agent/main.py
@@ -0,0 +1,39 @@
+"""Rule-based spam/ham classifier demonstrating the binary classification evaluator."""
+
+from dataclasses import dataclass
+
+from uipath.tracing import traced
+
+SPAMMY_TOKENS = {  # Lowercase substrings searched for in the lowercased email text.
+    "free",
+    "winner",
+    "congratulations",
+    "click here",
+    "prize",
+    "!!!",
+}
+
+
+@dataclass
+class EmailInput:  # Agent input; field names mirror the eval set's "inputs" keys.
+    email_subject: str
+    email_body: str
+
+
+@dataclass
+class Classification:  # Agent output; "category" holds the predicted label.
+    category: str
+
+
+@traced(name="classify_email", span_type="tool")
+def classify_email(subject: str, body: str) -> str:
+    """Case-insensitively flag 'spam' if any SPAMMY_TOKENS entry occurs, else 'ham'."""
+    haystack = f"{subject} {body}".lower()
+    return "ham" if all(tok not in haystack for tok in SPAMMY_TOKENS) else "spam"
+
+
+@traced()
+async def main(input: EmailInput) -> Classification:
+    """Entry point: classify one email and wrap the label in a Classification."""
+    subject, body = input.email_subject, input.email_body
+    return Classification(category=classify_email(subject, body))
diff --git a/packages/uipath/samples/binary_classification_agent/pyproject.toml b/packages/uipath/samples/binary_classification_agent/pyproject.toml
new file mode 100644
index 000000000..7d81d251a
--- /dev/null
+++ b/packages/uipath/samples/binary_classification_agent/pyproject.toml
@@ -0,0 +1,9 @@
+[project]
+name = "binary-classification-agent"
+version = "0.0.1"
+description = "Rule-based spam/ham classifier demonstrating the binary classification evaluator"
+requires-python = ">=3.11"
+dependencies = ["uipath"]
+
+[dependency-groups]
+dev = ["uipath-dev"]
diff --git a/packages/uipath/samples/binary_classification_agent/uipath.json b/packages/uipath/samples/binary_classification_agent/uipath.json
new file mode 100644
index 000000000..9b02c2654
--- /dev/null
+++ b/packages/uipath/samples/binary_classification_agent/uipath.json
@@ -0,0 +1,5 @@
+{
+  "functions": {
+    "main": "main.py:main"
+  }
+}
diff --git a/packages/uipath/samples/multiclass_classification_simple/bindings.json b/packages/uipath/samples/multiclass_classification_simple/bindings.json
new file mode 100644
index 000000000..5e9beeb01
--- /dev/null
+++ b/packages/uipath/samples/multiclass_classification_simple/bindings.json
@@ -0,0 +1,4 @@
+{
+  "version": "2.0",
+  "resources": []
+}
diff --git a/packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json b/packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json
new file mode 100644
index 000000000..27e66c25d
--- /dev/null
+++ b/packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json
@@ -0,0 +1,85 @@
+{
+  "version": "1.0",
+  "id": "EmailMulticlassEval",
+  "name": "3-class email router — macro F1",
+  "evaluatorRefs": ["EmailMulticlassFScore"],
+  "evaluations": [
+    {
+      "id": "pay-invoice",
+      "name": "Payments: invoice reminder",
+      "inputs": {
+        "email_subject": "Your March invoice is ready",
+        "email_body": "Your monthly invoice of $45.99 is now available. Payment is due March 15."
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "payments" }
+      }
+    },
+    {
+      "id": "pay-refund",
+      "name": "Payments: refund request",
+      "inputs": {
+        "email_subject": "Refund for last month's charge",
+        "email_body": "I was charged twice for the same service. Please process a refund."
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "payments" }
+      }
+    },
+    {
+      "id": "support-broken",
+      "name": "Support: feature broken",
+      "inputs": {
+        "email_subject": "Login is broken",
+        "email_body": "I'm getting an error when trying to sign in. Need help."
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "support" }
+      }
+    },
+    {
+      "id": "support-question",
+      "name": "Support: how-to question",
+      "inputs": {
+        "email_subject": "How do I export my data?",
+        "email_body": "Can you help me figure out where the export button is?"
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "support" }
+      }
+    },
+    {
+      "id": "spam-prize",
+      "name": "Spam: prize giveaway",
+      "inputs": {
+        "email_subject": "You won a FREE iPhone!!!",
+        "email_body": "Congratulations! Click here to claim your prize."
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "spam" }
+      }
+    },
+    {
+      "id": "spam-promo",
+      "name": "Spam: marketing winner",
+      "inputs": {
+        "email_subject": "Winner of the monthly drawing",
+        "email_body": "Congratulations, click here to redeem your reward."
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "spam" }
+      }
+    },
+    {
+      "id": "support-misrouted-by-payment-word",
+      "name": "Support email accidentally routed to payments (forces an FP for payments)",
+      "inputs": {
+        "email_subject": "Question about my billing portal access",
+        "email_body": "I cannot log into the billing portal. The page just spins. Can you help?"
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "support" }
+      }
+    }
+  ]
+}
diff --git a/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json b/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json
new file mode 100644
index 000000000..859a18562
--- /dev/null
+++ b/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json
@@ -0,0 +1,17 @@
+{
+  "version": "1.0",
+  "id": "EmailMulticlassFScore",
+  "description": "Macro-averaged F1 across payments / support / spam",
+  "evaluatorTypeId": "uipath-multiclass-classification",
+  "evaluatorConfig": {
+    "name": "EmailMulticlassFScore",
+    "targetOutputKey": "category",
+    "classes": ["payments", "support", "spam"],
+    "metricType": "f-score",
+    "averaging": "macro",
+    "fValue": 1.0,
+    "defaultEvaluationCriteria": {
+      "expectedClass": "support"
+    }
+  }
+}
diff --git a/packages/uipath/samples/multiclass_classification_simple/main.py b/packages/uipath/samples/multiclass_classification_simple/main.py
new file mode 100644
index 000000000..3ab684298
--- /dev/null
+++ b/packages/uipath/samples/multiclass_classification_simple/main.py
@@ -0,0 +1,51 @@
+"""Rule-based 3-class email router demonstrating the multiclass classification evaluator."""
+
+from dataclasses import dataclass
+
+from uipath.tracing import traced
+
+SPAM_TOKENS = {"free", "winner", "congratulations", "click here", "prize", "!!!"}
+PAYMENT_TOKENS = {"invoice", "payment", "refund", "charge", "billing", "$"}
+SUPPORT_TOKENS = {  # NOTE(review): unused -- classify_email never reads this set.
+    "help",
+    "support",
+    "issue",
+    "error",
+    "ticket",
+    "broken",
+    "not working",
+}
+
+
+@dataclass
+class EmailInput:  # Agent input; field names mirror the eval set's "inputs" keys.
+    email_subject: str
+    email_body: str
+
+
+@dataclass
+class Classification:  # Agent output; "category" holds the chosen queue label.
+    category: str
+
+
+@traced(name="classify_email", span_type="tool")
+def classify_email(subject: str, body: str) -> str:
+    """Route an email to 'spam', 'payments', or 'support' by the first matching rule.
+
+    The lowercased "subject body" text is tested against SPAM_TOKENS, then
+    PAYMENT_TOKENS; the first set with a substring hit decides the label.
+    No rule matching means 'support' (SUPPORT_TOKENS is not consulted here).
+    """
+    haystack = f"{subject} {body}".lower()
+    # Order encodes priority: spam outranks payments, which outranks support.
+    for label, markers in (("spam", SPAM_TOKENS), ("payments", PAYMENT_TOKENS)):
+        if any(marker in haystack for marker in markers):
+            return label
+    return "support"
+
+
+@traced()
+async def main(input: EmailInput) -> Classification:
+    """Entry point: pick a queue for one email and wrap it in a Classification."""
+    subject, body = input.email_subject, input.email_body
+    return Classification(category=classify_email(subject, body))
diff --git a/packages/uipath/samples/multiclass_classification_simple/pyproject.toml b/packages/uipath/samples/multiclass_classification_simple/pyproject.toml
new file mode 100644
index 000000000..e803a2a43
--- /dev/null
+++ b/packages/uipath/samples/multiclass_classification_simple/pyproject.toml
@@ -0,0 +1,9 @@
+[project]
+name = "multiclass-classification-simple"
+version = "0.0.1"
+description = "Rule-based 3-class email router demonstrating the multiclass classification evaluator with macro-averaged F1"
+requires-python = ">=3.11"
+dependencies = ["uipath"]
+
+[dependency-groups]
+dev = ["uipath-dev"]
diff --git a/packages/uipath/samples/multiclass_classification_simple/uipath.json b/packages/uipath/samples/multiclass_classification_simple/uipath.json
new file mode 100644
index 000000000..9b02c2654
--- /dev/null
+++ b/packages/uipath/samples/multiclass_classification_simple/uipath.json
@@ -0,0 +1,5 @@
+{
+  "functions": {
+    "main": "main.py:main"
+  }
+}
diff --git a/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py b/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py
new file mode 100644
index 000000000..202363221
--- /dev/null
+++ b/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py
@@ -0,0 +1,193 @@
+"""End-to-end tests that run the classification sample projects through evaluate().
+
+These tests double as integration coverage for the binary and multiclass
+classification evaluators added in #1397 — they wire each sample's main.py
+into a stand-in runtime, run the full eval set, and assert the per-row scores
+plus the aggregated metric produced by `reduce_scores`.
+"""
+
+import importlib.util
+import uuid
+from pathlib import Path
+from types import ModuleType
+from typing import Any, AsyncGenerator
+
+import pytest
+
+from uipath.core.events import EventBus
+from uipath.core.tracing import UiPathTraceManager
+from uipath.eval.helpers import EvalHelpers
+from uipath.eval.runtime import UiPathEvalContext, evaluate
+from uipath.eval.runtime._types import UiPathEvalOutput
+from uipath.eval.runtime.runtime import compute_evaluator_scores
+from uipath.runtime import (
+    UiPathExecuteOptions,
+    UiPathRuntimeEvent,
+    UiPathRuntimeFactorySettings,
+    UiPathRuntimeProtocol,
+    UiPathRuntimeResult,
+    UiPathRuntimeStatus,
+    UiPathRuntimeStorageProtocol,
+    UiPathStreamOptions,
+)
+from uipath.runtime.schema import UiPathRuntimeSchema
+
+SAMPLES_DIR = Path(__file__).resolve().parents[3] / "samples"  # packages/uipath/samples
+
+
+def _load_sample_main(sample_dir: Path) -> ModuleType:
+    """Execute `<sample_dir>/main.py` as a module named after the sample folder."""
+    unique_name = "_eval_sample_" + sample_dir.name
+    spec = importlib.util.spec_from_file_location(unique_name, sample_dir / "main.py")
+    assert spec is not None and spec.loader is not None
+    loaded = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(loaded)
+    return loaded
+
+
+class _SampleRuntime:
+    """Stand-in runtime that runs the sample's `main` and reports its `category`."""
+
+    def __init__(self, sample_main: Any) -> None:
+        self._sample_main = sample_main
+
+    async def execute(
+        self,
+        input: dict[str, Any] | None = None,
+        options: UiPathExecuteOptions | None = None,
+    ) -> UiPathRuntimeResult:
+        input_model = self._sample_main.EmailInput(**(input or {}))  # None -> no kwargs
+        output = await self._sample_main.main(input_model)
+        return UiPathRuntimeResult(
+            output={"category": output.category},
+            status=UiPathRuntimeStatus.SUCCESSFUL,
+        )
+
+    async def stream(
+        self,
+        input: dict[str, Any] | None = None,
+        options: UiPathStreamOptions | None = None,
+    ) -> AsyncGenerator[UiPathRuntimeEvent, None]:
+        yield await self.execute(input, None)  # one event; `options` is not forwarded
+
+    async def get_schema(self) -> UiPathRuntimeSchema:
+        return UiPathRuntimeSchema(
+            filePath="main.py",
+            uniqueId="main",
+            type="agent",
+            input={
+                "type": "object",
+                "properties": {
+                    "email_subject": {"type": "string"},
+                    "email_body": {"type": "string"},
+                },
+            },
+            output={
+                "type": "object",
+                "properties": {"category": {"type": "string"}},
+            },
+        )
+
+    async def dispose(self) -> None:
+        pass  # nothing to release
+
+
+class _SampleFactory:  # Factory stand-in: every runtime it creates wraps the same sample.
+    def __init__(self, sample_main: Any) -> None:
+        self._sample_main = sample_main
+
+    def discover_entrypoints(self) -> list[str]:
+        return ["main"]
+
+    async def get_storage(self) -> UiPathRuntimeStorageProtocol | None:
+        return None
+
+    async def get_settings(self) -> UiPathRuntimeFactorySettings | None:
+        return None
+
+    async def new_runtime(
+        self, entrypoint: str, runtime_id: str, **kwargs: Any
+    ) -> UiPathRuntimeProtocol:
+        return _SampleRuntime(self._sample_main)  # entrypoint/runtime_id are ignored
+
+    async def dispose(self) -> None:
+        pass
+
+
+async def _run_sample(sample_dir: Path) -> tuple[UiPathEvalOutput, dict[str, float]]:
+    """Evaluate the sample's default eval set; return (eval output, evaluator averages)."""
+    sample_main = _load_sample_main(sample_dir)
+    factory = _SampleFactory(sample_main)
+
+    eval_set_path = str(sample_dir / "evaluations" / "eval-sets" / "default.json")
+    evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path)
+    evaluators = await EvalHelpers.load_evaluators(
+        eval_set_path, evaluation_set, agent_model=None
+    )
+
+    runtime = await factory.new_runtime("main", "test-runtime-id")  # schema lookup only
+    runtime_schema = await runtime.get_schema()
+
+    context = UiPathEvalContext()
+    context.execution_id = str(uuid.uuid4())
+    context.evaluation_set = evaluation_set
+    context.runtime_schema = runtime_schema
+    context.evaluators = evaluators
+
+    result = await evaluate(
+        factory,
+        UiPathTraceManager(),
+        context,
+        EventBus(),
+    )
+
+    eval_output = UiPathEvalOutput.model_validate(result.output)
+    _, evaluator_averages = compute_evaluator_scores(  # first return value unused here
+        eval_output.evaluation_set_results, evaluators
+    )
+    return eval_output, evaluator_averages
+
+
+def _per_row_scores(output: UiPathEvalOutput) -> dict[str, float]:
+    """Map each row's evaluation name to the score of its first run result."""
+    rows = output.evaluation_set_results
+    kv = ((r.evaluation_name, r.evaluation_run_results[0].result.score) for r in rows)
+    return dict(kv)
+
+
+async def test_binary_classification_sample_end_to_end():
+    """Binary spam classifier: 4/5 datapoints correct, but precision is 2/3 because of one FP."""
+    output, averages = await _run_sample(SAMPLES_DIR / "binary_classification_agent")
+
+    per_row = _per_row_scores(output)
+    assert per_row == {
+        "Spam: prize giveaway": 1.0,
+        "Spam: unsolicited promo": 1.0,
+        "Ham: legitimate invoice": 1.0,
+        "Ham: meeting request": 1.0,
+        "Ham mislabeled as spam (forces a false positive)": 0.0,
+    }
+    # Positive class "spam": TP = 2 spam rows, FP = 1 flagged ham -> 2 / (2 + 1) = 0.6666...
+    assert averages["BinarySpamPrecision"] == pytest.approx(2 / 3, rel=1e-6)
+
+
+async def test_multiclass_classification_sample_end_to_end():
+    """Multiclass router: 6/7 correct, macro F1 = (0.8 + 0.8 + 1.0) / 3 = 0.8666..."""
+    output, averages = await _run_sample(
+        SAMPLES_DIR / "multiclass_classification_simple"
+    )
+
+    per_row = _per_row_scores(output)
+    assert per_row == {
+        "Payments: invoice reminder": 1.0,
+        "Payments: refund request": 1.0,
+        "Support: feature broken": 1.0,
+        "Support: how-to question": 1.0,
+        "Spam: prize giveaway": 1.0,
+        "Spam: marketing winner": 1.0,
+        "Support email accidentally routed to payments "
+        "(forces an FP for payments)": 0.0,
+    }
+    # Per-class F1: payments 0.8 (P=2/3, R=1), support 0.8 (P=1, R=2/3), spam 1.0.
+    # Macro average = unweighted mean of the three = 2.6 / 3 = 0.8666...
+    assert averages["EmailMulticlassFScore"] == pytest.approx(2.6 / 3, rel=1e-6)

From 5e574f1895feccb314fd929d57e15dd69580c5f0 Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Wed, 20 May 2026 14:05:44 -0700
Subject: [PATCH 03/13] feat(eval): add dataset-level evaluator framework with
 precision/recall/f-score
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduces a new BaseDatasetEvaluator concept that runs once per evaluation
set after all per-datapoint evaluators complete. It consumes per-datapoint
EvaluationResultDto values from a named source evaluator and emits a single
run-level EvaluationResult.

Includes three starter evaluators for multiclass classification metrics:

- PrecisionDatasetEvaluator
- RecallDatasetEvaluator
- FScoreDatasetEvaluator (configurable beta)

Each takes a required classes list (populated from the UI), supports micro
or macro averaging, and emits per-class TP/TN/FP/FN plus the confusion
matrix in details. Binary is the 2-class case — no separate binary path.

Architecture: BaseDatasetEvaluator is a parallel hierarchy to
GenericBaseEvaluator (not a subclass) so the per-datapoint dispatch loop
cannot accidentally pick up a dataset evaluator. Each dataset evaluator
declares a single source_evaluator by name; the runtime groups
per-datapoint results by evaluator name and routes the right list to each
dataset evaluator. Configs load from <eval_set>/../dataset_evaluators/*.json
mirroring the evaluators directory layout.

Patch version bumped: 2.10.68 -> 2.10.69.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/uipath/pyproject.toml                |   2 +-
 packages/uipath/src/uipath/_cli/cli_eval.py   |   7 +
 .../eval/evaluators/base_dataset_evaluator.py |  75 ++++
 .../classification_dataset_evaluators.py      | 311 +++++++++++++
 .../evaluators/dataset_evaluator_factory.py   |  52 +++
 packages/uipath/src/uipath/eval/helpers.py    |  88 ++++
 .../src/uipath/eval/models/evaluation_set.py  |   3 +
 .../uipath/src/uipath/eval/models/models.py   |   3 +
 .../uipath/src/uipath/eval/runtime/_types.py  |   5 +-
 .../uipath/src/uipath/eval/runtime/context.py |   2 +
 .../uipath/src/uipath/eval/runtime/runtime.py |  50 +++
 .../test_dataset_classification_evaluators.py | 411 ++++++++++++++++++
 12 files changed, 1007 insertions(+), 2 deletions(-)
 create mode 100644 packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py
 create mode 100644 packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
 create mode 100644 packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py
 create mode 100644 packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py

diff --git a/packages/uipath/pyproject.toml b/packages/uipath/pyproject.toml
index 36550f54d..0d70cb383 100644
--- a/packages/uipath/pyproject.toml
+++ b/packages/uipath/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "uipath"
-version = "2.10.68"
+version = "2.10.69"
 description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.11"
diff --git a/packages/uipath/src/uipath/_cli/cli_eval.py b/packages/uipath/src/uipath/_cli/cli_eval.py
index e101717d6..2e35db849 100644
--- a/packages/uipath/src/uipath/_cli/cli_eval.py
+++ b/packages/uipath/src/uipath/_cli/cli_eval.py
@@ -412,6 +412,13 @@ async def execute_eval():
                             get_agent_model(eval_context.runtime_schema),
                         )
 
+                        eval_context.dataset_evaluators = (
+                            await EvalHelpers.load_dataset_evaluators(
+                                resolved_eval_set_path,
+                                eval_context.evaluation_set,
+                            )
+                        )
+
                         # Runtime is not required anymore.
                         await runtime.dispose()
 
diff --git a/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py
new file mode 100644
index 000000000..ae818a421
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py
@@ -0,0 +1,75 @@
+"""Base abstractions for dataset-level evaluators.
+
+A dataset-level evaluator runs once per evaluation set, after all per-datapoint
+evaluators have produced their results. It consumes the per-datapoint
+EvaluationResultDto values from one named source evaluator and emits a single
+EvaluationResult that summarizes the dataset.
+
+Concretely distinct from GenericBaseEvaluator: different evaluate() signature,
+different lifecycle. Kept as a parallel hierarchy rather than a subclass so
+the runtime cannot accidentally dispatch a dataset evaluator through the
+per-datapoint loop.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Generic, TypeVar
+
+from pydantic import BaseModel, ConfigDict, Field
+from pydantic.alias_generators import to_camel
+
+from ..models.models import EvaluationResult, EvaluationResultDto
+
+
+class BaseDatasetEvaluatorConfig(BaseModel):
+    """Configuration shared by all dataset-level evaluators."""
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    id: str  # matched against the evaluation set's dataset evaluator refs
+    name: str  # key under which this evaluator's result is stored
+    type: str  # discriminator the factory uses to pick the evaluator class
+    source_evaluator: str = Field(
+        ...,
+        description=(
+            "Name of the per-datapoint evaluator whose EvaluationResultDto values "
+            "this dataset evaluator consumes."
+        ),
+    )
+
+
+ConfigT = TypeVar("ConfigT", bound=BaseDatasetEvaluatorConfig)
+
+
+class BaseDatasetEvaluator(ABC, Generic[ConfigT]):
+    """Abstract base for dataset-level evaluators, generic over the config type.
+
+    Subclasses supply ``get_evaluator_id`` and ``evaluate``; ``evaluate`` works on
+    the per-datapoint EvaluationResultDto values of ``config.source_evaluator``.
+    """
+
+    config: ConfigT
+
+    def __init__(self, config: ConfigT) -> None:
+        """Keep ``config``; ``name`` and ``source_evaluator`` are read from it."""
+        self.config = config
+
+    @property
+    def name(self) -> str:
+        """Logical name of this evaluator instance (used as result-dict key)."""
+        return self.config.name
+
+    @property
+    def source_evaluator(self) -> str:
+        """Name of the upstream evaluator whose results this one consumes."""
+        return self.config.source_evaluator
+
+    @classmethod
+    @abstractmethod
+    def get_evaluator_id(cls) -> str:
+        """Stable identifier matching the ``type`` discriminator on configs."""
+
+    @abstractmethod
+    def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
+        """Reduce per-datapoint results (possibly none) to one EvaluationResult."""
diff --git a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
new file mode 100644
index 000000000..272541e21
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
@@ -0,0 +1,311 @@
+"""Dataset-level classification evaluators: Precision, Recall, F-score.
+
+All three share the same internal machinery — a k x k confusion matrix built
+from each per-datapoint result's BaseEvaluatorJustification (expected, actual)
+strings. They differ only in the final formula and (for F-score) the beta
+parameter. The headline ``score`` is the micro or macro average per config;
+``details`` carries the full per-class breakdown plus the confusion matrix.
+"""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict, Field
+from pydantic.alias_generators import to_camel
+
+from ..models.models import (
+    EvaluationResult,
+    EvaluationResultDto,
+    EvaluatorType,
+    NumericEvaluationResult,
+)
+from .base_dataset_evaluator import BaseDatasetEvaluator, BaseDatasetEvaluatorConfig
+from .base_evaluator import BaseEvaluatorJustification
+
+
+def _coerce_justification(details: object) -> tuple[str, str] | None:
+    """Return the (expected, actual) pair carried by ``details``, else None."""
+    if isinstance(details, dict):
+        try:
+            details = BaseEvaluatorJustification.model_validate(details)
+        except Exception:
+            # A payload that fails validation carries no usable label pair.
+            return None
+    if not isinstance(details, BaseEvaluatorJustification):
+        return None
+    return details.expected, details.actual
+
+
+class PerClassMetrics(BaseModel):
+    """Per-class confusion counts plus the metric the evaluator computed."""
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    tp: int
+    tn: int
+    fp: int
+    fn: int
+    support: int  # tp + fn, as filled in by _build_details
+    value: float  # this class's score from the evaluator's metric function
+
+
+class ClassificationDetails(BaseModel):
+    """Structured details payload emitted by every classification evaluator."""
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    metric: str  # "precision", "recall" or "f_score"
+    average: str  # which of micro / macro was reported as the score
+    classes: list[str]
+    confusion_matrix: list[list[int]]  # rows: actual label, columns: expected
+    per_class: dict[str, PerClassMetrics]  # keyed by class label
+    micro: float
+    macro: float
+    n_total: int  # all results received
+    n_scored: int  # results counted in the matrix
+    n_skipped: int  # no parseable justification, or a label outside classes
+
+
+class _ConfusionData:
+    """Internal holder for the confusion matrix and the result tallies."""
+
+    __slots__ = ("classes", "matrix", "n_total", "n_scored", "n_skipped")
+
+    def __init__(
+        self,
+        classes: list[str],
+        matrix: list[list[int]],
+        n_total: int,
+        n_scored: int,
+        n_skipped: int,
+    ) -> None:
+        self.classes, self.matrix = classes, matrix
+        self.n_total, self.n_scored, self.n_skipped = n_total, n_scored, n_skipped
+
+    def counts_for(self, class_index: int) -> tuple[int, int, int, int]:
+        """Return (tp, fp, fn, tn) for the class at ``class_index``.
+
+        Row sums are read as "predicted this class" and column sums as "expected
+        this class", so the matrix must be indexed ``matrix[actual][expected]``.
+        """
+        hits = self.matrix[class_index][class_index]
+        row_total = sum(self.matrix[class_index])
+        column_total = sum(row[class_index] for row in self.matrix)
+        misses = row_total + column_total - 2 * hits  # fp + fn
+        return hits, row_total - hits, column_total - hits, self.n_scored - hits - misses
+
+
+def _build_confusion(
+    results: list[EvaluationResultDto],
+    classes: list[str],
+    case_sensitive: bool,
+) -> _ConfusionData:
+    """Build a confusion matrix from per-datapoint results.
+
+    Rows index the actual label, columns the expected one. Labels in ``classes``
+    that collide after normalization are merged. Results lacking a parseable
+    justification, or with a label outside ``classes``, count as ``n_skipped``.
+    """
+
+    def norm(label: str) -> str:
+        return label if case_sensitive else label.lower()
+
+    canonical_classes = list(dict.fromkeys(norm(c) for c in classes))  # dedupe
+    index_of = {c: i for i, c in enumerate(canonical_classes)}
+    k = len(canonical_classes)
+    matrix = [[0] * k for _ in range(k)]
+
+    n_total = len(results)
+    n_scored = 0
+    n_skipped = 0
+
+    for r in results:
+        j = _coerce_justification(r.details)
+        if j is None:
+            n_skipped += 1
+            continue
+        exp = norm(j[0])
+        act = norm(j[1])
+        if exp not in index_of or act not in index_of:
+            n_skipped += 1
+            continue
+        matrix[index_of[act]][index_of[exp]] += 1
+        n_scored += 1
+
+    return _ConfusionData(
+        classes=canonical_classes,
+        matrix=matrix,
+        n_total=n_total,
+        n_scored=n_scored,
+        n_skipped=n_skipped,
+    )
+
+
+def _precision_of(tp: int, fp: int, _fn: int, _tn: int) -> float:
+    return 0.0 if tp + fp <= 0 else tp / (tp + fp)  # tp over predicted positives
+
+
+def _recall_of(tp: int, _fp: int, fn: int, _tn: int) -> float:
+    return 0.0 if tp + fn <= 0 else tp / (tp + fn)  # tp over expected positives
+
+
+def _f_score_of(beta: float):
+    weight = beta * beta  # beta squared, the F-beta weighting term
+
+    def compute(tp: int, fp: int, fn: int, _tn: int) -> float:
+        precision = 0.0 if tp + fp <= 0 else tp / (tp + fp)
+        recall = 0.0 if tp + fn <= 0 else tp / (tp + fn)
+        blend = weight * precision + recall
+        return (1 + weight) * precision * recall / blend if blend > 0 else 0.0
+
+    return compute
+
+
+def _build_details(
+    confusion: _ConfusionData,
+    metric_name: str,
+    average: str,
+    per_class_fn,
+) -> tuple[ClassificationDetails, float]:
+    """Assemble the details payload and choose the headline score.
+
+    ``per_class_fn`` maps (tp, fp, fn, tn) counts to a metric value. It is applied
+    to every class and, for the micro average, to the tp/fp/fn summed over all
+    classes; the macro average is the unweighted mean of the per-class values.
+
+    Returns (details, headline_score); the headline is the micro average when
+    ``average`` is "micro" and the macro average for any other value.
+    """
+    per_class: dict[str, PerClassMetrics] = {}
+    pooled = [0, 0, 0]  # tp, fp, fn summed over every class
+
+    for position, label in enumerate(confusion.classes):
+        tp, fp, fn, tn = confusion.counts_for(position)
+        pooled = [pooled[0] + tp, pooled[1] + fp, pooled[2] + fn]
+        per_class[label] = PerClassMetrics(
+            tp=tp,
+            tn=tn,
+            fp=fp,
+            fn=fn,
+            support=tp + fn,
+            value=per_class_fn(tp, fp, fn, tn),
+        )
+
+    micro = per_class_fn(*pooled, 0)
+    class_values = [per_class[label].value for label in confusion.classes]
+    macro = sum(class_values) / len(class_values) if class_values else 0.0
+
+    details = ClassificationDetails(
+        metric=metric_name,
+        average=average,
+        classes=confusion.classes,
+        confusion_matrix=confusion.matrix,
+        per_class=per_class,
+        micro=micro,
+        macro=macro,
+        n_total=confusion.n_total,
+        n_scored=confusion.n_scored,
+        n_skipped=confusion.n_skipped,
+    )
+
+    if average == "micro":
+        return details, micro
+    return details, macro
+
+
+# ─── configs ──────────────────────────────────────────────────────────────────
+
+
+class _BaseClassificationConfig(BaseDatasetEvaluatorConfig):
+    """Shared config for the three classification evaluators."""
+
+    classes: list[str] = Field(
+        ...,
+        min_length=1,
+        description="Class labels expected in the upstream evaluator's justifications.",
+    )
+    average: Literal["micro", "macro"] = "macro"  # which average becomes the score
+    case_sensitive: bool = False  # when False, labels are lower-cased before matching
+
+
+class PrecisionDatasetEvaluatorConfig(_BaseClassificationConfig):
+    """Configuration for the dataset-level precision evaluator."""
+
+    type: str = EvaluatorType.DATASET_PRECISION.value  # id the factory dispatches on
+
+
+class RecallDatasetEvaluatorConfig(_BaseClassificationConfig):
+    """Configuration for the dataset-level recall evaluator."""
+
+    type: str = EvaluatorType.DATASET_RECALL.value  # id the factory dispatches on
+
+
+class FScoreDatasetEvaluatorConfig(_BaseClassificationConfig):
+    """Configuration for the dataset-level F-score evaluator."""
+
+    type: str = EvaluatorType.DATASET_F_SCORE.value  # id the factory dispatches on
+    f_value: float = Field(default=1.0, gt=0, description="Beta value for F_beta.")
+
+
+# ─── evaluators ───────────────────────────────────────────────────────────────
+
+
+class PrecisionDatasetEvaluator(BaseDatasetEvaluator[PrecisionDatasetEvaluatorConfig]):
+    """Dataset-level precision evaluator (multiclass, micro or macro averaged)."""
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Return the ``type`` id that selects this evaluator."""
+        return EvaluatorType.DATASET_PRECISION.value
+
+    def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
+        """Score precision over ``results``; the full report goes in ``details``."""
+        settings = self.config
+        # Build the confusion matrix once; the shared helper derives the rest.
+        confusion = _build_confusion(results, settings.classes, settings.case_sensitive)
+        report, headline = _build_details(
+            confusion, "precision", settings.average, _precision_of
+        )
+        return NumericEvaluationResult(score=headline, details=report)
+
+
+class RecallDatasetEvaluator(BaseDatasetEvaluator[RecallDatasetEvaluatorConfig]):
+    """Dataset-level recall evaluator (multiclass, micro or macro averaged)."""
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Return the ``type`` id that selects this evaluator."""
+        return EvaluatorType.DATASET_RECALL.value
+
+    def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
+        """Score recall over ``results``; the full report goes in ``details``."""
+        settings = self.config
+        # Build the confusion matrix once; the shared helper derives the rest.
+        confusion = _build_confusion(results, settings.classes, settings.case_sensitive)
+        report, headline = _build_details(
+            confusion, "recall", settings.average, _recall_of
+        )
+        return NumericEvaluationResult(score=headline, details=report)
+
+
+class FScoreDatasetEvaluator(BaseDatasetEvaluator[FScoreDatasetEvaluatorConfig]):
+    """Dataset-level F-beta evaluator (multiclass, micro or macro averaged)."""
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Return the ``type`` id that selects this evaluator."""
+        return EvaluatorType.DATASET_F_SCORE.value
+
+    def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
+        """Score F-beta over ``results``; the full report goes in ``details``."""
+        settings = self.config
+        # Beta comes from the config's ``f_value``; the scorer is built per call.
+        confusion = _build_confusion(results, settings.classes, settings.case_sensitive)
+        report, headline = _build_details(
+            confusion,
+            "f_score",
+            settings.average,
+            _f_score_of(settings.f_value),
+        )
+        return NumericEvaluationResult(score=headline, details=report)
diff --git a/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py
new file mode 100644
index 000000000..8ba0dbe62
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py
@@ -0,0 +1,52 @@
+"""Factory that instantiates dataset-level evaluators from configuration."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from ..models.models import EvaluatorType
+from .base_dataset_evaluator import BaseDatasetEvaluator
+from .classification_dataset_evaluators import (
+    FScoreDatasetEvaluator,
+    FScoreDatasetEvaluatorConfig,
+    PrecisionDatasetEvaluator,
+    PrecisionDatasetEvaluatorConfig,
+    RecallDatasetEvaluator,
+    RecallDatasetEvaluatorConfig,
+)
+
+_EVALUATOR_REGISTRY: dict[str, type[BaseDatasetEvaluator[Any]]] = {  # by type id
+    EvaluatorType.DATASET_PRECISION.value: PrecisionDatasetEvaluator,
+    EvaluatorType.DATASET_RECALL.value: RecallDatasetEvaluator,
+    EvaluatorType.DATASET_F_SCORE.value: FScoreDatasetEvaluator,
+}
+
+_CONFIG_REGISTRY: dict[str, type[Any]] = {  # same keys as _EVALUATOR_REGISTRY
+    EvaluatorType.DATASET_PRECISION.value: PrecisionDatasetEvaluatorConfig,
+    EvaluatorType.DATASET_RECALL.value: RecallDatasetEvaluatorConfig,
+    EvaluatorType.DATASET_F_SCORE.value: FScoreDatasetEvaluatorConfig,
+}
+
+
+def build_dataset_evaluator(
+    config_data: dict[str, Any],
+) -> BaseDatasetEvaluator[Any]:
+    """Instantiate the dataset evaluator that a parsed JSON config describes.
+
+    The config's ``type`` picks both the config model and the evaluator class.
+
+    Raises:
+        ValueError: If ``type`` is missing or unknown.
+    """
+    type_id = config_data.get("type")
+    if not type_id:
+        raise ValueError("Dataset evaluator config is missing required field 'type'")
+
+    config_model = _CONFIG_REGISTRY.get(type_id)
+    evaluator_class = _EVALUATOR_REGISTRY.get(type_id)
+    if config_model is not None and evaluator_class is not None:
+        return evaluator_class(config_model.model_validate(config_data))
+    raise ValueError(
+        f"Unknown dataset evaluator type '{type_id}'. "
+        f"Known types: {sorted(_EVALUATOR_REGISTRY.keys())}"
+    )
diff --git a/packages/uipath/src/uipath/eval/helpers.py b/packages/uipath/src/uipath/eval/helpers.py
index 8405e4a7a..fbe210a93 100644
--- a/packages/uipath/src/uipath/eval/helpers.py
+++ b/packages/uipath/src/uipath/eval/helpers.py
@@ -9,7 +9,9 @@
 
 from uipath.runtime.schema import UiPathRuntimeSchema
 
+from .evaluators.base_dataset_evaluator import BaseDatasetEvaluator
 from .evaluators.base_evaluator import GenericBaseEvaluator
+from .evaluators.dataset_evaluator_factory import build_dataset_evaluator
 from .evaluators.evaluator_factory import EvaluatorFactory
 from .mocks._types import InputMockingStrategy, LLMMockingStrategy
 from .models._conversational_utils import UiPathLegacyEvalChatMessagesMapper
@@ -280,6 +282,92 @@ async def load_evaluators(
 
         return evaluators
 
+    @staticmethod
+    async def load_dataset_evaluators(
+        eval_set_path: str,
+        evaluation_set: EvaluationSet,
+    ) -> list[BaseDatasetEvaluator[Any]]:
+        """Load dataset-level evaluators referenced by the evaluation set.
+
+        Configs are read from ``<eval_set_dir>/../dataset_evaluators/*.json`` in
+        sorted path order and matched to references by their top-level ``id``;
+        files with another id, or whose JSON is not an object, are ignored.
+
+        Raises ValueError for a missing directory, invalid JSON, a config that
+        fails to build, an unknown ``source_evaluator``, or an unresolved ref.
+        """
+        if evaluation_set is None:
+            raise ValueError("eval_set cannot be None")
+
+        dataset_ref_ids = {
+            ref.ref for ref in evaluation_set.dataset_evaluator_refs
+        }
+        if not dataset_ref_ids:
+            return []
+
+        dataset_dir = Path(eval_set_path).parent.parent / "dataset_evaluators"
+        if not dataset_dir.exists():
+            raise ValueError(
+                f"Dataset evaluators directory not found at '{dataset_dir}', "
+                f"but evaluation set references dataset evaluators: "
+                f"{sorted(dataset_ref_ids)}"
+            )
+
+        # Collect the per-datapoint evaluator ids declared on the set so each
+        # source_evaluator can be validated as soon as its config is built.
+        if evaluation_set.evaluator_configs:
+            known_evaluator_names = {
+                ref.ref for ref in evaluation_set.evaluator_configs
+            }
+        else:
+            known_evaluator_names = set(evaluation_set.evaluator_refs)
+
+        dataset_evaluators: list[BaseDatasetEvaluator[Any]] = []
+        found_ids: set[str] = set()
+
+        for file in sorted(dataset_dir.glob("*.json")):
+            try:
+                with open(file, "r", encoding="utf-8") as f:
+                    data = json.load(f)
+            except json.JSONDecodeError as e:
+                raise ValueError(
+                    f"Invalid JSON in dataset evaluator file '{file}': {str(e)}."
+                ) from e
+
+            evaluator_id = data.get("id") if isinstance(data, dict) else None
+            if evaluator_id not in dataset_ref_ids:
+                continue
+
+            try:
+                evaluator = build_dataset_evaluator(data)
+            except Exception as e:
+                raise ValueError(
+                    f"Failed to create dataset evaluator from file '{file}': "
+                    f"{str(e)}."
+                ) from e
+
+            if (
+                known_evaluator_names
+                and evaluator.source_evaluator not in known_evaluator_names
+            ):
+                raise ValueError(
+                    f"Dataset evaluator '{evaluator.name}' references "
+                    f"source_evaluator='{evaluator.source_evaluator}' which is "
+                    f"not declared in this evaluation set. Known evaluators: "
+                    f"{sorted(known_evaluator_names)}"
+                )
+
+            dataset_evaluators.append(evaluator)
+            found_ids.add(evaluator_id)
+
+        missing = dataset_ref_ids - found_ids
+        if missing:
+            raise ValueError(
+                f"Could not find the following dataset evaluators: {sorted(missing)}"
+            )
+
+        return dataset_evaluators
+
 
 def get_agent_model(schema: UiPathRuntimeSchema) -> str | None:
     """Get agent model from the runtime schema metadata.
diff --git a/packages/uipath/src/uipath/eval/models/evaluation_set.py b/packages/uipath/src/uipath/eval/models/evaluation_set.py
index 22e6ce244..711fedeb9 100644
--- a/packages/uipath/src/uipath/eval/models/evaluation_set.py
+++ b/packages/uipath/src/uipath/eval/models/evaluation_set.py
@@ -145,6 +145,9 @@ class EvaluationSet(BaseModel):
     evaluator_configs: list[EvaluatorReference] = Field(
         default_factory=list, alias="evaluatorConfigs"
     )
+    dataset_evaluator_refs: list[EvaluatorReference] = Field(
+        default_factory=list, alias="datasetEvaluatorRefs"
+    )
     evaluations: list[EvaluationItem] = Field(default_factory=list)
     model_settings: list[EvaluationSetModelSettings] = Field(
         default_factory=list, alias="modelSettings"
diff --git a/packages/uipath/src/uipath/eval/models/models.py b/packages/uipath/src/uipath/eval/models/models.py
index d2dc26df9..f3c9b57e1 100644
--- a/packages/uipath/src/uipath/eval/models/models.py
+++ b/packages/uipath/src/uipath/eval/models/models.py
@@ -300,6 +300,9 @@ class EvaluatorType(str, Enum):
     TOOL_CALL_OUTPUT = "uipath-tool-call-output"
     BINARY_CLASSIFICATION = "uipath-binary-classification"
     MULTICLASS_CLASSIFICATION = "uipath-multiclass-classification"
+    DATASET_PRECISION = "uipath-dataset-precision"
+    DATASET_RECALL = "uipath-dataset-recall"
+    DATASET_F_SCORE = "uipath-dataset-f-score"
 
 
 class ToolCall(BaseModel):
diff --git a/packages/uipath/src/uipath/eval/runtime/_types.py b/packages/uipath/src/uipath/eval/runtime/_types.py
index 2aee5e599..fa84f0d9e 100644
--- a/packages/uipath/src/uipath/eval/runtime/_types.py
+++ b/packages/uipath/src/uipath/eval/runtime/_types.py
@@ -1,7 +1,7 @@
 import logging
 
 from opentelemetry.sdk.trace import ReadableSpan
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field
 from pydantic.alias_generators import to_camel
 
 from uipath.runtime import UiPathRuntimeResult
@@ -78,6 +78,9 @@ class UiPathEvalOutput(BaseModel):
 
     evaluation_set_name: str
     evaluation_set_results: list[UiPathEvalRunResult]
+    dataset_evaluator_results: dict[str, EvaluationResultDto] = Field(
+        default_factory=dict
+    )
 
     @property
     def score(self) -> float:
diff --git a/packages/uipath/src/uipath/eval/runtime/context.py b/packages/uipath/src/uipath/eval/runtime/context.py
index b8224718c..f3b713320 100644
--- a/packages/uipath/src/uipath/eval/runtime/context.py
+++ b/packages/uipath/src/uipath/eval/runtime/context.py
@@ -4,6 +4,7 @@
 
 from uipath.runtime.schema import UiPathRuntimeSchema
 
+from ..evaluators.base_dataset_evaluator import BaseDatasetEvaluator
 from ..evaluators.base_evaluator import GenericBaseEvaluator
 from ..models.evaluation_set import EvaluationSet
 
@@ -27,3 +28,4 @@ class UiPathEvalContext:
     input_overrides: dict[str, Any] | None = None
     resume: bool = False
     job_id: str | None = None
+    dataset_evaluators: list[BaseDatasetEvaluator[Any]] | None = None
diff --git a/packages/uipath/src/uipath/eval/runtime/runtime.py b/packages/uipath/src/uipath/eval/runtime/runtime.py
index 7f7614446..5cadcc527 100644
--- a/packages/uipath/src/uipath/eval/runtime/runtime.py
+++ b/packages/uipath/src/uipath/eval/runtime/runtime.py
@@ -45,6 +45,7 @@
 from uipath.runtime.schema import UiPathRuntimeSchema
 
 from .._execution_context import ExecutionSpanCollector
+from ..evaluators.base_dataset_evaluator import BaseDatasetEvaluator
 from ..evaluators.base_evaluator import GenericBaseEvaluator
 from ..evaluators.output_evaluator import OutputEvaluationCriteria
 from ..helpers import get_agent_model
@@ -202,6 +203,43 @@ def compute_evaluator_scores(
     return final_score, agg_metrics_per_evaluator
 
 
+def compute_dataset_evaluator_results(
+    evaluation_set_results: list[UiPathEvalRunResult],
+    dataset_evaluators: Iterable[BaseDatasetEvaluator[Any]],
+) -> dict[str, EvaluationResultDto]:
+    """Feed each dataset evaluator the results produced by its source evaluator.
+
+    Per-datapoint results are grouped by ``evaluator_name``; entries flagged
+    ``is_line_result`` are left out of the grouping.
+
+    Args:
+        evaluation_set_results: Per-datapoint results from the run.
+        dataset_evaluators: Dataset-level evaluator instances. Each receives
+            the group belonging to its ``source_evaluator``.
+
+    Returns:
+        Dict mapping dataset evaluator name to its serialized EvaluationResultDto.
+        An evaluator whose source produced no results is still called, with an
+        empty list, so it can emit a zeroed result. If two dataset evaluators
+        share a name, the later one's result replaces the earlier one.
+    """
+    # Group per-datapoint results by the evaluator that produced them.
+    grouped: defaultdict[str, list[EvaluationResultDto]] = defaultdict(list)
+    for run in evaluation_set_results:
+        for dto in run.evaluation_run_results:
+            if not dto.is_line_result:
+                grouped[dto.evaluator_name].append(dto.result)
+
+    # ``.get`` rather than indexing, so an absent source does not add an empty
+    # group to the defaultdict as a side effect.
+    return {
+        evaluator.name: EvaluationResultDto.from_evaluation_result(
+            evaluator.evaluate(grouped.get(evaluator.source_evaluator, []))
+        )
+        for evaluator in dataset_evaluators
+    }
+
+
 class UiPathEvalRuntime:
     """Specialized runtime for evaluation runs, with access to the factory."""
 
@@ -381,6 +419,18 @@ async def execute(self) -> UiPathRuntimeResult:
                         evaluators,
                     )
 
+                    # Run any dataset-level evaluators configured on the eval
+                    # set. Each consumes the per-datapoint results from one
+                    # named source evaluator and emits a single run-level
+                    # EvaluationResultDto stored on UiPathEvalOutput.
+                    if self.context.dataset_evaluators:
+                        results.dataset_evaluator_results = (
+                            compute_dataset_evaluator_results(
+                                results.evaluation_set_results,
+                                self.context.dataset_evaluators,
+                            )
+                        )
+
                     # Configure span with output and metadata
                     await configure_eval_set_run_span(
                         span=span,
diff --git a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
new file mode 100644
index 000000000..08d81818d
--- /dev/null
+++ b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
@@ -0,0 +1,411 @@
+"""Tests for dataset-level classification evaluators (Precision, Recall, FScore).
+
+Covers the math (2-class, 3-class, micro vs macro, F-beta), edge cases
+(empty input, out-of-vocab labels, malformed details), and runtime-level
+routing where compute_dataset_evaluator_results selects results by name.
+"""
+
+import uuid
+
+import pytest
+
+from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification
+from uipath.eval.evaluators.classification_dataset_evaluators import (
+    ClassificationDetails,
+    FScoreDatasetEvaluator,
+    FScoreDatasetEvaluatorConfig,
+    PrecisionDatasetEvaluator,
+    PrecisionDatasetEvaluatorConfig,
+    RecallDatasetEvaluator,
+    RecallDatasetEvaluatorConfig,
+)
+from uipath.eval.evaluators.dataset_evaluator_factory import build_dataset_evaluator
+from uipath.eval.models.models import (
+    EvaluationResultDto,
+    EvaluatorType,
+    NumericEvaluationResult,
+)
+from uipath.eval.runtime._types import (
+    UiPathEvalRunResult,
+    UiPathEvalRunResultDto,
+)
+from uipath.eval.runtime.runtime import compute_dataset_evaluator_results
+
+
+def _result(
+    expected: str, actual: str, score: float | None = None
+) -> EvaluationResultDto:
+    """Build an EvaluationResultDto carrying an expected/actual justification."""
+    if score is None:
+        score = 1.0 if expected.lower() == actual.lower() else 0.0
+    justification = BaseEvaluatorJustification(expected=expected, actual=actual)
+    return EvaluationResultDto(
+        score=score,
+        details=justification.model_dump(),
+    )
+
+
+def _precision(classes: list[str], average: str = "macro") -> PrecisionDatasetEvaluator:
+    return PrecisionDatasetEvaluator(
+        PrecisionDatasetEvaluatorConfig(
+            id="p1",
+            name="precision",
+            source_evaluator="intent_match",
+            classes=classes,
+            average=average,  # type: ignore[arg-type]
+        )
+    )
+
+
+def _recall(classes: list[str], average: str = "macro") -> RecallDatasetEvaluator:
+    return RecallDatasetEvaluator(
+        RecallDatasetEvaluatorConfig(
+            id="r1",
+            name="recall",
+            source_evaluator="intent_match",
+            classes=classes,
+            average=average,  # type: ignore[arg-type]
+        )
+    )
+
+
+def _fscore(
+    classes: list[str], average: str = "macro", f_value: float = 1.0
+) -> FScoreDatasetEvaluator:
+    return FScoreDatasetEvaluator(
+        FScoreDatasetEvaluatorConfig(
+            id="f1",
+            name="fscore",
+            source_evaluator="intent_match",
+            classes=classes,
+            average=average,  # type: ignore[arg-type]
+            f_value=f_value,
+        )
+    )
+
+
+def _details(result: NumericEvaluationResult) -> ClassificationDetails:
+    """Type-narrowing helper for asserting on details."""
+    assert isinstance(result.details, ClassificationDetails)
+    return result.details
+
+
+class TestPrecisionEvaluator:
+    def test_empty_input_returns_zeroed_result(self) -> None:
+        result = _precision(["cat", "dog"]).evaluate([])
+        assert isinstance(result, NumericEvaluationResult)
+        assert result.score == 0.0
+        d = _details(result)
+        assert d.n_total == 0 and d.n_scored == 0
+        assert d.confusion_matrix == [[0, 0], [0, 0]]
+        assert d.per_class["cat"].tp == 0
+        assert d.per_class["cat"].tn == 0
+
+    def test_two_class_macro(self) -> None:
+        # 4 datapoints: 2 TP_yes, 1 FN_yes (predicted no), 1 FP_yes (predicted yes when expected no).
+        results = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("yes", "no"),  # FN for yes, FP for no
+            _result("no", "yes"),  # FP for yes, FN for no
+        ]
+        result = _precision(["yes", "no"], average="macro").evaluate(results)
+        d = _details(result)
+        # precision_yes = 2 / (2 + 1) = 2/3
+        # precision_no  = 0 / (0 + 1) = 0
+        # macro = (2/3 + 0) / 2 = 1/3
+        assert d.per_class["yes"].value == pytest.approx(2 / 3)
+        assert d.per_class["no"].value == pytest.approx(0.0)
+        assert d.macro == pytest.approx((2 / 3 + 0.0) / 2)
+        assert result.score == pytest.approx(d.macro)
+
+    def test_two_class_micro_equals_accuracy(self) -> None:
+        results = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("yes", "no"),
+            _result("no", "yes"),
+        ]
+        result = _precision(["yes", "no"], average="micro").evaluate(results)
+        d = _details(result)
+        # micro precision = sum(TP) / sum(TP + FP)
+        # sum(TP) = 2 (yes diag) + 0 (no diag) = 2
+        # sum(FP) = 1 (yes off-diag row) + 1 (no off-diag row) = 2
+        # micro = 2 / (2 + 2) = 0.5 — equals accuracy 2/4 in the 2-class case
+        assert d.micro == pytest.approx(0.5)
+        assert result.score == pytest.approx(0.5)
+
+    def test_three_class_macro(self) -> None:
+        # Each class gets 2 TP, 1 FP, 1 FN — symmetric setup
+        pairs = [
+            ("cat", "cat"),
+            ("cat", "cat"),
+            ("cat", "dog"),  # FN_cat, FP_dog
+            ("dog", "dog"),
+            ("dog", "dog"),
+            ("dog", "bird"),  # FN_dog, FP_bird
+            ("bird", "bird"),
+            ("bird", "bird"),
+            ("bird", "cat"),  # FN_bird, FP_cat
+        ]
+        result = _precision(["cat", "dog", "bird"], average="macro").evaluate(
+            [_result(e, a) for e, a in pairs]
+        )
+        d = _details(result)
+        # per-class precision = 2 / (2 + 1) = 2/3 for all three
+        for label in ("cat", "dog", "bird"):
+            m = d.per_class[label]
+            assert m.tp == 2 and m.fp == 1 and m.fn == 1 and m.tn == 5
+            assert m.value == pytest.approx(2 / 3)
+        assert d.macro == pytest.approx(2 / 3)
+        assert result.score == pytest.approx(2 / 3)
+
+
+class TestRecallEvaluator:
+    def test_two_class_macro(self) -> None:
+        results = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("yes", "no"),
+            _result("no", "yes"),
+        ]
+        result = _recall(["yes", "no"], average="macro").evaluate(results)
+        d = _details(result)
+        # recall_yes = TP / (TP + FN) = 2 / (2 + 1) = 2/3
+        # recall_no  = 0 / (0 + 1) = 0
+        # macro = 1/3
+        assert d.per_class["yes"].value == pytest.approx(2 / 3)
+        assert d.per_class["no"].value == pytest.approx(0.0)
+        assert result.score == pytest.approx(1 / 3)
+
+    def test_recall_differs_from_precision(self) -> None:
+        # Asymmetric example so precision != recall.
+        results = [
+            _result("yes", "yes"),  # TP
+            _result("yes", "yes"),  # TP
+            _result("no", "yes"),  # FP for yes
+            _result("no", "yes"),  # FP for yes
+            _result("no", "no"),  # TP for no
+        ]
+        p = _details(_precision(["yes", "no"], average="macro").evaluate(results))
+        r = _details(_recall(["yes", "no"], average="macro").evaluate(results))
+        # precision_yes = 2/(2+2)=0.5, precision_no = 1/(1+0)=1.0
+        assert p.per_class["yes"].value == pytest.approx(0.5)
+        assert p.per_class["no"].value == pytest.approx(1.0)
+        # recall_yes = 2/(2+0)=1.0, recall_no = 1/(1+2)=1/3
+        assert r.per_class["yes"].value == pytest.approx(1.0)
+        assert r.per_class["no"].value == pytest.approx(1 / 3)
+
+
+class TestFScoreEvaluator:
+    def test_f1_equals_harmonic_mean_of_p_and_r(self) -> None:
+        results = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("yes", "no"),
+            _result("no", "yes"),
+        ]
+        f = _details(
+            _fscore(["yes", "no"], average="macro", f_value=1.0).evaluate(results)
+        )
+        # precision_yes = 2/3, recall_yes = 2/3 -> F1_yes = 2/3
+        # precision_no  = 0,   recall_no  = 0    -> F1_no  = 0
+        assert f.per_class["yes"].value == pytest.approx(2 / 3)
+        assert f.per_class["no"].value == pytest.approx(0.0)
+        assert f.macro == pytest.approx((2 / 3 + 0.0) / 2)
+
+    def test_f_beta_emphasizes_recall_when_beta_above_one(self) -> None:
+        # Asymmetric setup: precision_yes = 0.5, recall_yes = 1.0.
+        results = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("no", "yes"),
+            _result("no", "yes"),
+            _result("no", "no"),
+        ]
+        f1 = _details(
+            _fscore(["yes", "no"], average="macro", f_value=1.0).evaluate(results)
+        )
+        f2 = _details(
+            _fscore(["yes", "no"], average="macro", f_value=2.0).evaluate(results)
+        )
+        # F_beta with beta>1 weighs recall higher. Since recall_yes > precision_yes,
+        # F2_yes should be > F1_yes.
+        assert f2.per_class["yes"].value > f1.per_class["yes"].value
+
+    def test_three_class_micro_pools_across_classes(self) -> None:
+        # Same symmetric setup as the precision macro test.
+        pairs = [
+            ("cat", "cat"),
+            ("cat", "cat"),
+            ("cat", "dog"),
+            ("dog", "dog"),
+            ("dog", "dog"),
+            ("dog", "bird"),
+            ("bird", "bird"),
+            ("bird", "bird"),
+            ("bird", "cat"),
+        ]
+        d = _details(
+            _fscore(["cat", "dog", "bird"], average="micro", f_value=1.0).evaluate(
+                [_result(e, a) for e, a in pairs]
+            )
+        )
+        # micro precision == micro recall == 6/9 (accuracy when each off-diag
+        # contributes once to FP and once to FN globally). micro F1 = 6/9.
+        assert d.micro == pytest.approx(6 / 9)
+
+
+class TestSkippingAndEdgeCases:
+    def test_out_of_vocab_labels_are_skipped(self) -> None:
+        results = [
+            _result("cat", "cat"),
+            _result("cat", "platypus"),  # actual not in classes
+            _result("zebra", "dog"),  # expected not in classes
+        ]
+        d = _details(_precision(["cat", "dog"]).evaluate(results))
+        assert d.n_total == 3 and d.n_scored == 1 and d.n_skipped == 2
+
+    def test_results_without_justification_are_skipped(self) -> None:
+        results = [
+            _result("cat", "cat"),
+            EvaluationResultDto(score=1.0, details="just a string"),
+            EvaluationResultDto(score=0.0, details={"unrelated": "shape"}),
+        ]
+        d = _details(_precision(["cat", "dog"]).evaluate(results))
+        assert d.n_total == 3 and d.n_scored == 1 and d.n_skipped == 2
+
+    def test_case_insensitive_by_default(self) -> None:
+        results = [_result("Cat", "CAT"), _result("DOG", "dog")]
+        d = _details(_precision(["cat", "dog"]).evaluate(results))
+        assert d.per_class["cat"].tp == 1
+        assert d.per_class["dog"].tp == 1
+
+
+class TestFactory:
+    def test_builds_evaluator_from_dict(self) -> None:
+        config_data = {
+            "id": "precision_intent",
+            "name": "precision_intent",
+            "type": EvaluatorType.DATASET_PRECISION.value,
+            "sourceEvaluator": "intent_match",
+            "classes": ["yes", "no"],
+            "average": "macro",
+        }
+        evaluator = build_dataset_evaluator(config_data)
+        assert isinstance(evaluator, PrecisionDatasetEvaluator)
+        assert evaluator.source_evaluator == "intent_match"
+        assert evaluator.name == "precision_intent"
+
+    def test_unknown_type_raises(self) -> None:
+        with pytest.raises(ValueError, match="Unknown dataset evaluator type"):
+            build_dataset_evaluator(
+                {
+                    "id": "x",
+                    "name": "x",
+                    "type": "uipath-not-a-thing",
+                    "sourceEvaluator": "intent_match",
+                    "classes": ["yes", "no"],
+                }
+            )
+
+    def test_missing_type_raises(self) -> None:
+        with pytest.raises(ValueError, match="missing required field 'type'"):
+            build_dataset_evaluator(
+                {
+                    "id": "x",
+                    "name": "x",
+                    "sourceEvaluator": "intent_match",
+                    "classes": ["yes", "no"],
+                }
+            )
+
+
+class TestComputeDatasetEvaluatorResults:
+    """End-to-end: dataset evaluator picks results by source_evaluator name."""
+
+    def test_routes_to_correct_source_and_ignores_others(self) -> None:
+        eval_results = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1",
+                evaluation_run_results=[
+                    UiPathEvalRunResultDto(
+                        evaluator_name="intent_match",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=_result("yes", "yes"),
+                    ),
+                    UiPathEvalRunResultDto(
+                        evaluator_name="some_other_evaluator",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=EvaluationResultDto(score=0.5),
+                    ),
+                ],
+            ),
+            UiPathEvalRunResult(
+                evaluation_name="dp2",
+                evaluation_run_results=[
+                    UiPathEvalRunResultDto(
+                        evaluator_name="intent_match",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=_result("yes", "no"),
+                    ),
+                ],
+            ),
+        ]
+
+        out = compute_dataset_evaluator_results(
+            eval_results, [_precision(["yes", "no"], average="macro")]
+        )
+        assert set(out) == {"precision"}
+        dto = out["precision"]
+        assert isinstance(dto, EvaluationResultDto)
+        # The unrelated 0.5 score from some_other_evaluator must NOT be in the
+        # matrix — only the two intent_match results count.
+        assert isinstance(dto.details, dict)
+        assert dto.details["n_scored"] == 2
+
+    def test_line_by_line_subresults_are_excluded(self) -> None:
+        eval_results = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1",
+                evaluation_run_results=[
+                    UiPathEvalRunResultDto(
+                        evaluator_name="intent_match",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=_result("yes", "yes"),
+                        is_line_result=True,
+                    ),
+                    UiPathEvalRunResultDto(
+                        evaluator_name="intent_match",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=_result("no", "no"),
+                    ),
+                ],
+            ),
+        ]
+        out = compute_dataset_evaluator_results(
+            eval_results, [_precision(["yes", "no"])]
+        )
+        assert isinstance(out["precision"].details, dict)
+        assert out["precision"].details["n_scored"] == 1
+
+    def test_source_with_no_results_produces_zeroed_report(self) -> None:
+        eval_results = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1",
+                evaluation_run_results=[
+                    UiPathEvalRunResultDto(
+                        evaluator_name="some_other_evaluator",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=EvaluationResultDto(score=1.0),
+                    ),
+                ],
+            ),
+        ]
+        out = compute_dataset_evaluator_results(
+            eval_results, [_precision(["yes", "no"])]
+        )
+        dto = out["precision"]
+        assert dto.score == 0.0
+        assert isinstance(dto.details, dict)
+        assert dto.details["n_scored"] == 0

From d6b7ab5566d07a9e34611358a4b7539912982936 Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Wed, 20 May 2026 16:14:00 -0700
Subject: [PATCH 04/13] docs(eval): add runnable dataset evaluator demo + bump
 uv.lock for 2.10.69
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

examples/dataset_evaluators_demo.py walks the new dataset-level evaluators
(Precision / Recall / F-score) through five scenarios that exercise the
math end-to-end at the SDK layer:

  1. Balanced 3-class — symmetric confusion matrix, macro == micro
  2. Imbalanced 2-class — shows where macro and micro diverge
  3. Same data, four metrics (Precision, Recall, F1, F2) — proves the
     F-beta knob actually moves per-class numbers
  4. Out-of-vocab + malformed details — n_skipped surfaces, no silent drops
  5. Realistic 4-class intent classifier — uneven per-class performance

Each scenario prints the confusion matrix as a table, the per-class
TP/TN/FP/FN + the metric, and a snippet of the wire JSON that AutoMapper
will surface to the frontend.

Run::

    cd packages/uipath && uv run python examples/dataset_evaluators_demo.py

uv.lock reflects the pyproject.toml version bump (2.10.68 -> 2.10.69)
already in this PR.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../examples/dataset_evaluators_demo.py       | 359 ++++++++++++++++++
 packages/uipath/uv.lock                       |   4 +-
 2 files changed, 361 insertions(+), 2 deletions(-)
 create mode 100644 packages/uipath/examples/dataset_evaluators_demo.py

diff --git a/packages/uipath/examples/dataset_evaluators_demo.py b/packages/uipath/examples/dataset_evaluators_demo.py
new file mode 100644
index 000000000..a8f80858d
--- /dev/null
+++ b/packages/uipath/examples/dataset_evaluators_demo.py
@@ -0,0 +1,359 @@
+"""Runnable proof that the dataset-level evaluators work on realistic data.
+
+Five scenarios exercise the framework end-to-end at the SDK layer (no
+worker, no backend). Each prints the headline score plus a confusion
+matrix table, so the math is inspectable rather than a passing-test
+binary signal.
+
+Run::
+
+    cd packages/uipath
+    uv run python examples/dataset_evaluators_demo.py
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Iterable
+
+from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification
+from uipath.eval.evaluators.classification_dataset_evaluators import (
+    ClassificationDetails,
+    FScoreDatasetEvaluator,
+    FScoreDatasetEvaluatorConfig,
+    PrecisionDatasetEvaluator,
+    PrecisionDatasetEvaluatorConfig,
+    RecallDatasetEvaluator,
+    RecallDatasetEvaluatorConfig,
+)
+from uipath.eval.models.models import EvaluationResultDto, NumericEvaluationResult
+
+
+# ─── helpers ──────────────────────────────────────────────────────────────────
+
+
+def make_result(expected: str, actual: str) -> EvaluationResultDto:
+    """Build a single per-datapoint EvaluationResultDto.
+
+    Models what an upstream ExactMatch evaluator would produce after running
+    on one datapoint: score is 1.0 if the labels match, 0.0 otherwise, with
+    the expected/actual labels carried in the justification.
+    """
+    score = 1.0 if expected.lower() == actual.lower() else 0.0
+    justification = BaseEvaluatorJustification(expected=expected, actual=actual)
+    return EvaluationResultDto(score=score, details=justification.model_dump())
+
+
+def materialize_pairs(pairs: Iterable[tuple[str, str]]) -> list[EvaluationResultDto]:
+    return [make_result(e, a) for e, a in pairs]
+
+
+def print_header(title: str) -> None:
+    print()
+    print("═" * 78)
+    print(f" {title}")
+    print("═" * 78)
+
+
+def print_confusion(details: ClassificationDetails) -> None:
+    """Pretty-print the confusion matrix as a table."""
+    classes = details.classes
+    cell_width = max(7, max(len(c) for c in classes) + 1)
+    header = " " * cell_width + " │ " + " │ ".join(c.center(cell_width) for c in classes) + " │  ← expected"
+    print(header)
+    print("─" * len(header))
+    for predicted_idx, predicted_label in enumerate(classes):
+        row_cells = [
+            str(details.confusion_matrix[predicted_idx][expected_idx]).rjust(cell_width)
+            for expected_idx in range(len(classes))
+        ]
+        print(predicted_label.ljust(cell_width) + " │ " + " │ ".join(row_cells) + " │")
+    print(" " * cell_width + "↑ predicted")
+
+
+def print_per_class(details: ClassificationDetails) -> None:
+    """One-row-per-class table of TP/TN/FP/FN + the metric."""
+    label_w = max(len("class"), max(len(c) for c in details.classes))
+    metric = details.metric
+    header = f"  {'class'.ljust(label_w)}  │  TP  TN  FP  FN  support  {metric}"
+    print(header)
+    print("  " + "─" * (len(header) - 2))
+    for cls, m in details.per_class.items():
+        print(
+            f"  {cls.ljust(label_w)}  │  "
+            f"{m.tp:>2}  {m.tn:>2}  {m.fp:>2}  {m.fn:>2}  {m.support:>7}  "
+            f"{m.value:.3f}"
+        )
+
+
+def report(
+    title: str,
+    result: NumericEvaluationResult,
+    *,
+    show_json_tail: bool = False,
+) -> None:
+    """Render one scenario's result block."""
+    print_header(title)
+    assert isinstance(result.details, ClassificationDetails)
+    d = result.details
+    print(
+        f"  metric = {d.metric}   average = {d.average}   "
+        f"score (headline) = {result.score:.4f}"
+    )
+    print(
+        f"  micro = {d.micro:.4f}   macro = {d.macro:.4f}   "
+        f"scored = {d.n_scored}/{d.n_total}   skipped = {d.n_skipped}"
+    )
+    print()
+    print_confusion(d)
+    print()
+    print_per_class(d)
+    if show_json_tail:
+        print()
+        print("  ── wire JSON (matches frontend zod schema) ──")
+        # Just show a snippet to keep output focused.
+        payload = d.model_dump(by_alias=True)
+        print(
+            "  "
+            + json.dumps(
+                {k: payload[k] for k in ("metric", "average", "micro", "macro")},
+                indent=2,
+            ).replace("\n", "\n  ")
+        )
+
+
+# ─── scenarios ────────────────────────────────────────────────────────────────
+
+
+def scenario_1_balanced_three_class() -> None:
+    """Intent recognition over book/cancel/reschedule. Every class gets 2 right, 1 wrong."""
+    pairs = [
+        ("book", "book"),
+        ("book", "book"),
+        ("book", "cancel"),  # FN_book, FP_cancel
+        ("cancel", "cancel"),
+        ("cancel", "cancel"),
+        ("cancel", "reschedule"),  # FN_cancel, FP_reschedule
+        ("reschedule", "reschedule"),
+        ("reschedule", "reschedule"),
+        ("reschedule", "book"),  # FN_reschedule, FP_book
+    ]
+    results = materialize_pairs(pairs)
+    evaluator = PrecisionDatasetEvaluator(
+        PrecisionDatasetEvaluatorConfig(
+            id="precision_intent",
+            name="precision_intent",
+            source_evaluator="intent_match",
+            classes=["book", "cancel", "reschedule"],
+            average="macro",
+        )
+    )
+    report(
+        "Scenario 1 — Balanced 3-class (intent recognition)\n"
+        "  Each class: 2 TP, 1 FP, 1 FN. Symmetric setup → macro = micro = 2/3.",
+        evaluator.evaluate(results),
+        show_json_tail=True,
+    )
+
+
+def scenario_2_imbalanced_two_class() -> None:
+    """Rare-positive case — why macro vs micro matters.
+
+    20 datapoints. Only 4 are actually positive (the rare class). A weak
+    classifier could trivially get high accuracy by predicting "negative"
+    everywhere — micro precision masks that, macro doesn't.
+    """
+    pairs: list[tuple[str, str]] = []
+    # 13 true negatives (of 16 actual negatives): classifier said "negative" (correct).
+    pairs += [("negative", "negative")] * 13
+    # 3 false positives — classifier hallucinated "positive" on actual negatives.
+    pairs += [("negative", "positive")] * 3
+    # 2 true positives.
+    pairs += [("positive", "positive")] * 2
+    # 2 false negatives — classifier missed real positives.
+    pairs += [("positive", "negative")] * 2
+
+    results = materialize_pairs(pairs)
+    classes = ["positive", "negative"]
+
+    macro = PrecisionDatasetEvaluator(
+        PrecisionDatasetEvaluatorConfig(
+            id="p_macro",
+            name="precision (macro)",
+            source_evaluator="positive_match",
+            classes=classes,
+            average="macro",
+        )
+    )
+    micro = PrecisionDatasetEvaluator(
+        PrecisionDatasetEvaluatorConfig(
+            id="p_micro",
+            name="precision (micro)",
+            source_evaluator="positive_match",
+            classes=classes,
+            average="micro",
+        )
+    )
+    report(
+        "Scenario 2a — Imbalanced 2-class, MACRO precision\n"
+        "  Rare positive class. Macro averages per-class, so the rare class\n"
+        "  having precision = 2/(2+3) = 0.40 drags the score down.",
+        macro.evaluate(results),
+    )
+    report(
+        "Scenario 2b — Same data, MICRO precision\n"
+        "  Pools TP/FP across classes. In a 2-class case this equals accuracy.\n"
+        "  Notice macro << micro — that's the bias you'd miss with micro alone.",
+        micro.evaluate(results),
+    )
+
+
+def scenario_3_precision_vs_recall_vs_f() -> None:
+    """Same dataset, four metrics (P, R, F1, F2) — per-class values diverge on asymmetric data."""
+    pairs = [
+        ("yes", "yes"),  # TP for yes
+        ("yes", "yes"),  # TP for yes
+        ("no", "yes"),  # FP for yes
+        ("no", "yes"),  # FP for yes
+        ("no", "no"),  # TP for no
+        ("no", "no"),  # TP for no
+        ("yes", "no"),  # FN for yes -> recall_yes = 2/3, precision_yes = 2/4
+    ]
+    results = materialize_pairs(pairs)
+    classes = ["yes", "no"]
+
+    p = PrecisionDatasetEvaluator(
+        PrecisionDatasetEvaluatorConfig(
+            id="p",
+            name="precision",
+            source_evaluator="yes_match",
+            classes=classes,
+            average="macro",
+        )
+    )
+    r = RecallDatasetEvaluator(
+        RecallDatasetEvaluatorConfig(
+            id="r",
+            name="recall",
+            source_evaluator="yes_match",
+            classes=classes,
+            average="macro",
+        )
+    )
+    f1 = FScoreDatasetEvaluator(
+        FScoreDatasetEvaluatorConfig(
+            id="f1",
+            name="f1",
+            source_evaluator="yes_match",
+            classes=classes,
+            average="macro",
+            f_value=1.0,
+        )
+    )
+    f2 = FScoreDatasetEvaluator(
+        FScoreDatasetEvaluatorConfig(
+            id="f2",
+            name="f2",
+            source_evaluator="yes_match",
+            classes=classes,
+            average="macro",
+            f_value=2.0,
+        )
+    )
+    report(
+        "Scenario 3a — Precision on a recall-favourable dataset",
+        p.evaluate(results),
+    )
+    report(
+        "Scenario 3b — Recall (same data — note 'yes' recall is 2/3 vs precision 1/2)",
+        r.evaluate(results),
+    )
+    report(
+        "Scenario 3c — F1 (harmonic mean of P and R)",
+        f1.evaluate(results),
+    )
+    report(
+        "Scenario 3d — F2 (β=2 weighs recall higher — score moves toward recall)",
+        f2.evaluate(results),
+    )
+
+
+def scenario_4_skipped_datapoints() -> None:
+    """Show how malformed / out-of-vocab data is reported, not silently dropped."""
+    results = [
+        make_result("cat", "cat"),
+        make_result("dog", "dog"),
+        make_result("cat", "platypus"),  # actual not in classes → skipped
+        make_result("zebra", "cat"),  # expected not in classes → skipped
+        EvaluationResultDto(score=1.0, details="bare string — no justification"),
+        EvaluationResultDto(score=0.0, details={"unrelated": "shape"}),
+    ]
+    evaluator = PrecisionDatasetEvaluator(
+        PrecisionDatasetEvaluatorConfig(
+            id="precision_robustness",
+            name="precision_robustness",
+            source_evaluator="any_match",
+            classes=["cat", "dog"],
+            average="macro",
+        )
+    )
+    report(
+        "Scenario 4 — Skipped datapoints (out-of-vocab + malformed details)\n"
+        "  6 datapoints in, 2 scored, 4 skipped. Skip counts surface in the\n"
+        "  report so you can tell whether a low score is a real signal or\n"
+        "  just sparse data.",
+        evaluator.evaluate(results),
+    )
+
+
+def scenario_5_realistic_intent_classifier() -> None:
+    """A larger, more interesting 4-class dataset — uneven per-class performance."""
+    pairs = [
+        # 'book' is easy: classifier handles it well
+        *[("book", "book")] * 10,
+        ("book", "cancel"),
+        # 'cancel' is medium: a few errors
+        *[("cancel", "cancel")] * 6,
+        ("cancel", "book"),
+        ("cancel", "modify"),
+        # 'reschedule' is hard: classifier confuses it with 'modify'
+        ("reschedule", "reschedule"),
+        ("reschedule", "reschedule"),
+        ("reschedule", "modify"),
+        ("reschedule", "modify"),
+        # 'modify' is rare: only 2 cases, classifier gets one
+        ("modify", "modify"),
+        ("modify", "reschedule"),
+    ]
+    results = materialize_pairs(pairs)
+    classes = ["book", "cancel", "reschedule", "modify"]
+    macro_f1 = FScoreDatasetEvaluator(
+        FScoreDatasetEvaluatorConfig(
+            id="f1_4class",
+            name="f1_4class",
+            source_evaluator="intent_match",
+            classes=classes,
+            average="macro",
+            f_value=1.0,
+        )
+    )
+    report(
+        "Scenario 5 — Realistic 4-class intent classifier\n"
+        "  Uneven per-class performance. Macro F1 surfaces 'reschedule' and\n"
+        "  'modify' weakness; micro F1 would have hidden it under 'book' wins.",
+        macro_f1.evaluate(results),
+    )
+
+
+def main() -> None:
+    scenario_1_balanced_three_class()
+    scenario_2_imbalanced_two_class()
+    scenario_3_precision_vs_recall_vs_f()
+    scenario_4_skipped_datapoints()
+    scenario_5_realistic_intent_classifier()
+    print()
+    print("Done. All scenarios computed from real evaluator code.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/packages/uipath/uv.lock b/packages/uipath/uv.lock
index 41ae12119..19b0d047b 100644
--- a/packages/uipath/uv.lock
+++ b/packages/uipath/uv.lock
@@ -3,7 +3,7 @@ revision = 3
 requires-python = ">=3.11"
 
 [options]
-exclude-newer = "2026-05-17T17:25:34.9197064Z"
+exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values.
 exclude-newer-span = "P2D"
 
 [options.exclude-newer-package]
@@ -2552,7 +2552,7 @@ wheels = [
 
 [[package]]
 name = "uipath"
-version = "2.10.68"
+version = "2.10.69"
 source = { editable = "." }
 dependencies = [
     { name = "applicationinsights" },

From fb091e46c686da88958aa002cbfdb34527fe08ab Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Thu, 18 Jun 2026 21:26:19 -0700
Subject: [PATCH 05/13] refactor(eval): embed aggregator specs in per-datapoint
 evaluator configs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pivot dataset evaluators from a separate hierarchy with source_evaluator
pointers to an embedded aggregator-spec design: each per-datapoint
classification evaluator's config carries a self-contained list of
aggregators (precision / recall / fscore), each with its own classes,
averaging, and f_value. No properties are shared up to the evaluator
level — aggregators are fully self-describing.

- Drop source_evaluator pointer from BaseDatasetEvaluatorConfig.
- Add discriminated AggregatorSpec union (precision/recall/fscore).
- Add aggregators field to Binary/Multiclass classification configs.
- Refactor build_dataset_evaluator + compute_dataset_evaluator_results
  to consume aggregator specs from per-datapoint configs directly.
- Drop EvaluationSet.dataset_evaluator_refs (no separate list).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../examples/dataset_evaluators_demo.py       | 189 ++++------
 packages/uipath/src/uipath/_cli/cli_eval.py   |   7 -
 .../eval/evaluators/_aggregator_specs.py      |  53 +++
 .../eval/evaluators/base_dataset_evaluator.py |  67 ++--
 .../binary_classification_evaluator.py        |   7 +
 .../classification_dataset_evaluators.py      | 102 ++----
 .../evaluators/dataset_evaluator_factory.py   |  67 ++--
 .../multiclass_classification_evaluator.py    |   7 +
 packages/uipath/src/uipath/eval/helpers.py    |  88 -----
 .../src/uipath/eval/models/evaluation_set.py  |   3 -
 .../uipath/src/uipath/eval/runtime/context.py |   2 -
 .../uipath/src/uipath/eval/runtime/runtime.py |  63 ++--
 .../test_dataset_classification_evaluators.py | 332 +++++++++++-------
 13 files changed, 460 insertions(+), 527 deletions(-)
 create mode 100644 packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py

diff --git a/packages/uipath/examples/dataset_evaluators_demo.py b/packages/uipath/examples/dataset_evaluators_demo.py
index a8f80858d..2d13f3572 100644
--- a/packages/uipath/examples/dataset_evaluators_demo.py
+++ b/packages/uipath/examples/dataset_evaluators_demo.py
@@ -16,28 +16,27 @@
 import json
 from typing import Iterable
 
+from uipath.eval.evaluators._aggregator_specs import (
+    FScoreAggregatorSpec,
+    PrecisionAggregatorSpec,
+    RecallAggregatorSpec,
+)
 from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification
 from uipath.eval.evaluators.classification_dataset_evaluators import (
     ClassificationDetails,
-    FScoreDatasetEvaluator,
-    FScoreDatasetEvaluatorConfig,
-    PrecisionDatasetEvaluator,
-    PrecisionDatasetEvaluatorConfig,
-    RecallDatasetEvaluator,
-    RecallDatasetEvaluatorConfig,
 )
+from uipath.eval.evaluators.dataset_evaluator_factory import build_dataset_evaluator
 from uipath.eval.models.models import EvaluationResultDto, NumericEvaluationResult
 
-
 # ─── helpers ──────────────────────────────────────────────────────────────────
 
 
 def make_result(expected: str, actual: str) -> EvaluationResultDto:
     """Build a single per-datapoint EvaluationResultDto.
 
-    Models what an upstream ExactMatch evaluator would produce after running
-    on one datapoint: score is 1.0 if the labels match, 0.0 otherwise, with
-    the expected/actual labels carried in the justification.
+    Models what an upstream classification evaluator would produce after running
+    on one datapoint: score is 1.0 if the labels match, 0.0 otherwise, with the
+    expected/actual labels carried in the justification.
     """
     score = 1.0 if expected.lower() == actual.lower() else 0.0
     justification = BaseEvaluatorJustification(expected=expected, actual=actual)
@@ -45,10 +44,12 @@ def make_result(expected: str, actual: str) -> EvaluationResultDto:
 
 
 def materialize_pairs(pairs: Iterable[tuple[str, str]]) -> list[EvaluationResultDto]:
+    """Build a list of EvaluationResultDto from (expected, actual) pairs."""
     return [make_result(e, a) for e, a in pairs]
 
 
 def print_header(title: str) -> None:
+    """Print a section header banner."""
     print()
     print("═" * 78)
     print(f" {title}")
@@ -59,7 +60,12 @@ def print_confusion(details: ClassificationDetails) -> None:
     """Pretty-print the confusion matrix as a table."""
     classes = details.classes
     cell_width = max(7, max(len(c) for c in classes) + 1)
-    header = " " * cell_width + " │ " + " │ ".join(c.center(cell_width) for c in classes) + " │  ← expected"
+    header = (
+        " " * cell_width
+        + " │ "
+        + " │ ".join(c.center(cell_width) for c in classes)
+        + " │  ← expected"
+    )
     print(header)
     print("─" * len(header))
     for predicted_idx, predicted_label in enumerate(classes):
@@ -111,7 +117,6 @@ def report(
     if show_json_tail:
         print()
         print("  ── wire JSON (matches frontend zod schema) ──")
-        # Just show a snippet to keep output focused.
         payload = d.model_dump(by_alias=True)
         print(
             "  "
@@ -130,69 +135,44 @@ def scenario_1_balanced_three_class() -> None:
     pairs = [
         ("book", "book"),
         ("book", "book"),
-        ("book", "cancel"),  # FN_book, FP_cancel
+        ("book", "cancel"),
         ("cancel", "cancel"),
         ("cancel", "cancel"),
-        ("cancel", "reschedule"),  # FN_cancel, FP_reschedule
+        ("cancel", "reschedule"),
         ("reschedule", "reschedule"),
         ("reschedule", "reschedule"),
-        ("reschedule", "book"),  # FN_reschedule, FP_book
+        ("reschedule", "book"),
     ]
-    results = materialize_pairs(pairs)
-    evaluator = PrecisionDatasetEvaluator(
-        PrecisionDatasetEvaluatorConfig(
-            id="precision_intent",
-            name="precision_intent",
-            source_evaluator="intent_match",
-            classes=["book", "cancel", "reschedule"],
-            average="macro",
-        )
+    spec = PrecisionAggregatorSpec(
+        classes=["book", "cancel", "reschedule"], averaging="macro"
     )
+    evaluator = build_dataset_evaluator(spec, source_evaluator="intent_match")
     report(
         "Scenario 1 — Balanced 3-class (intent recognition)\n"
         "  Each class: 2 TP, 1 FP, 1 FN. Symmetric setup → macro = micro = 2/3.",
-        evaluator.evaluate(results),
+        evaluator.evaluate(materialize_pairs(pairs)),
         show_json_tail=True,
     )
 
 
 def scenario_2_imbalanced_two_class() -> None:
-    """Rare-positive case — why macro vs micro matters.
-
-    20 datapoints. Only 4 are actually positive (the rare class). A weak
-    classifier could trivially get high accuracy by predicting "negative"
-    everywhere — micro precision masks that, macro doesn't.
-    """
+    """Rare-positive case — why macro vs micro matters."""
     pairs: list[tuple[str, str]] = []
-    # 16 true negatives where the classifier said "negative" (correct).
     pairs += [("negative", "negative")] * 13
-    # 3 false positives — classifier hallucinated "positive" on actual negatives.
     pairs += [("negative", "positive")] * 3
-    # 2 true positives.
     pairs += [("positive", "positive")] * 2
-    # 2 false negatives — classifier missed real positives.
     pairs += [("positive", "negative")] * 2
 
     results = materialize_pairs(pairs)
     classes = ["positive", "negative"]
 
-    macro = PrecisionDatasetEvaluator(
-        PrecisionDatasetEvaluatorConfig(
-            id="p_macro",
-            name="precision (macro)",
-            source_evaluator="positive_match",
-            classes=classes,
-            average="macro",
-        )
+    macro = build_dataset_evaluator(
+        PrecisionAggregatorSpec(classes=classes, averaging="macro"),
+        source_evaluator="positive_match",
     )
-    micro = PrecisionDatasetEvaluator(
-        PrecisionDatasetEvaluatorConfig(
-            id="p_micro",
-            name="precision (micro)",
-            source_evaluator="positive_match",
-            classes=classes,
-            average="micro",
-        )
+    micro = build_dataset_evaluator(
+        PrecisionAggregatorSpec(classes=classes, averaging="micro"),
+        source_evaluator="positive_match",
     )
     report(
         "Scenario 2a — Imbalanced 2-class, MACRO precision\n"
@@ -202,8 +182,7 @@ def scenario_2_imbalanced_two_class() -> None:
     )
     report(
         "Scenario 2b — Same data, MICRO precision\n"
-        "  Pools TP/FP across classes. In a 2-class case this equals accuracy.\n"
-        "  Notice macro << micro — that's the bias you'd miss with micro alone.",
+        "  Pools TP/FP across classes. In a 2-class case this equals accuracy.",
         micro.evaluate(results),
     )
 
@@ -213,69 +192,35 @@ def scenario_3_precision_vs_recall_vs_f() -> None:
     pairs = [
         ("yes", "yes"),
         ("yes", "yes"),
-        ("no", "yes"),  # FP for yes
-        ("no", "yes"),  # FP for yes
+        ("no", "yes"),
+        ("no", "yes"),
         ("no", "no"),
         ("no", "no"),
-        ("yes", "no"),  # FN for yes
+        ("yes", "no"),
     ]
     results = materialize_pairs(pairs)
     classes = ["yes", "no"]
 
-    p = PrecisionDatasetEvaluator(
-        PrecisionDatasetEvaluatorConfig(
-            id="p",
-            name="precision",
+    evaluators = {
+        "Scenario 3a — Precision on a recall-favourable dataset": build_dataset_evaluator(
+            PrecisionAggregatorSpec(classes=classes, averaging="macro"),
             source_evaluator="yes_match",
-            classes=classes,
-            average="macro",
-        )
-    )
-    r = RecallDatasetEvaluator(
-        RecallDatasetEvaluatorConfig(
-            id="r",
-            name="recall",
+        ),
+        "Scenario 3b — Recall (same data — note 'yes' recall is 1.0)": build_dataset_evaluator(
+            RecallAggregatorSpec(classes=classes, averaging="macro"),
             source_evaluator="yes_match",
-            classes=classes,
-            average="macro",
-        )
-    )
-    f1 = FScoreDatasetEvaluator(
-        FScoreDatasetEvaluatorConfig(
-            id="f1",
-            name="f1",
+        ),
+        "Scenario 3c — F1 (harmonic mean of P and R)": build_dataset_evaluator(
+            FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=1.0),
             source_evaluator="yes_match",
-            classes=classes,
-            average="macro",
-            f_value=1.0,
-        )
-    )
-    f2 = FScoreDatasetEvaluator(
-        FScoreDatasetEvaluatorConfig(
-            id="f2",
-            name="f2",
+        ),
+        "Scenario 3d — F2 (β=2 weighs recall higher — score moves toward recall)": build_dataset_evaluator(
+            FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=2.0),
             source_evaluator="yes_match",
-            classes=classes,
-            average="macro",
-            f_value=2.0,
-        )
-    )
-    report(
-        "Scenario 3a — Precision on a recall-favourable dataset",
-        p.evaluate(results),
-    )
-    report(
-        "Scenario 3b — Recall (same data — note 'yes' recall is 1.0)",
-        r.evaluate(results),
-    )
-    report(
-        "Scenario 3c — F1 (harmonic mean of P and R)",
-        f1.evaluate(results),
-    )
-    report(
-        "Scenario 3d — F2 (β=2 weighs recall higher — score moves toward recall)",
-        f2.evaluate(results),
-    )
+        ),
+    }
+    for title, evaluator in evaluators.items():
+        report(title, evaluator.evaluate(results))
 
 
 def scenario_4_skipped_datapoints() -> None:
@@ -283,19 +228,14 @@ def scenario_4_skipped_datapoints() -> None:
     results = [
         make_result("cat", "cat"),
         make_result("dog", "dog"),
-        make_result("cat", "platypus"),  # actual not in classes → skipped
-        make_result("zebra", "cat"),  # expected not in classes → skipped
+        make_result("cat", "platypus"),
+        make_result("zebra", "cat"),
         EvaluationResultDto(score=1.0, details="bare string — no justification"),
         EvaluationResultDto(score=0.0, details={"unrelated": "shape"}),
     ]
-    evaluator = PrecisionDatasetEvaluator(
-        PrecisionDatasetEvaluatorConfig(
-            id="precision_robustness",
-            name="precision_robustness",
-            source_evaluator="any_match",
-            classes=["cat", "dog"],
-            average="macro",
-        )
+    evaluator = build_dataset_evaluator(
+        PrecisionAggregatorSpec(classes=["cat", "dog"], averaging="macro"),
+        source_evaluator="any_match",
     )
     report(
         "Scenario 4 — Skipped datapoints (out-of-vocab + malformed details)\n"
@@ -309,33 +249,23 @@ def scenario_4_skipped_datapoints() -> None:
 def scenario_5_realistic_intent_classifier() -> None:
     """A larger, more interesting 4-class dataset — uneven per-class performance."""
     pairs = [
-        # 'book' is easy: classifier handles it well
         *[("book", "book")] * 10,
         ("book", "cancel"),
-        # 'cancel' is medium: a few errors
         *[("cancel", "cancel")] * 6,
         ("cancel", "book"),
         ("cancel", "modify"),
-        # 'reschedule' is hard: classifier confuses it with 'modify'
         ("reschedule", "reschedule"),
         ("reschedule", "reschedule"),
         ("reschedule", "modify"),
         ("reschedule", "modify"),
-        # 'modify' is rare: only 2 cases, classifier gets one
         ("modify", "modify"),
         ("modify", "reschedule"),
     ]
     results = materialize_pairs(pairs)
     classes = ["book", "cancel", "reschedule", "modify"]
-    macro_f1 = FScoreDatasetEvaluator(
-        FScoreDatasetEvaluatorConfig(
-            id="f1_4class",
-            name="f1_4class",
-            source_evaluator="intent_match",
-            classes=classes,
-            average="macro",
-            f_value=1.0,
-        )
+    macro_f1 = build_dataset_evaluator(
+        FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=1.0),
+        source_evaluator="intent_match",
     )
     report(
         "Scenario 5 — Realistic 4-class intent classifier\n"
@@ -346,6 +276,7 @@ def scenario_5_realistic_intent_classifier() -> None:
 
 
 def main() -> None:
+    """Run every scenario sequentially."""
     scenario_1_balanced_three_class()
     scenario_2_imbalanced_two_class()
     scenario_3_precision_vs_recall_vs_f()
diff --git a/packages/uipath/src/uipath/_cli/cli_eval.py b/packages/uipath/src/uipath/_cli/cli_eval.py
index 2e35db849..e101717d6 100644
--- a/packages/uipath/src/uipath/_cli/cli_eval.py
+++ b/packages/uipath/src/uipath/_cli/cli_eval.py
@@ -412,13 +412,6 @@ async def execute_eval():
                             get_agent_model(eval_context.runtime_schema),
                         )
 
-                        eval_context.dataset_evaluators = (
-                            await EvalHelpers.load_dataset_evaluators(
-                                resolved_eval_set_path,
-                                eval_context.evaluation_set,
-                            )
-                        )
-
                         # Runtime is not required anymore.
                         await runtime.dispose()
 
diff --git a/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py b/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py
new file mode 100644
index 000000000..fde129506
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py
@@ -0,0 +1,53 @@
+"""Aggregator specs embedded in per-datapoint classification evaluator configs.
+
+Each aggregator is a self-contained run-level metric (precision / recall /
+f-score) attached to a classification evaluator. Specs do not share any
+properties — each variant declares its own ``classes``, ``averaging``, and
+(for fscore) ``f_value`` independently. This keeps each aggregator's contract
+explicit at the JSON level: nothing is hoisted up to the evaluator and silently
+applied to siblings.
+"""
+
+from __future__ import annotations
+
+from typing import Annotated, Literal, Union
+
+from pydantic import BaseModel, ConfigDict, Field
+from pydantic.alias_generators import to_camel
+
+
+class PrecisionAggregatorSpec(BaseModel):
+    """Run-level precision aggregator (multiclass, micro or macro averaged)."""
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    type: Literal["precision"] = "precision"
+    classes: list[str] = Field(..., min_length=1)
+    averaging: Literal["macro", "micro"]
+
+
+class RecallAggregatorSpec(BaseModel):
+    """Run-level recall aggregator (multiclass, micro or macro averaged)."""
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    type: Literal["recall"] = "recall"
+    classes: list[str] = Field(..., min_length=1)
+    averaging: Literal["macro", "micro"]
+
+
+class FScoreAggregatorSpec(BaseModel):
+    """Run-level F-beta aggregator (multiclass, micro or macro averaged)."""
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    type: Literal["fscore"] = "fscore"
+    classes: list[str] = Field(..., min_length=1)
+    averaging: Literal["macro", "micro"]
+    f_value: float = Field(default=1.0, gt=0)
+
+
+AggregatorSpec = Annotated[
+    Union[PrecisionAggregatorSpec, RecallAggregatorSpec, FScoreAggregatorSpec],
+    Field(discriminator="type"),
+]
diff --git a/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py
index ae818a421..dcb33cc78 100644
--- a/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py
@@ -5,9 +5,15 @@
 EvaluationResultDto values from one named source evaluator and emits a single
 EvaluationResult that summarizes the dataset.
 
+Unlike the earlier pointer-style design, dataset evaluators no longer carry
+their own JSON config or a ``source_evaluator`` field. They are constructed by
+the factory directly from an :class:`AggregatorSpec` embedded in a per-datapoint
+classification evaluator's config, together with the source evaluator's name,
+which is supplied externally by the runtime when walking those configs.
+
 Concretely distinct from GenericBaseEvaluator: different evaluate() signature,
-different lifecycle. Kept as a parallel hierarchy rather than a subclass so
-the runtime cannot accidentally dispatch a dataset evaluator through the
+different lifecycle. Kept as a parallel hierarchy rather than a subclass so the
+runtime cannot accidentally dispatch a dataset evaluator through the
 per-datapoint loop.
 """
 
@@ -16,59 +22,44 @@
 from abc import ABC, abstractmethod
 from typing import Generic, TypeVar
 
-from pydantic import BaseModel, ConfigDict, Field
-from pydantic.alias_generators import to_camel
-
 from ..models.models import EvaluationResult, EvaluationResultDto
+from ._aggregator_specs import AggregatorSpec
 
-
-class BaseDatasetEvaluatorConfig(BaseModel):
-    """Configuration shared by all dataset-level evaluators."""
-
-    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
-
-    id: str
-    name: str
-    type: str
-    source_evaluator: str = Field(
-        ...,
-        description=(
-            "Name of the per-datapoint evaluator whose EvaluationResultDto values "
-            "this dataset evaluator consumes."
-        ),
-    )
-
-
-ConfigT = TypeVar("ConfigT", bound=BaseDatasetEvaluatorConfig)
+SpecT = TypeVar("SpecT", bound="AggregatorSpec")
 
 
-class BaseDatasetEvaluator(ABC, Generic[ConfigT]):
+class BaseDatasetEvaluator(ABC, Generic[SpecT]):
     """Abstract base for dataset-level evaluators.
 
-    Subclasses implement ``evaluate`` over the per-datapoint EvaluationResultDto
-    values produced by ``config.source_evaluator``.
+    Constructed from an :class:`AggregatorSpec` and the name of the source
+    per-datapoint evaluator whose results this aggregator consumes. The
+    dataset evaluator's "name" used for result keying is derived from
+    ``"{source_evaluator}.{spec.type}"`` so aggregators of different types on
+    the same source don't collide (two of the same type share one name).
     """
 
-    config: ConfigT
+    spec: SpecT
+    _source_evaluator: str
 
-    def __init__(self, config: ConfigT) -> None:
-        """Store the evaluator's configuration."""
-        self.config = config
-
-    @property
-    def name(self) -> str:
-        """Logical name of this evaluator instance (used as result-dict key)."""
-        return self.config.name
+    def __init__(self, spec: SpecT, source_evaluator: str) -> None:
+        """Store the aggregator spec and the source evaluator name."""
+        self.spec = spec
+        self._source_evaluator = source_evaluator
 
     @property
     def source_evaluator(self) -> str:
         """Name of the upstream evaluator whose results this one consumes."""
-        return self.config.source_evaluator
+        return self._source_evaluator
+
+    @property
+    def name(self) -> str:
+        """Stable key for this dataset evaluator's result in the output map."""
+        return f"{self._source_evaluator}.{self.spec.type}"
 
     @classmethod
     @abstractmethod
     def get_evaluator_id(cls) -> str:
-        """Stable identifier matching the ``type`` discriminator on configs."""
+        """Stable identifier matching the ``type`` discriminator on specs."""
 
     @abstractmethod
     def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
diff --git a/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py
index d56509228..0a65c2c64 100644
--- a/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py
@@ -19,6 +19,7 @@
     UiPathEvaluationError,
     UiPathEvaluationErrorCategory,
 )
+from ._aggregator_specs import AggregatorSpec
 from .base_evaluator import BaseEvaluationCriteria, BaseEvaluatorJustification
 from .output_evaluator import (
     BaseOutputEvaluator,
@@ -41,6 +42,12 @@ class BinaryClassificationEvaluatorConfig(
     positive_class: str
     metric_type: Literal["precision", "recall", "f-score"] = "precision"
     f_value: float = 1.0
+    # Optional run-level aggregators (precision / recall / fscore). Each is a
+    # self-contained spec carrying its own ``classes``, ``averaging``, and
+    # (for fscore) ``f_value``. The dataset-evaluator runtime walks this list
+    # after all per-datapoint evaluators complete and emits one structured
+    # result per aggregator keyed by ``{evaluator_name}.{aggregator.type}``.
+    aggregators: list[AggregatorSpec] | None = None
 
 
 class BinaryClassificationEvaluator(
diff --git a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
index 272541e21..b15020c25 100644
--- a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
+++ b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
@@ -3,15 +3,14 @@
 All three share the same internal machinery — a k x k confusion matrix built
 from each per-datapoint result's BaseEvaluatorJustification (expected, actual)
 strings. They differ only in the final formula and (for F-score) the beta
-parameter. The headline ``score`` is the micro or macro average per config;
-``details`` carries the full per-class breakdown plus the confusion matrix.
+parameter. The headline ``score`` is the micro or macro average per the
+embedded :class:`AggregatorSpec`; ``details`` carries the full per-class
+breakdown plus the confusion matrix.
 """
 
 from __future__ import annotations
 
-from typing import Literal
-
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict
 from pydantic.alias_generators import to_camel
 
 from ..models.models import (
@@ -20,7 +19,12 @@
     EvaluatorType,
     NumericEvaluationResult,
 )
-from .base_dataset_evaluator import BaseDatasetEvaluator, BaseDatasetEvaluatorConfig
+from ._aggregator_specs import (
+    FScoreAggregatorSpec,
+    PrecisionAggregatorSpec,
+    RecallAggregatorSpec,
+)
+from .base_dataset_evaluator import BaseDatasetEvaluator
 from .base_evaluator import BaseEvaluatorJustification
 
 
@@ -99,19 +103,15 @@ def counts_for(self, class_index: int) -> tuple[int, int, int, int]:
 def _build_confusion(
     results: list[EvaluationResultDto],
     classes: list[str],
-    case_sensitive: bool,
 ) -> _ConfusionData:
     """Build a confusion matrix from per-datapoint results.
 
     Results without a parseable justification are counted in ``n_skipped`` and
     omitted from the matrix. Pairs whose expected or actual label isn't in
-    ``classes`` are also skipped.
+    ``classes`` are also skipped. Labels are normalized to lowercase so a
+    classifier returning "Book" vs configured "book" still matches.
     """
-
-    def norm(label: str) -> str:
-        return label if case_sensitive else label.lower()
-
-    canonical_classes = [norm(c) for c in classes]
+    canonical_classes = [c.lower() for c in classes]
     index_of = {c: i for i, c in enumerate(canonical_classes)}
     k = len(canonical_classes)
     matrix = [[0] * k for _ in range(k)]
@@ -125,8 +125,8 @@ def norm(label: str) -> str:
         if j is None:
             n_skipped += 1
             continue
-        exp = norm(j[0])
-        act = norm(j[1])
+        exp = j[0].lower()
+        act = j[1].lower()
         if exp not in index_of or act not in index_of:
             n_skipped += 1
             continue
@@ -168,11 +168,7 @@ def _build_details(
     average: str,
     per_class_fn,
 ) -> tuple[ClassificationDetails, float]:
-    """Compute per-class values, micro, macro, and pick the headline.
-
-    Returns (details, headline_score). ``headline_score`` is the micro or macro
-    average per the evaluator's ``average`` setting.
-    """
+    """Compute per-class values, micro, macro, and pick the headline."""
     per_class: dict[str, PerClassMetrics] = {}
     total_tp = 0
     total_fp = 0
@@ -214,98 +210,58 @@ def _build_details(
     return details, headline
 
 
-# ─── configs ──────────────────────────────────────────────────────────────────
-
-
-class _BaseClassificationConfig(BaseDatasetEvaluatorConfig):
-    """Shared config for the three classification evaluators."""
-
-    classes: list[str] = Field(
-        ...,
-        min_length=1,
-        description="Class labels expected in the upstream evaluator's justifications.",
-    )
-    average: Literal["micro", "macro"] = "macro"
-    case_sensitive: bool = False
-
-
-class PrecisionDatasetEvaluatorConfig(_BaseClassificationConfig):
-    """Configuration for the dataset-level precision evaluator."""
-
-    type: str = EvaluatorType.DATASET_PRECISION.value
-
-
-class RecallDatasetEvaluatorConfig(_BaseClassificationConfig):
-    """Configuration for the dataset-level recall evaluator."""
-
-    type: str = EvaluatorType.DATASET_RECALL.value
-
-
-class FScoreDatasetEvaluatorConfig(_BaseClassificationConfig):
-    """Configuration for the dataset-level F-score evaluator."""
-
-    type: str = EvaluatorType.DATASET_F_SCORE.value
-    f_value: float = Field(default=1.0, gt=0, description="Beta value for F_beta.")
-
-
 # ─── evaluators ───────────────────────────────────────────────────────────────
 
 
-class PrecisionDatasetEvaluator(BaseDatasetEvaluator[PrecisionDatasetEvaluatorConfig]):
+class PrecisionDatasetEvaluator(BaseDatasetEvaluator[PrecisionAggregatorSpec]):
     """Dataset-level precision evaluator (multiclass, micro or macro averaged)."""
 
     @classmethod
     def get_evaluator_id(cls) -> str:
-        """Identifier matching the type discriminator on configs."""
+        """Identifier matching the type discriminator on specs."""
         return EvaluatorType.DATASET_PRECISION.value
 
     def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
         """Compute the precision report and return the headline as score."""
-        confusion = _build_confusion(
-            results, self.config.classes, self.config.case_sensitive
-        )
+        confusion = _build_confusion(results, self.spec.classes)
         details, headline = _build_details(
-            confusion, "precision", self.config.average, _precision_of
+            confusion, "precision", self.spec.averaging, _precision_of
         )
         return NumericEvaluationResult(score=headline, details=details)
 
 
-class RecallDatasetEvaluator(BaseDatasetEvaluator[RecallDatasetEvaluatorConfig]):
+class RecallDatasetEvaluator(BaseDatasetEvaluator[RecallAggregatorSpec]):
     """Dataset-level recall evaluator (multiclass, micro or macro averaged)."""
 
     @classmethod
     def get_evaluator_id(cls) -> str:
-        """Identifier matching the type discriminator on configs."""
+        """Identifier matching the type discriminator on specs."""
         return EvaluatorType.DATASET_RECALL.value
 
     def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
         """Compute the recall report and return the headline as score."""
-        confusion = _build_confusion(
-            results, self.config.classes, self.config.case_sensitive
-        )
+        confusion = _build_confusion(results, self.spec.classes)
         details, headline = _build_details(
-            confusion, "recall", self.config.average, _recall_of
+            confusion, "recall", self.spec.averaging, _recall_of
         )
         return NumericEvaluationResult(score=headline, details=details)
 
 
-class FScoreDatasetEvaluator(BaseDatasetEvaluator[FScoreDatasetEvaluatorConfig]):
+class FScoreDatasetEvaluator(BaseDatasetEvaluator[FScoreAggregatorSpec]):
     """Dataset-level F-beta evaluator (multiclass, micro or macro averaged)."""
 
     @classmethod
     def get_evaluator_id(cls) -> str:
-        """Identifier matching the type discriminator on configs."""
+        """Identifier matching the type discriminator on specs."""
         return EvaluatorType.DATASET_F_SCORE.value
 
     def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
         """Compute the F-beta report and return the headline as score."""
-        confusion = _build_confusion(
-            results, self.config.classes, self.config.case_sensitive
-        )
+        confusion = _build_confusion(results, self.spec.classes)
         details, headline = _build_details(
             confusion,
             "f_score",
-            self.config.average,
-            _f_score_of(self.config.f_value),
+            self.spec.averaging,
+            _f_score_of(self.spec.f_value),
         )
         return NumericEvaluationResult(score=headline, details=details)
diff --git a/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py
index 8ba0dbe62..d597b9085 100644
--- a/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py
+++ b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py
@@ -1,52 +1,61 @@
-"""Factory that instantiates dataset-level evaluators from configuration."""
+"""Factory that instantiates dataset-level evaluators from aggregator specs.
+
+Dataset evaluators are now built from a self-contained :class:`AggregatorSpec`
+embedded in a per-datapoint classification evaluator's config, plus the source
+evaluator's name (supplied by the runtime when walking those configs). The
+factory inspects the spec's ``type`` discriminator and returns the matching
+evaluator instance.
+"""
 
 from __future__ import annotations
 
 from typing import Any
 
-from ..models.models import EvaluatorType
+from ._aggregator_specs import (
+    AggregatorSpec,
+    FScoreAggregatorSpec,
+    PrecisionAggregatorSpec,
+    RecallAggregatorSpec,
+)
 from .base_dataset_evaluator import BaseDatasetEvaluator
 from .classification_dataset_evaluators import (
     FScoreDatasetEvaluator,
-    FScoreDatasetEvaluatorConfig,
     PrecisionDatasetEvaluator,
-    PrecisionDatasetEvaluatorConfig,
     RecallDatasetEvaluator,
-    RecallDatasetEvaluatorConfig,
 )
 
 _EVALUATOR_REGISTRY: dict[str, type[BaseDatasetEvaluator[Any]]] = {
-    EvaluatorType.DATASET_PRECISION.value: PrecisionDatasetEvaluator,
-    EvaluatorType.DATASET_RECALL.value: RecallDatasetEvaluator,
-    EvaluatorType.DATASET_F_SCORE.value: FScoreDatasetEvaluator,
-}
-
-_CONFIG_REGISTRY: dict[str, type[Any]] = {
-    EvaluatorType.DATASET_PRECISION.value: PrecisionDatasetEvaluatorConfig,
-    EvaluatorType.DATASET_RECALL.value: RecallDatasetEvaluatorConfig,
-    EvaluatorType.DATASET_F_SCORE.value: FScoreDatasetEvaluatorConfig,
+    "precision": PrecisionDatasetEvaluator,
+    "recall": RecallDatasetEvaluator,
+    "fscore": FScoreDatasetEvaluator,
 }
 
 
 def build_dataset_evaluator(
-    config_data: dict[str, Any],
+    spec: AggregatorSpec,
+    source_evaluator: str,
 ) -> BaseDatasetEvaluator[Any]:
-    """Build a dataset evaluator instance from a parsed JSON config dict.
+    """Build a dataset evaluator instance from an aggregator spec.
+
+    Args:
+        spec: A validated :class:`AggregatorSpec` (precision / recall / fscore).
+        source_evaluator: Name of the per-datapoint evaluator whose results
+            this aggregator consumes.
 
     Raises:
-        ValueError: If ``type`` is missing or unknown.
+        ValueError: If ``spec.type`` doesn't match any known aggregator.
     """
-    evaluator_type = config_data.get("type")
-    if not evaluator_type:
-        raise ValueError("Dataset evaluator config is missing required field 'type'")
-
-    config_cls = _CONFIG_REGISTRY.get(evaluator_type)
-    evaluator_cls = _EVALUATOR_REGISTRY.get(evaluator_type)
-    if config_cls is None or evaluator_cls is None:
+    evaluator_cls = _EVALUATOR_REGISTRY.get(spec.type)
+    if evaluator_cls is None:
         known = sorted(_EVALUATOR_REGISTRY.keys())
-        raise ValueError(
-            f"Unknown dataset evaluator type '{evaluator_type}'. Known types: {known}"
-        )
+        raise ValueError(f"Unknown aggregator type '{spec.type}'. Known types: {known}")
+    return evaluator_cls(spec, source_evaluator)
+
 
-    config = config_cls.model_validate(config_data)
-    return evaluator_cls(config)
+__all__ = [
+    "AggregatorSpec",
+    "PrecisionAggregatorSpec",
+    "RecallAggregatorSpec",
+    "FScoreAggregatorSpec",
+    "build_dataset_evaluator",
+]
diff --git a/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py
index 69790c3aa..842d13174 100644
--- a/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py
@@ -20,6 +20,7 @@
     UiPathEvaluationError,
     UiPathEvaluationErrorCategory,
 )
+from ._aggregator_specs import AggregatorSpec
 from .base_evaluator import BaseEvaluationCriteria, BaseEvaluatorJustification
 from .output_evaluator import (
     BaseOutputEvaluator,
@@ -43,6 +44,12 @@ class MulticlassClassificationEvaluatorConfig(
     metric_type: Literal["precision", "recall", "f-score"] = "f-score"
     averaging: Literal["micro", "macro"] = "macro"
     f_value: float = 1.0
+    # Optional run-level aggregators (precision / recall / fscore), each a spec with
+    # its own ``classes``, ``averaging``, and (for fscore) ``f_value``. After all
+    # per-datapoint evaluators complete, the runtime emits one result per aggregator
+    # keyed by ``{evaluator_name}.{aggregator.type}``. NOTE(review): two specs of the
+    # same type would share that key (later overwrites earlier) — confirm intended.
+    aggregators: list[AggregatorSpec] | None = None
 
 
 class MulticlassClassificationEvaluator(
diff --git a/packages/uipath/src/uipath/eval/helpers.py b/packages/uipath/src/uipath/eval/helpers.py
index fbe210a93..8405e4a7a 100644
--- a/packages/uipath/src/uipath/eval/helpers.py
+++ b/packages/uipath/src/uipath/eval/helpers.py
@@ -9,9 +9,7 @@
 
 from uipath.runtime.schema import UiPathRuntimeSchema
 
-from .evaluators.base_dataset_evaluator import BaseDatasetEvaluator
 from .evaluators.base_evaluator import GenericBaseEvaluator
-from .evaluators.dataset_evaluator_factory import build_dataset_evaluator
 from .evaluators.evaluator_factory import EvaluatorFactory
 from .mocks._types import InputMockingStrategy, LLMMockingStrategy
 from .models._conversational_utils import UiPathLegacyEvalChatMessagesMapper
@@ -282,92 +280,6 @@ async def load_evaluators(
 
         return evaluators
 
-    @staticmethod
-    async def load_dataset_evaluators(
-        eval_set_path: str,
-        evaluation_set: EvaluationSet,
-    ) -> list[BaseDatasetEvaluator[Any]]:
-        """Load dataset-level evaluators referenced by the evaluation set.
-
-        Dataset evaluator config JSON files are expected to live under
-        ``<eval_set_dir>/../dataset_evaluators/``, mirroring the evaluators
-        layout. Each config is matched to a reference by its top-level ``id``.
-
-        Validates that every dataset evaluator's ``source_evaluator`` is one of
-        the per-datapoint evaluators declared on the eval set; raises if not.
-        """
-        if evaluation_set is None:
-            raise ValueError("eval_set cannot be None")
-
-        dataset_ref_ids = {
-            ref.ref for ref in evaluation_set.dataset_evaluator_refs
-        }
-        if not dataset_ref_ids:
-            return []
-
-        dataset_dir = Path(eval_set_path).parent.parent / "dataset_evaluators"
-        if not dataset_dir.exists():
-            raise ValueError(
-                f"Dataset evaluators directory not found at '{dataset_dir}', "
-                f"but evaluation set references dataset evaluators: "
-                f"{sorted(dataset_ref_ids)}"
-            )
-
-        # Build the set of per-datapoint evaluator names so we can validate
-        # source_evaluator references up front.
-        if evaluation_set.evaluator_configs:
-            known_evaluator_names = {
-                ref.ref for ref in evaluation_set.evaluator_configs
-            }
-        else:
-            known_evaluator_names = set(evaluation_set.evaluator_refs)
-
-        dataset_evaluators: list[BaseDatasetEvaluator[Any]] = []
-        found_ids: set[str] = set()
-
-        for file in dataset_dir.glob("*.json"):
-            try:
-                with open(file, "r", encoding="utf-8") as f:
-                    data = json.load(f)
-            except json.JSONDecodeError as e:
-                raise ValueError(
-                    f"Invalid JSON in dataset evaluator file '{file}': {str(e)}."
-                ) from e
-
-            evaluator_id = data.get("id")
-            if evaluator_id not in dataset_ref_ids:
-                continue
-
-            try:
-                evaluator = build_dataset_evaluator(data)
-            except Exception as e:
-                raise ValueError(
-                    f"Failed to create dataset evaluator from file '{file}': "
-                    f"{str(e)}."
-                ) from e
-
-            if (
-                known_evaluator_names
-                and evaluator.source_evaluator not in known_evaluator_names
-            ):
-                raise ValueError(
-                    f"Dataset evaluator '{evaluator.name}' references "
-                    f"source_evaluator='{evaluator.source_evaluator}' which is "
-                    f"not declared in this evaluation set. Known evaluators: "
-                    f"{sorted(known_evaluator_names)}"
-                )
-
-            dataset_evaluators.append(evaluator)
-            found_ids.add(evaluator_id)
-
-        missing = dataset_ref_ids - found_ids
-        if missing:
-            raise ValueError(
-                f"Could not find the following dataset evaluators: {missing}"
-            )
-
-        return dataset_evaluators
-
 
 def get_agent_model(schema: UiPathRuntimeSchema) -> str | None:
     """Get agent model from the runtime schema metadata.
diff --git a/packages/uipath/src/uipath/eval/models/evaluation_set.py b/packages/uipath/src/uipath/eval/models/evaluation_set.py
index 74c822595..c80da8e14 100644
--- a/packages/uipath/src/uipath/eval/models/evaluation_set.py
+++ b/packages/uipath/src/uipath/eval/models/evaluation_set.py
@@ -173,9 +173,6 @@ class EvaluationSet(BaseModel):
     evaluator_configs: list[EvaluatorReference] = Field(
         default_factory=list, alias="evaluatorConfigs"
     )
-    dataset_evaluator_refs: list[EvaluatorReference] = Field(
-        default_factory=list, alias="datasetEvaluatorRefs"
-    )
     evaluations: list[EvaluationItem] = Field(default_factory=list)
     model_settings: list[EvaluationSetModelSettings] = Field(
         default_factory=list, alias="modelSettings"
diff --git a/packages/uipath/src/uipath/eval/runtime/context.py b/packages/uipath/src/uipath/eval/runtime/context.py
index f3b713320..b8224718c 100644
--- a/packages/uipath/src/uipath/eval/runtime/context.py
+++ b/packages/uipath/src/uipath/eval/runtime/context.py
@@ -4,7 +4,6 @@
 
 from uipath.runtime.schema import UiPathRuntimeSchema
 
-from ..evaluators.base_dataset_evaluator import BaseDatasetEvaluator
 from ..evaluators.base_evaluator import GenericBaseEvaluator
 from ..models.evaluation_set import EvaluationSet
 
@@ -28,4 +27,3 @@ class UiPathEvalContext:
     input_overrides: dict[str, Any] | None = None
     resume: bool = False
     job_id: str | None = None
-    dataset_evaluators: list[BaseDatasetEvaluator[Any]] | None = None
diff --git a/packages/uipath/src/uipath/eval/runtime/runtime.py b/packages/uipath/src/uipath/eval/runtime/runtime.py
index 5cadcc527..c64f8f158 100644
--- a/packages/uipath/src/uipath/eval/runtime/runtime.py
+++ b/packages/uipath/src/uipath/eval/runtime/runtime.py
@@ -45,8 +45,8 @@
 from uipath.runtime.schema import UiPathRuntimeSchema
 
 from .._execution_context import ExecutionSpanCollector
-from ..evaluators.base_dataset_evaluator import BaseDatasetEvaluator
 from ..evaluators.base_evaluator import GenericBaseEvaluator
+from ..evaluators.dataset_evaluator_factory import build_dataset_evaluator
 from ..evaluators.output_evaluator import OutputEvaluationCriteria
 from ..helpers import get_agent_model
 from ..mocks._cache_manager import CacheManager
@@ -205,19 +205,24 @@ def compute_evaluator_scores(
 
 def compute_dataset_evaluator_results(
     evaluation_set_results: list[UiPathEvalRunResult],
-    dataset_evaluators: Iterable[BaseDatasetEvaluator[Any]],
+    evaluators: Iterable[GenericBaseEvaluator[Any, Any, Any]],
 ) -> dict[str, EvaluationResultDto]:
-    """Run each dataset evaluator over its source evaluator's per-datapoint results.
+    """Run any dataset-level aggregators embedded in per-datapoint evaluator configs.
+
+    Walks ``evaluators`` looking for any whose config carries an ``aggregators``
+    list (currently only Binary/Multiclass classification). For each aggregator
+    spec, builds the corresponding dataset evaluator via the factory and runs it
+    over the per-datapoint results that came from that source evaluator.
 
     Args:
         evaluation_set_results: Per-datapoint results from the run.
-        dataset_evaluators: Dataset-level evaluator instances. Each is routed to
-            the per-datapoint results from ``evaluator.source_evaluator``.
+        evaluators: Per-datapoint evaluator instances that ran during this eval
+            set. Their configs may carry ``aggregators`` lists.
 
     Returns:
-        Dict mapping dataset evaluator name to its serialized EvaluationResultDto.
-        Dataset evaluators whose source produced no results are still invoked
-        with an empty list so they can emit a zeroed result.
+        Dict mapping ``"{evaluator_name}.{aggregator_type}"`` to the run-level
+        EvaluationResultDto. Aggregators whose source produced no results are
+        still invoked with an empty list so they emit a zeroed result.
     """
     results_by_evaluator: defaultdict[str, list[EvaluationResultDto]] = defaultdict(
         list
@@ -231,12 +236,21 @@ def compute_dataset_evaluator_results(
             )
 
     dataset_results: dict[str, EvaluationResultDto] = {}
-    for evaluator in dataset_evaluators:
-        source = evaluator.source_evaluator
-        evaluation_result = evaluator.evaluate(results_by_evaluator.get(source, []))
-        dataset_results[evaluator.name] = EvaluationResultDto.from_evaluation_result(
-            evaluation_result
-        )
+    for evaluator in evaluators:
+        evaluator_config = getattr(evaluator, "evaluator_config", None)
+        if evaluator_config is None:
+            continue
+        aggregators = getattr(evaluator_config, "aggregators", None)
+        if not aggregators:
+            continue
+        source_name = evaluator_config.name
+        source_results = results_by_evaluator.get(source_name, [])
+        for spec in aggregators:
+            dataset_evaluator = build_dataset_evaluator(spec, source_name)
+            evaluation_result = dataset_evaluator.evaluate(source_results)
+            dataset_results[dataset_evaluator.name] = (
+                EvaluationResultDto.from_evaluation_result(evaluation_result)
+            )
     return dataset_results
 
 
@@ -419,17 +433,18 @@ async def execute(self) -> UiPathRuntimeResult:
                         evaluators,
                     )
 
-                    # Run any dataset-level evaluators configured on the eval
-                    # set. Each consumes the per-datapoint results from one
-                    # named source evaluator and emits a single run-level
-                    # EvaluationResultDto stored on UiPathEvalOutput.
-                    if self.context.dataset_evaluators:
-                        results.dataset_evaluator_results = (
-                            compute_dataset_evaluator_results(
-                                results.evaluation_set_results,
-                                self.context.dataset_evaluators,
-                            )
+                    # Run any dataset-level aggregators embedded in per-datapoint
+                    # classification evaluator configs (the ``aggregators`` list).
+                    # Each aggregator consumes per-datapoint results from its
+                    # parent evaluator and emits one run-level EvaluationResultDto
+                    # keyed ``{evaluator_name}.{aggregator_type}`` on
+                    # UiPathEvalOutput.dataset_evaluator_results.
+                    results.dataset_evaluator_results = (
+                        compute_dataset_evaluator_results(
+                            results.evaluation_set_results,
+                            evaluators,
                         )
+                    )
 
                     # Configure span with output and metadata
                     await configure_eval_set_run_span(
diff --git a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
index 08d81818d..53e1e9855 100644
--- a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
+++ b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
@@ -1,28 +1,34 @@
 """Tests for dataset-level classification evaluators (Precision, Recall, FScore).
 
 Covers the math (2-class, 3-class, micro vs macro, F-beta), edge cases
-(empty input, out-of-vocab labels, malformed details), and runtime-level
-routing where compute_dataset_evaluator_results selects results by name.
+(empty input, out-of-vocab labels, malformed details), factory dispatch, and
+runtime-level routing where compute_dataset_evaluator_results walks
+per-datapoint evaluator configs' embedded ``aggregators`` lists.
 """
 
 import uuid
 
 import pytest
+from pydantic import BaseModel
 
+from uipath.eval.evaluators._aggregator_specs import (
+    FScoreAggregatorSpec,
+    PrecisionAggregatorSpec,
+    RecallAggregatorSpec,
+)
 from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification
 from uipath.eval.evaluators.classification_dataset_evaluators import (
     ClassificationDetails,
     FScoreDatasetEvaluator,
-    FScoreDatasetEvaluatorConfig,
     PrecisionDatasetEvaluator,
-    PrecisionDatasetEvaluatorConfig,
     RecallDatasetEvaluator,
-    RecallDatasetEvaluatorConfig,
 )
 from uipath.eval.evaluators.dataset_evaluator_factory import build_dataset_evaluator
+from uipath.eval.evaluators.multiclass_classification_evaluator import (
+    MulticlassClassificationEvaluator,
+)
 from uipath.eval.models.models import (
     EvaluationResultDto,
-    EvaluatorType,
     NumericEvaluationResult,
 )
 from uipath.eval.runtime._types import (
@@ -45,51 +51,54 @@ def _result(
     )
 
 
-def _precision(classes: list[str], average: str = "macro") -> PrecisionDatasetEvaluator:
-    return PrecisionDatasetEvaluator(
-        PrecisionDatasetEvaluatorConfig(
-            id="p1",
-            name="precision",
-            source_evaluator="intent_match",
-            classes=classes,
-            average=average,  # type: ignore[arg-type]
-        )
-    )
+def _precision(
+    classes: list[str], averaging: str = "macro"
+) -> PrecisionDatasetEvaluator:
+    spec = PrecisionAggregatorSpec(classes=classes, averaging=averaging)  # type: ignore[arg-type]
+    return PrecisionDatasetEvaluator(spec, source_evaluator="intent_match")
 
 
-def _recall(classes: list[str], average: str = "macro") -> RecallDatasetEvaluator:
-    return RecallDatasetEvaluator(
-        RecallDatasetEvaluatorConfig(
-            id="r1",
-            name="recall",
-            source_evaluator="intent_match",
-            classes=classes,
-            average=average,  # type: ignore[arg-type]
-        )
-    )
+def _recall(classes: list[str], averaging: str = "macro") -> RecallDatasetEvaluator:
+    spec = RecallAggregatorSpec(classes=classes, averaging=averaging)  # type: ignore[arg-type]
+    return RecallDatasetEvaluator(spec, source_evaluator="intent_match")
 
 
 def _fscore(
-    classes: list[str], average: str = "macro", f_value: float = 1.0
+    classes: list[str], averaging: str = "macro", f_value: float = 1.0
 ) -> FScoreDatasetEvaluator:
-    return FScoreDatasetEvaluator(
-        FScoreDatasetEvaluatorConfig(
-            id="f1",
-            name="fscore",
-            source_evaluator="intent_match",
-            classes=classes,
-            average=average,  # type: ignore[arg-type]
-            f_value=f_value,
-        )
+    spec = FScoreAggregatorSpec(
+        classes=classes,
+        averaging=averaging,  # type: ignore[arg-type]
+        f_value=f_value,
     )
+    return FScoreDatasetEvaluator(spec, source_evaluator="intent_match")
 
 
-def _details(result: NumericEvaluationResult) -> ClassificationDetails:
+def _details(result: object) -> ClassificationDetails:
     """Type-narrowing helper for asserting on details."""
+    assert isinstance(result, NumericEvaluationResult)
     assert isinstance(result.details, ClassificationDetails)
     return result.details
 
 
+def _multiclass_evaluator(
+    name: str,
+    classes: list[str],
+    aggregators: list[BaseModel],
+) -> MulticlassClassificationEvaluator:
+    """Build a per-datapoint multiclass evaluator with embedded aggregators."""
+    return MulticlassClassificationEvaluator.model_validate(
+        {
+            "id": str(uuid.uuid4()),
+            "evaluatorConfig": {
+                "name": name,
+                "classes": classes,
+                "aggregators": [spec.model_dump(by_alias=True) for spec in aggregators],
+            },
+        }
+    )
+
+
 class TestPrecisionEvaluator:
     def test_empty_input_returns_zeroed_result(self) -> None:
         result = _precision(["cat", "dog"]).evaluate([])
@@ -102,14 +111,13 @@ def test_empty_input_returns_zeroed_result(self) -> None:
         assert d.per_class["cat"].tn == 0
 
     def test_two_class_macro(self) -> None:
-        # 4 datapoints: 2 TP_yes, 1 FN_yes (predicted no), 1 FP_yes (predicted yes when expected no).
         results = [
             _result("yes", "yes"),
             _result("yes", "yes"),
-            _result("yes", "no"),  # FN for yes, FP for no
-            _result("no", "yes"),  # FP for yes, FN for no
+            _result("yes", "no"),
+            _result("no", "yes"),
         ]
-        result = _precision(["yes", "no"], average="macro").evaluate(results)
+        result = _precision(["yes", "no"], averaging="macro").evaluate(results)
         d = _details(result)
         # precision_yes = 2 / (2 + 1) = 2/3
         # precision_no  = 0 / (0 + 1) = 0
@@ -126,33 +134,27 @@ def test_two_class_micro_equals_accuracy(self) -> None:
             _result("yes", "no"),
             _result("no", "yes"),
         ]
-        result = _precision(["yes", "no"], average="micro").evaluate(results)
+        result = _precision(["yes", "no"], averaging="micro").evaluate(results)
         d = _details(result)
-        # micro precision = sum(TP) / sum(TP + FP)
-        # sum(TP) = 2 (yes diag) + 0 (no diag) = 2
-        # sum(FP) = 1 (yes off-diag row) + 1 (no off-diag row) = 2
-        # micro = 2 / (2 + 2) = 0.5 — equals accuracy 2/4 in the 2-class case
         assert d.micro == pytest.approx(0.5)
         assert result.score == pytest.approx(0.5)
 
     def test_three_class_macro(self) -> None:
-        # Each class gets 2 TP, 1 FP, 1 FN — symmetric setup
         pairs = [
             ("cat", "cat"),
             ("cat", "cat"),
-            ("cat", "dog"),  # FN_cat, FP_dog
+            ("cat", "dog"),
             ("dog", "dog"),
             ("dog", "dog"),
-            ("dog", "bird"),  # FN_dog, FP_bird
+            ("dog", "bird"),
             ("bird", "bird"),
             ("bird", "bird"),
-            ("bird", "cat"),  # FN_bird, FP_cat
+            ("bird", "cat"),
         ]
-        result = _precision(["cat", "dog", "bird"], average="macro").evaluate(
+        result = _precision(["cat", "dog", "bird"], averaging="macro").evaluate(
             [_result(e, a) for e, a in pairs]
         )
         d = _details(result)
-        # per-class precision = 2 / (2 + 1) = 2/3 for all three
         for label in ("cat", "dog", "bird"):
             m = d.per_class[label]
             assert m.tp == 2 and m.fp == 1 and m.fn == 1 and m.tn == 5
@@ -169,30 +171,24 @@ def test_two_class_macro(self) -> None:
             _result("yes", "no"),
             _result("no", "yes"),
         ]
-        result = _recall(["yes", "no"], average="macro").evaluate(results)
+        result = _recall(["yes", "no"], averaging="macro").evaluate(results)
         d = _details(result)
-        # recall_yes = TP / (TP + FN) = 2 / (2 + 1) = 2/3
-        # recall_no  = 0 / (0 + 1) = 0
-        # macro = 1/3
         assert d.per_class["yes"].value == pytest.approx(2 / 3)
         assert d.per_class["no"].value == pytest.approx(0.0)
         assert result.score == pytest.approx(1 / 3)
 
     def test_recall_differs_from_precision(self) -> None:
-        # Asymmetric example so precision != recall.
         results = [
-            _result("yes", "yes"),  # TP
-            _result("yes", "yes"),  # TP
-            _result("no", "yes"),  # FP for yes
-            _result("no", "yes"),  # FP for yes
-            _result("no", "no"),  # TP for no
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("no", "yes"),
+            _result("no", "yes"),
+            _result("no", "no"),
         ]
-        p = _details(_precision(["yes", "no"], average="macro").evaluate(results))
-        r = _details(_recall(["yes", "no"], average="macro").evaluate(results))
-        # precision_yes = 2/(2+2)=0.5, precision_no = 1/(1+0)=1.0
+        p = _details(_precision(["yes", "no"], averaging="macro").evaluate(results))
+        r = _details(_recall(["yes", "no"], averaging="macro").evaluate(results))
         assert p.per_class["yes"].value == pytest.approx(0.5)
         assert p.per_class["no"].value == pytest.approx(1.0)
-        # recall_yes = 2/(2+0)=1.0, recall_no = 1/(1+2)=1/3
         assert r.per_class["yes"].value == pytest.approx(1.0)
         assert r.per_class["no"].value == pytest.approx(1 / 3)
 
@@ -206,16 +202,13 @@ def test_f1_equals_harmonic_mean_of_p_and_r(self) -> None:
             _result("no", "yes"),
         ]
         f = _details(
-            _fscore(["yes", "no"], average="macro", f_value=1.0).evaluate(results)
+            _fscore(["yes", "no"], averaging="macro", f_value=1.0).evaluate(results)
         )
-        # precision_yes = 2/3, recall_yes = 2/3 -> F1_yes = 2/3
-        # precision_no  = 0,   recall_no  = 0    -> F1_no  = 0
         assert f.per_class["yes"].value == pytest.approx(2 / 3)
         assert f.per_class["no"].value == pytest.approx(0.0)
         assert f.macro == pytest.approx((2 / 3 + 0.0) / 2)
 
     def test_f_beta_emphasizes_recall_when_beta_above_one(self) -> None:
-        # Asymmetric setup: precision_yes = 0.5, recall_yes = 1.0.
         results = [
             _result("yes", "yes"),
             _result("yes", "yes"),
@@ -224,17 +217,14 @@ def test_f_beta_emphasizes_recall_when_beta_above_one(self) -> None:
             _result("no", "no"),
         ]
         f1 = _details(
-            _fscore(["yes", "no"], average="macro", f_value=1.0).evaluate(results)
+            _fscore(["yes", "no"], averaging="macro", f_value=1.0).evaluate(results)
         )
         f2 = _details(
-            _fscore(["yes", "no"], average="macro", f_value=2.0).evaluate(results)
+            _fscore(["yes", "no"], averaging="macro", f_value=2.0).evaluate(results)
         )
-        # F_beta with beta>1 weighs recall higher. Since recall_yes > precision_yes,
-        # F2_yes should be > F1_yes.
         assert f2.per_class["yes"].value > f1.per_class["yes"].value
 
     def test_three_class_micro_pools_across_classes(self) -> None:
-        # Same symmetric setup as the precision macro test.
         pairs = [
             ("cat", "cat"),
             ("cat", "cat"),
@@ -247,12 +237,10 @@ def test_three_class_micro_pools_across_classes(self) -> None:
             ("bird", "cat"),
         ]
         d = _details(
-            _fscore(["cat", "dog", "bird"], average="micro", f_value=1.0).evaluate(
+            _fscore(["cat", "dog", "bird"], averaging="micro", f_value=1.0).evaluate(
                 [_result(e, a) for e, a in pairs]
             )
         )
-        # micro precision == micro recall == 6/9 (accuracy when each off-diag
-        # contributes once to FP and once to FN globally). micro F1 = 6/9.
         assert d.micro == pytest.approx(6 / 9)
 
 
@@ -260,8 +248,8 @@ class TestSkippingAndEdgeCases:
     def test_out_of_vocab_labels_are_skipped(self) -> None:
         results = [
             _result("cat", "cat"),
-            _result("cat", "platypus"),  # actual not in classes
-            _result("zebra", "dog"),  # expected not in classes
+            _result("cat", "platypus"),
+            _result("zebra", "dog"),
         ]
         d = _details(_precision(["cat", "dog"]).evaluate(results))
         assert d.n_total == 3 and d.n_scored == 1 and d.n_skipped == 2
@@ -275,7 +263,7 @@ def test_results_without_justification_are_skipped(self) -> None:
         d = _details(_precision(["cat", "dog"]).evaluate(results))
         assert d.n_total == 3 and d.n_scored == 1 and d.n_skipped == 2
 
-    def test_case_insensitive_by_default(self) -> None:
+    def test_case_insensitive(self) -> None:
         results = [_result("Cat", "CAT"), _result("DOG", "dog")]
         d = _details(_precision(["cat", "dog"]).evaluate(results))
         assert d.per_class["cat"].tp == 1
@@ -283,48 +271,97 @@ def test_case_insensitive_by_default(self) -> None:
 
 
 class TestFactory:
-    def test_builds_evaluator_from_dict(self) -> None:
-        config_data = {
-            "id": "precision_intent",
-            "name": "precision_intent",
-            "type": EvaluatorType.DATASET_PRECISION.value,
-            "sourceEvaluator": "intent_match",
-            "classes": ["yes", "no"],
-            "average": "macro",
-        }
-        evaluator = build_dataset_evaluator(config_data)
+    """The factory takes an AggregatorSpec instance and a source name (not a dict)."""
+
+    def test_builds_precision_from_spec(self) -> None:
+        spec = PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro")
+        evaluator = build_dataset_evaluator(spec, "intent_match")
         assert isinstance(evaluator, PrecisionDatasetEvaluator)
         assert evaluator.source_evaluator == "intent_match"
-        assert evaluator.name == "precision_intent"
-
-    def test_unknown_type_raises(self) -> None:
-        with pytest.raises(ValueError, match="Unknown dataset evaluator type"):
-            build_dataset_evaluator(
-                {
-                    "id": "x",
-                    "name": "x",
-                    "type": "uipath-not-a-thing",
-                    "sourceEvaluator": "intent_match",
-                    "classes": ["yes", "no"],
-                }
-            )
+        assert evaluator.name == "intent_match.precision"
 
-    def test_missing_type_raises(self) -> None:
-        with pytest.raises(ValueError, match="missing required field 'type'"):
-            build_dataset_evaluator(
-                {
-                    "id": "x",
-                    "name": "x",
-                    "sourceEvaluator": "intent_match",
-                    "classes": ["yes", "no"],
-                }
-            )
+    def test_builds_recall_from_spec(self) -> None:
+        spec = RecallAggregatorSpec(classes=["yes", "no"], averaging="micro")
+        evaluator = build_dataset_evaluator(spec, "intent_match")
+        assert isinstance(evaluator, RecallDatasetEvaluator)
+        assert evaluator.name == "intent_match.recall"
+
+    def test_builds_fscore_from_spec(self) -> None:
+        spec = FScoreAggregatorSpec(
+            classes=["yes", "no"], averaging="macro", f_value=2.0
+        )
+        evaluator = build_dataset_evaluator(spec, "intent_match")
+        assert isinstance(evaluator, FScoreDatasetEvaluator)
+        assert evaluator.spec.f_value == 2.0
+
+
+class TestAggregatorSpecJsonRoundTrip:
+    """Pin the wire shape sent to the C# side."""
+
+    def test_precision_uses_self_contained_fields(self) -> None:
+        spec = PrecisionAggregatorSpec.model_validate(
+            {
+                "type": "precision",
+                "classes": ["book", "cancel", "reschedule"],
+                "averaging": "macro",
+            }
+        )
+        dumped = spec.model_dump(by_alias=True)
+        assert dumped == {
+            "type": "precision",
+            "classes": ["book", "cancel", "reschedule"],
+            "averaging": "macro",
+        }
+
+    def test_fscore_uses_camelcase_fvalue_on_wire(self) -> None:
+        spec = FScoreAggregatorSpec.model_validate(
+            {
+                "type": "fscore",
+                "classes": ["yes", "no"],
+                "averaging": "macro",
+                "fValue": 1.5,
+            }
+        )
+        assert spec.f_value == 1.5
+        dumped = spec.model_dump(by_alias=True)
+        assert dumped["fValue"] == 1.5
+        assert "f_value" not in dumped
+
+    def test_multiclass_evaluator_round_trips_aggregators(self) -> None:
+        """Per-datapoint evaluator config carries aggregators[]; survives dump+load."""
+        ev = _multiclass_evaluator(
+            "intent_classifier",
+            classes=["book", "cancel", "reschedule"],
+            aggregators=[
+                PrecisionAggregatorSpec(
+                    classes=["book", "cancel", "reschedule"], averaging="macro"
+                ),
+                FScoreAggregatorSpec(
+                    classes=["book", "cancel", "reschedule"],
+                    averaging="macro",
+                    f_value=1.0,
+                ),
+            ],
+        )
+        assert ev.evaluator_config.aggregators is not None
+        assert len(ev.evaluator_config.aggregators) == 2
+        assert ev.evaluator_config.aggregators[0].type == "precision"
+        assert ev.evaluator_config.aggregators[1].type == "fscore"
 
 
 class TestComputeDatasetEvaluatorResults:
-    """End-to-end: dataset evaluator picks results by source_evaluator name."""
+    """End-to-end: runtime walks evaluator configs' aggregators[]."""
+
+    def test_walks_aggregators_on_classification_evaluator(self) -> None:
+        evaluator = _multiclass_evaluator(
+            "intent_match",
+            classes=["yes", "no"],
+            aggregators=[
+                PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro"),
+                RecallAggregatorSpec(classes=["yes", "no"], averaging="macro"),
+            ],
+        )
 
-    def test_routes_to_correct_source_and_ignores_others(self) -> None:
         eval_results = [
             UiPathEvalRunResult(
                 evaluation_name="dp1",
@@ -353,18 +390,42 @@ def test_routes_to_correct_source_and_ignores_others(self) -> None:
             ),
         ]
 
-        out = compute_dataset_evaluator_results(
-            eval_results, [_precision(["yes", "no"], average="macro")]
+        out = compute_dataset_evaluator_results(eval_results, [evaluator])
+        # Two aggregators on intent_match → two keys, prefixed by source name.
+        assert set(out) == {"intent_match.precision", "intent_match.recall"}
+        precision_dto = out["intent_match.precision"]
+        assert isinstance(precision_dto, EvaluationResultDto)
+        assert isinstance(precision_dto.details, dict)
+        # The unrelated 0.5 score from some_other_evaluator must NOT be in the matrix.
+        assert precision_dto.details["n_scored"] == 2
+
+    def test_evaluator_without_aggregators_is_skipped(self) -> None:
+        evaluator = _multiclass_evaluator(
+            "intent_match", classes=["yes", "no"], aggregators=[]
         )
-        assert set(out) == {"precision"}
-        dto = out["precision"]
-        assert isinstance(dto, EvaluationResultDto)
-        # The unrelated 0.5 score from some_other_evaluator must NOT be in the
-        # matrix — only the two intent_match results count.
-        assert isinstance(dto.details, dict)
-        assert dto.details["n_scored"] == 2
+        eval_results = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1",
+                evaluation_run_results=[
+                    UiPathEvalRunResultDto(
+                        evaluator_name="intent_match",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=_result("yes", "yes"),
+                    ),
+                ],
+            ),
+        ]
+        out = compute_dataset_evaluator_results(eval_results, [evaluator])
+        assert out == {}
 
     def test_line_by_line_subresults_are_excluded(self) -> None:
+        evaluator = _multiclass_evaluator(
+            "intent_match",
+            classes=["yes", "no"],
+            aggregators=[
+                PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro"),
+            ],
+        )
         eval_results = [
             UiPathEvalRunResult(
                 evaluation_name="dp1",
@@ -383,13 +444,18 @@ def test_line_by_line_subresults_are_excluded(self) -> None:
                 ],
             ),
         ]
-        out = compute_dataset_evaluator_results(
-            eval_results, [_precision(["yes", "no"])]
-        )
-        assert isinstance(out["precision"].details, dict)
-        assert out["precision"].details["n_scored"] == 1
+        out = compute_dataset_evaluator_results(eval_results, [evaluator])
+        assert isinstance(out["intent_match.precision"].details, dict)
+        assert out["intent_match.precision"].details["n_scored"] == 1
 
     def test_source_with_no_results_produces_zeroed_report(self) -> None:
+        evaluator = _multiclass_evaluator(
+            "intent_match",
+            classes=["yes", "no"],
+            aggregators=[
+                PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro"),
+            ],
+        )
         eval_results = [
             UiPathEvalRunResult(
                 evaluation_name="dp1",
@@ -402,10 +468,8 @@ def test_source_with_no_results_produces_zeroed_report(self) -> None:
                 ],
             ),
         ]
-        out = compute_dataset_evaluator_results(
-            eval_results, [_precision(["yes", "no"])]
-        )
-        dto = out["precision"]
+        out = compute_dataset_evaluator_results(eval_results, [evaluator])
+        dto = out["intent_match.precision"]
         assert dto.score == 0.0
         assert isinstance(dto.details, dict)
         assert dto.details["n_scored"] == 0

From 77fcc109777dd2ba943e4ff3c2d3745dbed7dc21 Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Thu, 18 Jun 2026 21:27:47 -0700
Subject: [PATCH 06/13] feat(eval): wire sample classification evaluators to
 embedded aggregators

Update binary_classification_agent and multiclass_classification_simple
sample evaluator JSONs to include the new aggregators[] field. Each
aggregator carries its own classes, averaging, and (for fscore) fValue.
Update the e2e test to also assert the dataset-level results land in
UiPathEvalOutput.dataset_evaluator_results, keyed
"{evaluator_name}.{aggregator_type}".

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../evaluators/binary-classification.json     | 22 +++++++++++++++++--
 .../evaluators/multiclass-classification.json | 22 +++++++++++++++++--
 .../eval/test_classification_samples_e2e.py   | 21 ++++++++++++++++++
 3 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json b/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json
index 21f7d6850..d2cc64b71 100644
--- a/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json
+++ b/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json
@@ -1,7 +1,7 @@
 {
   "version": "1.0",
   "id": "BinarySpamPrecision",
-  "description": "Precision on the 'spam' positive class",
+  "description": "Precision on the 'spam' positive class, plus run-level aggregators",
   "evaluatorTypeId": "uipath-binary-classification",
   "evaluatorConfig": {
     "name": "BinarySpamPrecision",
@@ -11,6 +11,24 @@
     "fValue": 1.0,
     "defaultEvaluationCriteria": {
       "expectedClass": "ham"
-    }
+    },
+    "aggregators": [
+      {
+        "type": "precision",
+        "classes": ["spam", "ham"],
+        "averaging": "macro"
+      },
+      {
+        "type": "recall",
+        "classes": ["spam", "ham"],
+        "averaging": "macro"
+      },
+      {
+        "type": "fscore",
+        "classes": ["spam", "ham"],
+        "averaging": "macro",
+        "fValue": 1.0
+      }
+    ]
   }
 }
diff --git a/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json b/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json
index 859a18562..871afbc21 100644
--- a/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json
+++ b/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json
@@ -1,7 +1,7 @@
 {
   "version": "1.0",
   "id": "EmailMulticlassFScore",
-  "description": "Macro-averaged F1 across payments / support / spam",
+  "description": "Macro-averaged F1 across payments / support / spam, plus run-level aggregators",
   "evaluatorTypeId": "uipath-multiclass-classification",
   "evaluatorConfig": {
     "name": "EmailMulticlassFScore",
@@ -12,6 +12,24 @@
     "fValue": 1.0,
     "defaultEvaluationCriteria": {
       "expectedClass": "support"
-    }
+    },
+    "aggregators": [
+      {
+        "type": "precision",
+        "classes": ["payments", "support", "spam"],
+        "averaging": "macro"
+      },
+      {
+        "type": "recall",
+        "classes": ["payments", "support", "spam"],
+        "averaging": "macro"
+      },
+      {
+        "type": "fscore",
+        "classes": ["payments", "support", "spam"],
+        "averaging": "macro",
+        "fValue": 1.0
+      }
+    ]
   }
 }
diff --git a/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py b/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py
index 202363221..f2bdfa3cb 100644
--- a/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py
+++ b/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py
@@ -170,6 +170,15 @@ async def test_binary_classification_sample_end_to_end():
     # Precision = TP / (TP + FP) = 2 / (2 + 1) = 0.6666...
     assert averages["BinarySpamPrecision"] == pytest.approx(2 / 3, rel=1e-6)
 
+    # Dataset-level aggregators embedded on the evaluator config also fire.
+    # Each result keyed by "{evaluator_name}.{aggregator_type}".
+    keys = set(output.dataset_evaluator_results)
+    assert keys == {
+        "BinarySpamPrecision.precision",
+        "BinarySpamPrecision.recall",
+        "BinarySpamPrecision.fscore",
+    }
+
 
 async def test_multiclass_classification_sample_end_to_end():
     """Multiclass router: 6/7 correct, macro F1 = (0.8 + 0.8 + 1.0) / 3 = 0.8666..."""
@@ -191,3 +200,15 @@ async def test_multiclass_classification_sample_end_to_end():
     # payments F1=0.8 (P=2/3, R=1), support F1=0.8 (P=1, R=2/3), spam F1=1.0
     # macro = mean = 2.6 / 3
     assert averages["EmailMulticlassFScore"] == pytest.approx(2.6 / 3, rel=1e-6)
+
+    # Three embedded aggregators ran in addition to reduce_scores.
+    keys = set(output.dataset_evaluator_results)
+    assert keys == {
+        "EmailMulticlassFScore.precision",
+        "EmailMulticlassFScore.recall",
+        "EmailMulticlassFScore.fscore",
+    }
+    # The macro F1 computed by the embedded fscore aggregator should match
+    # reduce_scores' result (both walk the same confusion matrix).
+    fscore_result = output.dataset_evaluator_results["EmailMulticlassFScore.fscore"]
+    assert fscore_result.score == pytest.approx(2.6 / 3, rel=1e-6)

From c0436a3da061146b61b117dbe885606b4fd52fef Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Thu, 18 Jun 2026 21:49:44 -0700
Subject: [PATCH 07/13] refactor(eval): apply ponytail-review cleanup

- Collapse Precision/Recall/FScore into one ClassificationDatasetEvaluator
  switching on spec.type; factory becomes a one-liner.
- Inline _precision_of/_recall_of/_f_score_of and the one-use _ConfusionData
  helpers; switch _ConfusionData to @dataclass(slots=True).
- Drop dead get_evaluator_id() abstract + 3 overrides + matching
  EvaluatorType enum entries (ClassificationDatasetEvaluator dispatches
  on spec.type; nothing looked evaluators up by id).
- Pull repeated model_config into a private _AggregatorSpecBase.
- Drop registry + impossible-case ValueError in dataset_evaluator_factory
  (pydantic discriminator catches unknown types).
- Have _coerce_justification return the typed justification object.
- Drop the _source_evaluator private/property pair on BaseDatasetEvaluator.

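No change to computed scores or details. The importable surface does
shrink: PrecisionDatasetEvaluator / RecallDatasetEvaluator /
FScoreDatasetEvaluator, the EvaluatorType.DATASET_* members,
BaseDatasetEvaluator.get_evaluator_id, and the spec re-exports in
dataset_evaluator_factory.__all__ are removed.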

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../eval/evaluators/_aggregator_specs.py      |  16 +-
 .../eval/evaluators/base_dataset_evaluator.py |  16 +-
 .../classification_dataset_evaluators.py      | 227 ++++++------------
 .../evaluators/dataset_evaluator_factory.py   |  50 +---
 .../uipath/src/uipath/eval/models/models.py   |   3 -
 .../uipath/src/uipath/eval/runtime/runtime.py |  10 +-
 .../test_dataset_classification_evaluators.py |  27 ++-
 7 files changed, 116 insertions(+), 233 deletions(-)

diff --git a/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py b/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py
index fde129506..6c0b2b880 100644
--- a/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py
+++ b/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py
@@ -16,31 +16,31 @@
 from pydantic.alias_generators import to_camel
 
 
-class PrecisionAggregatorSpec(BaseModel):
-    """Run-level precision aggregator (multiclass, micro or macro averaged)."""
+class _AggregatorSpecBase(BaseModel):
+    """Shared pydantic config for every aggregator variant."""
 
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
 
+
+class PrecisionAggregatorSpec(_AggregatorSpecBase):
+    """Run-level precision aggregator (multiclass, micro or macro averaged)."""
+
     type: Literal["precision"] = "precision"
     classes: list[str] = Field(..., min_length=1)
     averaging: Literal["macro", "micro"]
 
 
-class RecallAggregatorSpec(BaseModel):
+class RecallAggregatorSpec(_AggregatorSpecBase):
     """Run-level recall aggregator (multiclass, micro or macro averaged)."""
 
-    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
-
     type: Literal["recall"] = "recall"
     classes: list[str] = Field(..., min_length=1)
     averaging: Literal["macro", "micro"]
 
 
-class FScoreAggregatorSpec(BaseModel):
+class FScoreAggregatorSpec(_AggregatorSpecBase):
     """Run-level F-beta aggregator (multiclass, micro or macro averaged)."""
 
-    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
-
     type: Literal["fscore"] = "fscore"
     classes: list[str] = Field(..., min_length=1)
     averaging: Literal["macro", "micro"]
diff --git a/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py
index dcb33cc78..c00eb666a 100644
--- a/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py
@@ -39,27 +39,17 @@ class BaseDatasetEvaluator(ABC, Generic[SpecT]):
     """
 
     spec: SpecT
-    _source_evaluator: str
+    source_evaluator: str
 
     def __init__(self, spec: SpecT, source_evaluator: str) -> None:
         """Store the aggregator spec and the source evaluator name."""
         self.spec = spec
-        self._source_evaluator = source_evaluator
-
-    @property
-    def source_evaluator(self) -> str:
-        """Name of the upstream evaluator whose results this one consumes."""
-        return self._source_evaluator
+        self.source_evaluator = source_evaluator
 
     @property
     def name(self) -> str:
         """Stable key for this dataset evaluator's result in the output map."""
-        return f"{self._source_evaluator}.{self.spec.type}"
-
-    @classmethod
-    @abstractmethod
-    def get_evaluator_id(cls) -> str:
-        """Stable identifier matching the ``type`` discriminator on specs."""
+        return f"{self.source_evaluator}.{self.spec.type}"
 
     @abstractmethod
     def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
diff --git a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
index b15020c25..ef6063b4c 100644
--- a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
+++ b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
@@ -10,34 +10,30 @@
 
 from __future__ import annotations
 
+from dataclasses import dataclass
+
 from pydantic import BaseModel, ConfigDict
 from pydantic.alias_generators import to_camel
 
 from ..models.models import (
     EvaluationResult,
     EvaluationResultDto,
-    EvaluatorType,
     NumericEvaluationResult,
 )
-from ._aggregator_specs import (
-    FScoreAggregatorSpec,
-    PrecisionAggregatorSpec,
-    RecallAggregatorSpec,
-)
+from ._aggregator_specs import AggregatorSpec, FScoreAggregatorSpec
 from .base_dataset_evaluator import BaseDatasetEvaluator
 from .base_evaluator import BaseEvaluatorJustification
 
 
-def _coerce_justification(details: object) -> tuple[str, str] | None:
-    """Extract (expected, actual) from an EvaluationResultDto.details payload."""
+def _coerce_justification(details: object) -> BaseEvaluatorJustification | None:
+    """Extract the BaseEvaluatorJustification from an EvaluationResultDto.details payload."""
     if isinstance(details, BaseEvaluatorJustification):
-        return details.expected, details.actual
+        return details
     if isinstance(details, dict):
         try:
-            j = BaseEvaluatorJustification.model_validate(details)
+            return BaseEvaluatorJustification.model_validate(details)
         except Exception:
             return None
-        return j.expected, j.actual
     return None
 
 
@@ -71,33 +67,15 @@ class ClassificationDetails(BaseModel):
     n_skipped: int
 
 
+@dataclass(slots=True)
 class _ConfusionData:
     """Internal: confusion matrix and per-class counts derived from results."""
 
-    __slots__ = ("classes", "matrix", "n_total", "n_scored", "n_skipped")
-
-    def __init__(
-        self,
-        classes: list[str],
-        matrix: list[list[int]],
-        n_total: int,
-        n_scored: int,
-        n_skipped: int,
-    ) -> None:
-        self.classes = classes
-        self.matrix = matrix
-        self.n_total = n_total
-        self.n_scored = n_scored
-        self.n_skipped = n_skipped
-
-    def counts_for(self, class_index: int) -> tuple[int, int, int, int]:
-        """Return (tp, fp, fn, tn) for a class index."""
-        k = len(self.classes)
-        tp = self.matrix[class_index][class_index]
-        fp = sum(self.matrix[class_index][j] for j in range(k)) - tp
-        fn = sum(self.matrix[j][class_index] for j in range(k)) - tp
-        tn = self.n_scored - tp - fp - fn
-        return tp, fp, fn, tn
+    classes: list[str]
+    matrix: list[list[int]]
+    n_total: int
+    n_scored: int
+    n_skipped: int
 
 
 def _build_confusion(
@@ -125,8 +103,8 @@ def _build_confusion(
         if j is None:
             n_skipped += 1
             continue
-        exp = j[0].lower()
-        act = j[1].lower()
+        exp = j.expected.lower()
+        act = j.actual.lower()
         if exp not in index_of or act not in index_of:
             n_skipped += 1
             continue
@@ -142,126 +120,77 @@ def _build_confusion(
     )
 
 
-def _precision_of(tp: int, fp: int, _fn: int, _tn: int) -> float:
-    return tp / (tp + fp) if (tp + fp) > 0 else 0.0
-
-
-def _recall_of(tp: int, _fp: int, fn: int, _tn: int) -> float:
-    return tp / (tp + fn) if (tp + fn) > 0 else 0.0
-
-
-def _f_score_of(beta: float):
-    beta_sq = beta * beta
-
-    def compute(tp: int, fp: int, fn: int, _tn: int) -> float:
-        p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
-        r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
-        denom = beta_sq * p + r
-        return (1 + beta_sq) * p * r / denom if denom > 0 else 0.0
-
-    return compute
-
-
-def _build_details(
-    confusion: _ConfusionData,
-    metric_name: str,
-    average: str,
-    per_class_fn,
-) -> tuple[ClassificationDetails, float]:
-    """Compute per-class values, micro, macro, and pick the headline."""
-    per_class: dict[str, PerClassMetrics] = {}
-    total_tp = 0
-    total_fp = 0
-    total_fn = 0
-
-    for c, label in enumerate(confusion.classes):
-        tp, fp, fn, tn = confusion.counts_for(c)
-        total_tp += tp
-        total_fp += fp
-        total_fn += fn
-        per_class[label] = PerClassMetrics(
-            tp=tp,
-            tn=tn,
-            fp=fp,
-            fn=fn,
-            support=tp + fn,
-            value=per_class_fn(tp, fp, fn, tn),
-        )
-
-    micro = per_class_fn(total_tp, total_fp, total_fn, 0)
-
-    k = len(confusion.classes)
-    macro = sum(per_class[c].value for c in confusion.classes) / k if k > 0 else 0.0
-
-    details = ClassificationDetails(
-        metric=metric_name,
-        average=average,
-        classes=confusion.classes,
-        confusion_matrix=confusion.matrix,
-        per_class=per_class,
-        micro=micro,
-        macro=macro,
-        n_total=confusion.n_total,
-        n_scored=confusion.n_scored,
-        n_skipped=confusion.n_skipped,
-    )
-
-    headline = micro if average == "micro" else macro
-    return details, headline
-
-
-# ─── evaluators ───────────────────────────────────────────────────────────────
+_METRIC_NAME = {"precision": "precision", "recall": "recall", "fscore": "f_score"}
 
 
-class PrecisionDatasetEvaluator(BaseDatasetEvaluator[PrecisionAggregatorSpec]):
-    """Dataset-level precision evaluator (multiclass, micro or macro averaged)."""
+class ClassificationDatasetEvaluator(BaseDatasetEvaluator[AggregatorSpec]):
+    """One implementation for all three classification aggregators.
 
-    @classmethod
-    def get_evaluator_id(cls) -> str:
-        """Identifier matching the type discriminator on specs."""
-        return EvaluatorType.DATASET_PRECISION.value
+    Dispatches on ``self.spec.type`` to pick the per-class metric formula:
+    precision, recall, or F-beta. The math (confusion-matrix build, per-class
+    counts, micro/macro averaging) is identical across the three.
+    """
 
     def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
-        """Compute the precision report and return the headline as score."""
+        """Compute the configured metric report and return the headline as score."""
         confusion = _build_confusion(results, self.spec.classes)
-        details, headline = _build_details(
-            confusion, "precision", self.spec.averaging, _precision_of
+        beta_sq = (
+            self.spec.f_value * self.spec.f_value
+            if isinstance(self.spec, FScoreAggregatorSpec)
+            else 0.0
         )
-        return NumericEvaluationResult(score=headline, details=details)
-
-
-class RecallDatasetEvaluator(BaseDatasetEvaluator[RecallAggregatorSpec]):
-    """Dataset-level recall evaluator (multiclass, micro or macro averaged)."""
-
-    @classmethod
-    def get_evaluator_id(cls) -> str:
-        """Identifier matching the type discriminator on specs."""
-        return EvaluatorType.DATASET_RECALL.value
-
-    def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
-        """Compute the recall report and return the headline as score."""
-        confusion = _build_confusion(results, self.spec.classes)
-        details, headline = _build_details(
-            confusion, "recall", self.spec.averaging, _recall_of
+        metric_type = self.spec.type
+
+        per_class: dict[str, PerClassMetrics] = {}
+        total_tp = 0
+        total_fp = 0
+        total_fn = 0
+        k = len(confusion.classes)
+
+        for c, label in enumerate(confusion.classes):
+            tp = confusion.matrix[c][c]
+            fp = sum(confusion.matrix[c][j] for j in range(k)) - tp
+            fn = sum(confusion.matrix[j][c] for j in range(k)) - tp
+            tn = confusion.n_scored - tp - fp - fn
+            total_tp += tp
+            total_fp += fp
+            total_fn += fn
+            per_class[label] = PerClassMetrics(
+                tp=tp,
+                tn=tn,
+                fp=fp,
+                fn=fn,
+                support=tp + fn,
+                value=_metric(metric_type, tp, fp, fn, beta_sq),
+            )
+
+        micro = _metric(metric_type, total_tp, total_fp, total_fn, beta_sq)
+        macro = sum(per_class[c].value for c in confusion.classes) / k
+
+        details = ClassificationDetails(
+            metric=_METRIC_NAME[metric_type],
+            average=self.spec.averaging,
+            classes=confusion.classes,
+            confusion_matrix=confusion.matrix,
+            per_class=per_class,
+            micro=micro,
+            macro=macro,
+            n_total=confusion.n_total,
+            n_scored=confusion.n_scored,
+            n_skipped=confusion.n_skipped,
         )
-        return NumericEvaluationResult(score=headline, details=details)
 
+        headline = micro if self.spec.averaging == "micro" else macro
+        return NumericEvaluationResult(score=headline, details=details)
 
-class FScoreDatasetEvaluator(BaseDatasetEvaluator[FScoreAggregatorSpec]):
-    """Dataset-level F-beta evaluator (multiclass, micro or macro averaged)."""
-
-    @classmethod
-    def get_evaluator_id(cls) -> str:
-        """Identifier matching the type discriminator on specs."""
-        return EvaluatorType.DATASET_F_SCORE.value
 
-    def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
-        """Compute the F-beta report and return the headline as score."""
-        confusion = _build_confusion(results, self.spec.classes)
-        details, headline = _build_details(
-            confusion,
-            "f_score",
-            self.spec.averaging,
-            _f_score_of(self.spec.f_value),
-        )
-        return NumericEvaluationResult(score=headline, details=details)
+def _metric(metric_type: str, tp: int, fp: int, fn: int, beta_sq: float) -> float:
+    """One formula switch covering precision / recall / F-beta."""
+    if metric_type == "precision":
+        return tp / (tp + fp) if (tp + fp) > 0 else 0.0
+    if metric_type == "recall":
+        return tp / (tp + fn) if (tp + fn) > 0 else 0.0
+    p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+    r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+    denom = beta_sq * p + r
+    return (1 + beta_sq) * p * r / denom if denom > 0 else 0.0
diff --git a/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py
index d597b9085..9cd895ad2 100644
--- a/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py
+++ b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py
@@ -1,61 +1,27 @@
 """Factory that instantiates dataset-level evaluators from aggregator specs.
 
-Dataset evaluators are now built from a self-contained :class:`AggregatorSpec`
+Dataset evaluators are built from a self-contained :class:`AggregatorSpec`
 embedded in a per-datapoint classification evaluator's config, plus the source
-evaluator's name (supplied by the runtime when walking those configs). The
-factory inspects the spec's ``type`` discriminator and returns the matching
-evaluator instance.
+evaluator's name (supplied by the runtime when walking those configs). All
+three aggregator types share a single :class:`ClassificationDatasetEvaluator`
+implementation that dispatches on ``spec.type`` internally.
 """
 
 from __future__ import annotations
 
-from typing import Any
-
-from ._aggregator_specs import (
-    AggregatorSpec,
-    FScoreAggregatorSpec,
-    PrecisionAggregatorSpec,
-    RecallAggregatorSpec,
-)
-from .base_dataset_evaluator import BaseDatasetEvaluator
-from .classification_dataset_evaluators import (
-    FScoreDatasetEvaluator,
-    PrecisionDatasetEvaluator,
-    RecallDatasetEvaluator,
-)
-
-_EVALUATOR_REGISTRY: dict[str, type[BaseDatasetEvaluator[Any]]] = {
-    "precision": PrecisionDatasetEvaluator,
-    "recall": RecallDatasetEvaluator,
-    "fscore": FScoreDatasetEvaluator,
-}
+from ._aggregator_specs import AggregatorSpec
+from .classification_dataset_evaluators import ClassificationDatasetEvaluator
 
 
 def build_dataset_evaluator(
     spec: AggregatorSpec,
     source_evaluator: str,
-) -> BaseDatasetEvaluator[Any]:
+) -> ClassificationDatasetEvaluator:
     """Build a dataset evaluator instance from an aggregator spec.
 
     Args:
         spec: A validated :class:`AggregatorSpec` (precision / recall / fscore).
         source_evaluator: Name of the per-datapoint evaluator whose results
             this aggregator consumes.
-
-    Raises:
-        ValueError: If ``spec.type`` doesn't match any known aggregator.
     """
-    evaluator_cls = _EVALUATOR_REGISTRY.get(spec.type)
-    if evaluator_cls is None:
-        known = sorted(_EVALUATOR_REGISTRY.keys())
-        raise ValueError(f"Unknown aggregator type '{spec.type}'. Known types: {known}")
-    return evaluator_cls(spec, source_evaluator)
-
-
-__all__ = [
-    "AggregatorSpec",
-    "PrecisionAggregatorSpec",
-    "RecallAggregatorSpec",
-    "FScoreAggregatorSpec",
-    "build_dataset_evaluator",
-]
+    return ClassificationDatasetEvaluator(spec, source_evaluator)
diff --git a/packages/uipath/src/uipath/eval/models/models.py b/packages/uipath/src/uipath/eval/models/models.py
index 8945137e7..14c130c92 100644
--- a/packages/uipath/src/uipath/eval/models/models.py
+++ b/packages/uipath/src/uipath/eval/models/models.py
@@ -300,9 +300,6 @@ class EvaluatorType(str, Enum):
     TOOL_CALL_OUTPUT = "uipath-tool-call-output"
     BINARY_CLASSIFICATION = "uipath-binary-classification"
     MULTICLASS_CLASSIFICATION = "uipath-multiclass-classification"
-    DATASET_PRECISION = "uipath-dataset-precision"
-    DATASET_RECALL = "uipath-dataset-recall"
-    DATASET_F_SCORE = "uipath-dataset-f-score"
 
 
 class ToolCall(BaseModel):
diff --git a/packages/uipath/src/uipath/eval/runtime/runtime.py b/packages/uipath/src/uipath/eval/runtime/runtime.py
index c64f8f158..89f8f6c29 100644
--- a/packages/uipath/src/uipath/eval/runtime/runtime.py
+++ b/packages/uipath/src/uipath/eval/runtime/runtime.py
@@ -237,13 +237,11 @@ def compute_dataset_evaluator_results(
 
     dataset_results: dict[str, EvaluationResultDto] = {}
     for evaluator in evaluators:
-        evaluator_config = getattr(evaluator, "evaluator_config", None)
-        if evaluator_config is None:
+        config = getattr(evaluator, "evaluator_config", None)
+        aggregators = getattr(config, "aggregators", None)
+        if config is None or not aggregators:
             continue
-        aggregators = getattr(evaluator_config, "aggregators", None)
-        if not aggregators:
-            continue
-        source_name = evaluator_config.name
+        source_name = config.name
         source_results = results_by_evaluator.get(source_name, [])
         for spec in aggregators:
             dataset_evaluator = build_dataset_evaluator(spec, source_name)
diff --git a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
index 53e1e9855..29343b170 100644
--- a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
+++ b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
@@ -18,10 +18,8 @@
 )
 from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification
 from uipath.eval.evaluators.classification_dataset_evaluators import (
+    ClassificationDatasetEvaluator,
     ClassificationDetails,
-    FScoreDatasetEvaluator,
-    PrecisionDatasetEvaluator,
-    RecallDatasetEvaluator,
 )
 from uipath.eval.evaluators.dataset_evaluator_factory import build_dataset_evaluator
 from uipath.eval.evaluators.multiclass_classification_evaluator import (
@@ -53,25 +51,27 @@ def _result(
 
 def _precision(
     classes: list[str], averaging: str = "macro"
-) -> PrecisionDatasetEvaluator:
+) -> ClassificationDatasetEvaluator:
     spec = PrecisionAggregatorSpec(classes=classes, averaging=averaging)  # type: ignore[arg-type]
-    return PrecisionDatasetEvaluator(spec, source_evaluator="intent_match")
+    return ClassificationDatasetEvaluator(spec, source_evaluator="intent_match")
 
 
-def _recall(classes: list[str], averaging: str = "macro") -> RecallDatasetEvaluator:
+def _recall(
+    classes: list[str], averaging: str = "macro"
+) -> ClassificationDatasetEvaluator:
     spec = RecallAggregatorSpec(classes=classes, averaging=averaging)  # type: ignore[arg-type]
-    return RecallDatasetEvaluator(spec, source_evaluator="intent_match")
+    return ClassificationDatasetEvaluator(spec, source_evaluator="intent_match")
 
 
 def _fscore(
     classes: list[str], averaging: str = "macro", f_value: float = 1.0
-) -> FScoreDatasetEvaluator:
+) -> ClassificationDatasetEvaluator:
     spec = FScoreAggregatorSpec(
         classes=classes,
         averaging=averaging,  # type: ignore[arg-type]
         f_value=f_value,
     )
-    return FScoreDatasetEvaluator(spec, source_evaluator="intent_match")
+    return ClassificationDatasetEvaluator(spec, source_evaluator="intent_match")
 
 
 def _details(result: object) -> ClassificationDetails:
@@ -276,14 +276,16 @@ class TestFactory:
     def test_builds_precision_from_spec(self) -> None:
         spec = PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro")
         evaluator = build_dataset_evaluator(spec, "intent_match")
-        assert isinstance(evaluator, PrecisionDatasetEvaluator)
+        assert isinstance(evaluator, ClassificationDatasetEvaluator)
+        assert evaluator.spec.type == "precision"
         assert evaluator.source_evaluator == "intent_match"
         assert evaluator.name == "intent_match.precision"
 
     def test_builds_recall_from_spec(self) -> None:
         spec = RecallAggregatorSpec(classes=["yes", "no"], averaging="micro")
         evaluator = build_dataset_evaluator(spec, "intent_match")
-        assert isinstance(evaluator, RecallDatasetEvaluator)
+        assert isinstance(evaluator, ClassificationDatasetEvaluator)
+        assert evaluator.spec.type == "recall"
         assert evaluator.name == "intent_match.recall"
 
     def test_builds_fscore_from_spec(self) -> None:
@@ -291,7 +293,8 @@ def test_builds_fscore_from_spec(self) -> None:
             classes=["yes", "no"], averaging="macro", f_value=2.0
         )
         evaluator = build_dataset_evaluator(spec, "intent_match")
-        assert isinstance(evaluator, FScoreDatasetEvaluator)
+        assert isinstance(evaluator, ClassificationDatasetEvaluator)
+        assert isinstance(evaluator.spec, FScoreAggregatorSpec)
         assert evaluator.spec.f_value == 2.0
 
 

From 50c64f4862c57834437b1dba59266106f29e3b66 Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Thu, 18 Jun 2026 21:53:00 -0700
Subject: [PATCH 08/13] refactor(eval): apply ponytail-review cleanup
 (justification + demo)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add BaseEvaluatorJustification.try_from classmethod and collapse the
  three duplicate "instance | dict | other" coercion blocks in
  classification_dataset_evaluators, binary_classification_evaluator,
  and multiclass_classification_evaluator down to one line each.
- Replace the 80-line ASCII confusion-matrix pretty-printer in
  dataset_evaluators_demo with the structured JSON wire shape — the
  thing readers actually want to inspect.

Deferred from this PR: dropping reduce_scores / _micro_metric /
_macro_metric on Binary/Multiclass evaluators, and the matching
metric_type/averaging/f_value config fields. The runtime calls
GenericBaseEvaluator.reduce_scores per-evaluator to compute the
top-level evaluator score; the dataset evaluator framework adds
{source}.{type}-keyed metrics in addition to that score, it doesn't
replace it. Removing them would break the existing per-evaluator
headline. Worth a follow-up that either makes reduce_scores delegate
to the dataset evaluator framework or formally splits the two paths.

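No behavior change in the evaluators. The demo's printed output does
change (JSON payload instead of ASCII tables), and its show_json_tail
argument is now accepted but ignored.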

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../examples/dataset_evaluators_demo.py       | 68 ++-----------------
 .../uipath/eval/evaluators/base_evaluator.py  | 19 ++++++
 .../binary_classification_evaluator.py        | 10 +--
 .../classification_dataset_evaluators.py      | 10 +--
 .../multiclass_classification_evaluator.py    | 10 +--
 5 files changed, 30 insertions(+), 87 deletions(-)

diff --git a/packages/uipath/examples/dataset_evaluators_demo.py b/packages/uipath/examples/dataset_evaluators_demo.py
index 2d13f3572..6d887f3dd 100644
--- a/packages/uipath/examples/dataset_evaluators_demo.py
+++ b/packages/uipath/examples/dataset_evaluators_demo.py
@@ -13,7 +13,6 @@
 
 from __future__ import annotations
 
-import json
 from typing import Iterable
 
 from uipath.eval.evaluators._aggregator_specs import (
@@ -56,75 +55,18 @@ def print_header(title: str) -> None:
     print("═" * 78)
 
 
-def print_confusion(details: ClassificationDetails) -> None:
-    """Pretty-print the confusion matrix as a table."""
-    classes = details.classes
-    cell_width = max(7, max(len(c) for c in classes) + 1)
-    header = (
-        " " * cell_width
-        + " │ "
-        + " │ ".join(c.center(cell_width) for c in classes)
-        + " │  ← expected"
-    )
-    print(header)
-    print("─" * len(header))
-    for predicted_idx, predicted_label in enumerate(classes):
-        row_cells = [
-            str(details.confusion_matrix[predicted_idx][expected_idx]).rjust(cell_width)
-            for expected_idx in range(len(classes))
-        ]
-        print(predicted_label.ljust(cell_width) + " │ " + " │ ".join(row_cells) + " │")
-    print(" " * cell_width + "↑ predicted")
-
-
-def print_per_class(details: ClassificationDetails) -> None:
-    """One-row-per-class table of TP/TN/FP/FN + the metric."""
-    label_w = max(len("class"), max(len(c) for c in details.classes))
-    metric = details.metric
-    header = f"  {'class'.ljust(label_w)}  │  TP  TN  FP  FN  support  {metric}"
-    print(header)
-    print("  " + "─" * (len(header) - 2))
-    for cls, m in details.per_class.items():
-        print(
-            f"  {cls.ljust(label_w)}  │  "
-            f"{m.tp:>2}  {m.tn:>2}  {m.fp:>2}  {m.fn:>2}  {m.support:>7}  "
-            f"{m.value:.3f}"
-        )
-
-
 def report(
     title: str,
     result: NumericEvaluationResult,
     *,
-    show_json_tail: bool = False,
+    show_json_tail: bool = False,  # kept for call-site compat; payload is always emitted
 ) -> None:
-    """Render one scenario's result block."""
+    """Render one scenario's result block as JSON — the actual wire shape."""
+    _ = show_json_tail
     print_header(title)
     assert isinstance(result.details, ClassificationDetails)
-    d = result.details
-    print(
-        f"  metric = {d.metric}   average = {d.average}   "
-        f"score (headline) = {result.score:.4f}"
-    )
-    print(
-        f"  micro = {d.micro:.4f}   macro = {d.macro:.4f}   "
-        f"scored = {d.n_scored}/{d.n_total}   skipped = {d.n_skipped}"
-    )
-    print()
-    print_confusion(d)
-    print()
-    print_per_class(d)
-    if show_json_tail:
-        print()
-        print("  ── wire JSON (matches frontend zod schema) ──")
-        payload = d.model_dump(by_alias=True)
-        print(
-            "  "
-            + json.dumps(
-                {k: payload[k] for k in ("metric", "average", "micro", "macro")},
-                indent=2,
-            ).replace("\n", "\n  ")
-        )
+    print(f"  headline score = {result.score:.4f}")
+    print(result.details.model_dump_json(indent=2, by_alias=True))
 
 
 # ─── scenarios ────────────────────────────────────────────────────────────────
diff --git a/packages/uipath/src/uipath/eval/evaluators/base_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/base_evaluator.py
index 73fac46c6..285a022f4 100644
--- a/packages/uipath/src/uipath/eval/evaluators/base_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/base_evaluator.py
@@ -47,6 +47,25 @@ class BaseEvaluatorJustification(BaseModel):
     expected: str
     actual: str
 
+    @classmethod
+    def try_from(cls, details: object) -> "BaseEvaluatorJustification | None":
+        """Coerce a free-form details payload into a justification, or return None.
+
+        Accepts either an existing instance or a dict that ``model_validate`` can
+        parse. Anything else (str, None, malformed dict) yields ``None``. Used by
+        the classification evaluators + dataset evaluator framework to walk
+        per-datapoint results without each site re-implementing the same
+        isinstance/try/except dance.
+        """
+        if isinstance(details, cls):
+            return details
+        if isinstance(details, dict):
+            try:
+                return cls.model_validate(details)
+            except Exception:
+                return None
+        return None
+
 
 # Additional type variables for Config and Justification
 # Note: C must be BaseEvaluatorConfig[T] to ensure type consistency
diff --git a/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py
index 0a65c2c64..c3f394d96 100644
--- a/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py
@@ -105,14 +105,8 @@ def reduce_scores(self, results: list[EvaluationResultDto]) -> float:
         tp = fp = fn = 0
 
         for r in results:
-            if isinstance(r.details, BaseEvaluatorJustification):
-                details = r.details
-            elif isinstance(r.details, dict):
-                try:
-                    details = BaseEvaluatorJustification.model_validate(r.details)
-                except Exception:
-                    continue
-            else:
+            details = BaseEvaluatorJustification.try_from(r.details)
+            if details is None:
                 continue
             pred = details.actual
             exp = details.expected
diff --git a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
index ef6063b4c..f64ebcd63 100644
--- a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
+++ b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
@@ -27,14 +27,7 @@
 
 def _coerce_justification(details: object) -> BaseEvaluatorJustification | None:
     """Extract the BaseEvaluatorJustification from an EvaluationResultDto.details payload."""
-    if isinstance(details, BaseEvaluatorJustification):
-        return details
-    if isinstance(details, dict):
-        try:
-            return BaseEvaluatorJustification.model_validate(details)
-        except Exception:
-            return None
-    return None
+    return BaseEvaluatorJustification.try_from(details)
 
 
 class PerClassMetrics(BaseModel):
@@ -165,6 +158,7 @@ def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
             )
 
         micro = _metric(metric_type, total_tp, total_fp, total_fn, beta_sq)
+        # AggregatorSpec.classes has min_length=1, so k >= 1 always.
         macro = sum(per_class[c].value for c in confusion.classes) / k
 
         details = ClassificationDetails(
diff --git a/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py
index 842d13174..1fb736f2a 100644
--- a/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py
@@ -121,14 +121,8 @@ def reduce_scores(self, results: list[EvaluationResultDto]) -> float:
         # Reconstruct confusion matrix: confusion[pred_idx][exp_idx]
         confusion = [[0] * k for _ in range(k)]
         for r in results:
-            if isinstance(r.details, BaseEvaluatorJustification):
-                details = r.details
-            elif isinstance(r.details, dict):
-                try:
-                    details = BaseEvaluatorJustification.model_validate(r.details)
-                except Exception:
-                    continue
-            else:
+            details = BaseEvaluatorJustification.try_from(r.details)
+            if details is None:
                 continue
             pred = details.actual
             exp = details.expected

From ad32c22c64e7ccb5aaf8446454cf8bc9408f6c30 Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Thu, 18 Jun 2026 22:27:49 -0700
Subject: [PATCH 09/13] fix(eval): address adversarial-review feedback on
 dataset evaluators

- M2: drop _METRIC_NAME indirection. metric field on
  ClassificationDetails now uses spec.type verbatim ("fscore" not
  "f_score"), matching the discriminator on the wire.
- M3: document confusion_matrix orientation via Field(description=...).
  Matrix is [predicted_idx][expected_idx], i.e. the transpose of
  sklearn's [true][predicted] convention. Add a regression test pinning
  the orientation.
- M4: _metric raises ValueError on unknown metric_type instead of
  silently falling through to the F-beta formula. Defense in depth
  on top of pydantic's discriminator.
- M6: replace the defensive getattr on `aggregators` in
  compute_dataset_evaluator_results with isinstance narrowing on the
  classification config types. (`evaluator_config` itself is still read
  via getattr, because GenericBaseEvaluator doesn't declare it.)
  Mypy-clean; intent is now "classification configs declare
  aggregators" rather than "anything might have an aggregators
  attribute".
- L1: rename duplicate test_two_class_macro tests so pytest output
  disambiguates Precision vs Recall.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../classification_dataset_evaluators.py      | 32 +++++++++++++------
 .../uipath/src/uipath/eval/runtime/runtime.py | 24 ++++++++++++--
 .../test_dataset_classification_evaluators.py | 24 ++++++++++++--
 3 files changed, 65 insertions(+), 15 deletions(-)

diff --git a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
index ef6063b4c..70d74cd26 100644
--- a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
+++ b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
@@ -12,7 +12,7 @@
 
 from dataclasses import dataclass
 
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field
 from pydantic.alias_generators import to_camel
 
 from ..models.models import (
@@ -58,7 +58,17 @@ class ClassificationDetails(BaseModel):
     metric: str
     average: str
     classes: list[str]
-    confusion_matrix: list[list[int]]
+    confusion_matrix: list[list[int]] = Field(
+        ...,
+        description=(
+            "k x k confusion matrix indexed as "
+            "``confusion_matrix[predicted_idx][expected_idx]`` "
+            "(rows are predicted classes, columns are expected). "
+            "This is the transpose of sklearn's convention "
+            "(``[true][predicted]``); UI / consumer code must use the "
+            "orientation documented here."
+        ),
+    )
     per_class: dict[str, PerClassMetrics]
     micro: float
     macro: float
@@ -120,9 +130,6 @@ def _build_confusion(
     )
 
 
-_METRIC_NAME = {"precision": "precision", "recall": "recall", "fscore": "f_score"}
-
-
 class ClassificationDatasetEvaluator(BaseDatasetEvaluator[AggregatorSpec]):
     """One implementation for all three classification aggregators.
 
@@ -168,7 +175,7 @@ def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
         macro = sum(per_class[c].value for c in confusion.classes) / k
 
         details = ClassificationDetails(
-            metric=_METRIC_NAME[metric_type],
+            metric=metric_type,
             average=self.spec.averaging,
             classes=confusion.classes,
             confusion_matrix=confusion.matrix,
@@ -190,7 +197,12 @@ def _metric(metric_type: str, tp: int, fp: int, fn: int, beta_sq: float) -> floa
         return tp / (tp + fp) if (tp + fp) > 0 else 0.0
     if metric_type == "recall":
         return tp / (tp + fn) if (tp + fn) > 0 else 0.0
-    p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
-    r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
-    denom = beta_sq * p + r
-    return (1 + beta_sq) * p * r / denom if denom > 0 else 0.0
+    if metric_type == "fscore":
+        p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+        r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+        denom = beta_sq * p + r
+        return (1 + beta_sq) * p * r / denom if denom > 0 else 0.0
+    raise ValueError(
+        f"Unknown metric_type: {metric_type!r}. "
+        "Expected one of: precision, recall, fscore."
+    )
diff --git a/packages/uipath/src/uipath/eval/runtime/runtime.py b/packages/uipath/src/uipath/eval/runtime/runtime.py
index 89f8f6c29..987b6c4ae 100644
--- a/packages/uipath/src/uipath/eval/runtime/runtime.py
+++ b/packages/uipath/src/uipath/eval/runtime/runtime.py
@@ -46,7 +46,13 @@
 
 from .._execution_context import ExecutionSpanCollector
 from ..evaluators.base_evaluator import GenericBaseEvaluator
+from ..evaluators.binary_classification_evaluator import (
+    BinaryClassificationEvaluatorConfig,
+)
 from ..evaluators.dataset_evaluator_factory import build_dataset_evaluator
+from ..evaluators.multiclass_classification_evaluator import (
+    MulticlassClassificationEvaluatorConfig,
+)
 from ..evaluators.output_evaluator import OutputEvaluationCriteria
 from ..helpers import get_agent_model
 from ..mocks._cache_manager import CacheManager
@@ -237,13 +243,25 @@ def compute_dataset_evaluator_results(
 
     dataset_results: dict[str, EvaluationResultDto] = {}
     for evaluator in evaluators:
+        # Aggregators currently only live on classification evaluator configs.
+        # ``GenericBaseEvaluator`` doesn't declare ``evaluator_config``, so we
+        # retrieve it via ``getattr`` and narrow with ``isinstance`` to a
+        # classification config type before reading ``aggregators``. Widen the
+        # tuple if a future evaluator type grows an ``aggregators`` field.
         config = getattr(evaluator, "evaluator_config", None)
-        aggregators = getattr(config, "aggregators", None)
-        if config is None or not aggregators:
+        if not isinstance(
+            config,
+            (
+                BinaryClassificationEvaluatorConfig,
+                MulticlassClassificationEvaluatorConfig,
+            ),
+        ):
+            continue
+        if not config.aggregators:
             continue
         source_name = config.name
         source_results = results_by_evaluator.get(source_name, [])
-        for spec in aggregators:
+        for spec in config.aggregators:
             dataset_evaluator = build_dataset_evaluator(spec, source_name)
             evaluation_result = dataset_evaluator.evaluate(source_results)
             dataset_results[dataset_evaluator.name] = (
diff --git a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
index 29343b170..bb7d3538e 100644
--- a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
+++ b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
@@ -110,7 +110,27 @@ def test_empty_input_returns_zeroed_result(self) -> None:
         assert d.per_class["cat"].tp == 0
         assert d.per_class["cat"].tn == 0
 
-    def test_two_class_macro(self) -> None:
+    def test_confusion_matrix_is_predicted_by_expected(self) -> None:
+        # Pin the documented orientation: confusion_matrix[predicted][expected].
+        # Differs from sklearn's [true][predicted] convention.
+        results = [
+            _result("cat", "cat"),  # expected=cat, predicted=cat -> [cat][cat]
+            _result("cat", "dog"),  # expected=cat, predicted=dog -> [dog][cat]
+            _result("dog", "dog"),  # expected=dog, predicted=dog -> [dog][dog]
+            _result("dog", "dog"),
+        ]
+        d = _details(_precision(["cat", "dog"]).evaluate(results))
+        # classes -> index: cat=0, dog=1
+        # [predicted=cat][expected=cat] = 1
+        assert d.confusion_matrix[0][0] == 1
+        # [predicted=dog][expected=cat] = 1 (the FP for dog / FN for cat)
+        assert d.confusion_matrix[1][0] == 1
+        # [predicted=dog][expected=dog] = 2
+        assert d.confusion_matrix[1][1] == 2
+        # [predicted=cat][expected=dog] = 0
+        assert d.confusion_matrix[0][1] == 0
+
+    def test_precision_two_class_macro(self) -> None:
         results = [
             _result("yes", "yes"),
             _result("yes", "yes"),
@@ -164,7 +184,7 @@ def test_three_class_macro(self) -> None:
 
 
 class TestRecallEvaluator:
-    def test_two_class_macro(self) -> None:
+    def test_recall_two_class_macro(self) -> None:
         results = [
             _result("yes", "yes"),
             _result("yes", "yes"),

From 027901c96be416d791e76d93c3b2ca9d4a470a95 Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Thu, 18 Jun 2026 22:32:46 -0700
Subject: [PATCH 10/13] fix(eval): address adversarial-review feedback on
 classification samples
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- H1/H2: pydantic model_validator on Binary/Multiclass classification
  configs cross-checks aggregators against evaluator-level fields. Binary
  rejects aggregators whose `classes` doesn't include `positive_class`,
  and `fscore` aggregators whose `f_value` differs from the evaluator's
  when the evaluator's own `metric_type` is `f-score`.
  Multiclass extends this with the full class-coverage check and an
  `averaging` consistency check. Without this, a user could ship configs
  where the per-evaluator headline and the dataset aggregator silently
  scored disjoint label spaces or used different averaging or F-beta
  weights.
- H3: binary e2e test now asserts the precision/recall/fscore aggregator
  scores (5/6, 5/6, 0.8) instead of only the key set. A regression that
  zeros out all aggregator scores would now fail the test.
- H4: multiclass `evaluate()` no longer raises on out-of-vocab predicted
  class — it now returns score=0.0 with the OOV label preserved in the
  justification, mirroring binary's behavior. The dataset evaluator's
  confusion matrix already accounts for this via `n_skipped`.
  Configuration errors (expected_class outside vocab) still raise.
- M1: drop the `_coerce_justification` one-line wrapper; inline
  `BaseEvaluatorJustification.try_from(r.details)` at the single caller
  in `_build_confusion`.
- M2: preserve user-supplied class casing in `_ConfusionData.classes` and
  the `per_class` keys. The lowercase normalization is now only used for
  the internal lookup index, so a config with classes=["Spam","Ham"]
  surfaces "Spam"/"Ham" in the output rather than "spam"/"ham".
- M3 (multiclass `reduce_scores` + ClassificationDatasetEvaluator
  double-walking the same confusion matrix): deferred. Cleanest fix is
  to drop the evaluator-level `metric_type`/`averaging`/`f_value` fields
  and route the per-evaluator headline through the dataset evaluator
  framework — out of scope for this commit. Tracked as a follow-up.
- L1: refreshed test_classification_samples_e2e docstring to reflect
  the new aggregator-score coverage on the binary side.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../binary_classification_evaluator.py        |  59 ++++++-
 .../classification_dataset_evaluators.py      |  21 +--
 .../multiclass_classification_evaluator.py    |  87 ++++++++--
 .../eval/test_classification_samples_e2e.py   |  19 ++-
 .../evaluators/test_evaluator_methods.py      | 157 +++++++++++++++++-
 5 files changed, 314 insertions(+), 29 deletions(-)

diff --git a/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py
index c3f394d96..44a795d90 100644
--- a/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py
@@ -8,6 +8,8 @@
 
 from typing import Literal
 
+from pydantic import model_validator
+
 from ..models import (
     AgentExecution,
     EvaluationResult,
@@ -19,13 +21,22 @@
     UiPathEvaluationError,
     UiPathEvaluationErrorCategory,
 )
-from ._aggregator_specs import AggregatorSpec
+from ._aggregator_specs import AggregatorSpec, FScoreAggregatorSpec
 from .base_evaluator import BaseEvaluationCriteria, BaseEvaluatorJustification
 from .output_evaluator import (
     BaseOutputEvaluator,
     OutputEvaluatorConfig,
 )
 
+# Maps the evaluator-level ``metric_type`` strings to the corresponding
+# aggregator-spec ``type`` values. The two spellings differ historically:
+# the evaluator uses "f-score" (hyphen), the aggregator uses "fscore".
+_METRIC_TYPE_TO_AGGREGATOR_TYPE = {
+    "precision": "precision",
+    "recall": "recall",
+    "f-score": "fscore",
+}
+
 
 class BinaryClassificationEvaluationCriteria(BaseEvaluationCriteria):
     """Per-datapoint criteria: which class this sample should belong to."""
@@ -49,6 +60,52 @@ class BinaryClassificationEvaluatorConfig(
     # result per aggregator keyed by ``{evaluator_name}.{aggregator.type}``.
     aggregators: list[AggregatorSpec] | None = None
 
+    @model_validator(mode="after")
+    def _validate_aggregators_against_evaluator_config(
+        self,
+    ) -> "BinaryClassificationEvaluatorConfig":
+        """Reject aggregators that are inconsistent with the evaluator's own config.
+
+        Two checks:
+          * ``positive_class`` must appear in every aggregator's ``classes``
+            list (case-insensitive). Otherwise the per-datapoint headline
+            and the aggregator's confusion matrix score completely
+            disjoint label spaces.
+          * For each aggregator whose ``type`` matches the evaluator-level
+            ``metric_type`` (mapped via :data:`_METRIC_TYPE_TO_AGGREGATOR_TYPE`),
+            the aggregator's ``f_value`` must match the evaluator's
+            ``f_value``. Otherwise the per-evaluator headline produced via
+            ``reduce_scores`` and the dataset evaluator's per-aggregator
+            score diverge silently.
+        """
+        if not self.aggregators:
+            return self
+        positive_lower = self.positive_class.lower() if self.positive_class else ""
+        evaluator_aggregator_type = _METRIC_TYPE_TO_AGGREGATOR_TYPE.get(
+            self.metric_type
+        )
+        for spec in self.aggregators:
+            if positive_lower and positive_lower not in {
+                c.lower() for c in spec.classes
+            }:
+                raise ValueError(
+                    f"Aggregator '{spec.type}' on evaluator '{self.name}' "
+                    f"declares classes={spec.classes!r} but positive_class="
+                    f"{self.positive_class!r} is not in that list. Add the "
+                    "positive class to the aggregator's classes or remove it."
+                )
+            if spec.type == evaluator_aggregator_type and isinstance(
+                spec, FScoreAggregatorSpec
+            ):
+                if spec.f_value != self.f_value:
+                    raise ValueError(
+                        f"Aggregator 'fscore' on evaluator '{self.name}' has "
+                        f"f_value={spec.f_value} but the evaluator's f_value="
+                        f"{self.f_value}. The per-evaluator headline and the "
+                        "aggregator would compute different F-beta scores."
+                    )
+        return self
+
 
 class BinaryClassificationEvaluator(
     BaseOutputEvaluator[
diff --git a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
index 3aad5832e..7f2ca2519 100644
--- a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
+++ b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
@@ -25,11 +25,6 @@
 from .base_evaluator import BaseEvaluatorJustification
 
 
-def _coerce_justification(details: object) -> BaseEvaluatorJustification | None:
-    """Extract the BaseEvaluatorJustification from an EvaluationResultDto.details payload."""
-    return BaseEvaluatorJustification.try_from(details)
-
-
 class PerClassMetrics(BaseModel):
     """Per-class confusion counts plus the metric the evaluator computed."""
 
@@ -89,12 +84,14 @@ def _build_confusion(
 
     Results without a parseable justification are counted in ``n_skipped`` and
     omitted from the matrix. Pairs whose expected or actual label isn't in
-    ``classes`` are also skipped. Labels are normalized to lowercase so a
-    classifier returning "Book" vs configured "book" still matches.
+    ``classes`` are also skipped. Labels are normalized to lowercase for the
+    lookup index so a classifier returning "Book" vs configured "book" still
+    matches, but the user-supplied casing is preserved in the returned
+    ``_ConfusionData.classes`` so downstream output (per_class keys, UI labels)
+    shows what the user typed.
     """
-    canonical_classes = [c.lower() for c in classes]
-    index_of = {c: i for i, c in enumerate(canonical_classes)}
-    k = len(canonical_classes)
+    index_of = {c.lower(): i for i, c in enumerate(classes)}
+    k = len(classes)
     matrix = [[0] * k for _ in range(k)]
 
     n_total = len(results)
@@ -102,7 +99,7 @@ def _build_confusion(
     n_skipped = 0
 
     for r in results:
-        j = _coerce_justification(r.details)
+        j = BaseEvaluatorJustification.try_from(r.details)
         if j is None:
             n_skipped += 1
             continue
@@ -115,7 +112,7 @@ def _build_confusion(
         n_scored += 1
 
     return _ConfusionData(
-        classes=canonical_classes,
+        classes=list(classes),
         matrix=matrix,
         n_total=n_total,
         n_scored=n_scored,
diff --git a/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py
index 1fb736f2a..1799323ac 100644
--- a/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py
@@ -9,6 +9,8 @@
 
 from typing import Literal
 
+from pydantic import model_validator
+
 from ..models import (
     AgentExecution,
     EvaluationResult,
@@ -20,13 +22,22 @@
     UiPathEvaluationError,
     UiPathEvaluationErrorCategory,
 )
-from ._aggregator_specs import AggregatorSpec
+from ._aggregator_specs import AggregatorSpec, FScoreAggregatorSpec
 from .base_evaluator import BaseEvaluationCriteria, BaseEvaluatorJustification
 from .output_evaluator import (
     BaseOutputEvaluator,
     OutputEvaluatorConfig,
 )
 
+# Maps the evaluator-level ``metric_type`` strings to the corresponding
+# aggregator-spec ``type`` values. The two spellings differ historically:
+# the evaluator uses "f-score" (hyphen), the aggregator uses "fscore".
+_METRIC_TYPE_TO_AGGREGATOR_TYPE = {
+    "precision": "precision",
+    "recall": "recall",
+    "f-score": "fscore",
+}
+
 
 class MulticlassClassificationEvaluationCriteria(BaseEvaluationCriteria):
     """Per-datapoint criteria: which class this sample should belong to."""
@@ -51,6 +62,61 @@ class MulticlassClassificationEvaluatorConfig(
     # result per aggregator keyed by ``{evaluator_name}.{aggregator.type}``.
     aggregators: list[AggregatorSpec] | None = None
 
+    @model_validator(mode="after")
+    def _validate_aggregators_against_evaluator_config(
+        self,
+    ) -> "MulticlassClassificationEvaluatorConfig":
+        """Reject aggregators that are inconsistent with the evaluator's own config.
+
+        Two checks:
+          * Every evaluator-level class must appear in every aggregator's
+            ``classes`` list (case-insensitive). Otherwise the per-datapoint
+            and aggregator paths score disjoint label spaces.
+          * For each aggregator whose ``type`` matches the evaluator-level
+            ``metric_type`` (mapped via :data:`_METRIC_TYPE_TO_AGGREGATOR_TYPE`),
+            the aggregator's ``averaging`` must match the evaluator's
+            ``averaging``, and for ``fscore`` the ``f_value`` must match too.
+            Otherwise the per-evaluator headline and the dataset evaluator's
+            per-aggregator score diverge silently.
+        """
+        if not self.aggregators:
+            return self
+        evaluator_classes_lower = {c.lower() for c in self.classes}
+        evaluator_aggregator_type = _METRIC_TYPE_TO_AGGREGATOR_TYPE.get(
+            self.metric_type
+        )
+        for spec in self.aggregators:
+            spec_classes_lower = {c.lower() for c in spec.classes}
+            missing = evaluator_classes_lower - spec_classes_lower
+            if missing:
+                raise ValueError(
+                    f"Aggregator '{spec.type}' on evaluator '{self.name}' "
+                    f"declares classes={spec.classes!r} but the evaluator's "
+                    f"classes={self.classes!r} include {sorted(missing)!r} "
+                    "that the aggregator does not. Aggregators must cover "
+                    "the evaluator's full class space."
+                )
+            if spec.type == evaluator_aggregator_type:
+                if spec.averaging != self.averaging:
+                    raise ValueError(
+                        f"Aggregator '{spec.type}' on evaluator '{self.name}' "
+                        f"has averaging={spec.averaging!r} but the evaluator's "
+                        f"averaging={self.averaging!r}. The per-evaluator "
+                        "headline and the aggregator would compute different "
+                        "scores."
+                    )
+                if (
+                    isinstance(spec, FScoreAggregatorSpec)
+                    and spec.f_value != self.f_value
+                ):
+                    raise ValueError(
+                        f"Aggregator 'fscore' on evaluator '{self.name}' has "
+                        f"f_value={spec.f_value} but the evaluator's f_value="
+                        f"{self.f_value}. The per-evaluator headline and the "
+                        "aggregator would compute different F-beta scores."
+                    )
+        return self
+
 
 class MulticlassClassificationEvaluator(
     BaseOutputEvaluator[
@@ -76,7 +142,16 @@ async def evaluate(
         agent_execution: AgentExecution,
         evaluation_criteria: MulticlassClassificationEvaluationCriteria,
     ) -> EvaluationResult:
-        """Evaluate multiclass classification by comparing predicted vs expected class."""
+        """Evaluate multiclass classification by comparing predicted vs expected class.
+
+        Configuration errors (e.g. ``expected_class`` not in the configured
+        ``classes``) raise — that's a setup mistake the user must fix. But a
+        predicted class outside the vocabulary (a sloppy LLM returning
+        "unknown", garbage, or an unconfigured label) returns a 0.0 score with
+        the OOV label preserved in the justification, mirroring the binary
+        evaluator's behavior. The dataset evaluator's confusion matrix
+        accounts for these via ``n_skipped``.
+        """
         predicted_class = str(self._get_actual_output(agent_execution)).lower()
         expected_class = evaluation_criteria.expected_class.lower()
         classes = [c.lower() for c in self.evaluator_config.classes]
@@ -89,14 +164,6 @@ async def evaluate(
                 category=UiPathEvaluationErrorCategory.USER,
             )
 
-        if predicted_class not in classes:
-            raise UiPathEvaluationError(
-                code="INVALID_PREDICTED_CLASS",
-                title="Predicted class not in configured classes",
-                detail=f"Predicted class '{predicted_class}' is not in the configured classes: {classes}",
-                category=UiPathEvaluationErrorCategory.USER,
-            )
-
         score = 1.0 if predicted_class == expected_class else 0.0
 
         justification = self.validate_justification(
diff --git a/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py b/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py
index f2bdfa3cb..d87d9013e 100644
--- a/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py
+++ b/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py
@@ -1,9 +1,12 @@
 """End-to-end tests that run the classification sample projects through evaluate().
 
 These tests double as integration coverage for the binary and multiclass
-classification evaluators added in #1397 — they wire each sample's main.py
-into a stand-in runtime, run the full eval set, and assert the per-row scores
-plus the aggregated metric produced by `reduce_scores`.
+classification evaluators added in #1397 plus the embedded dataset-level
+aggregators added in #1669 — they wire each sample's main.py into a stand-in
+runtime, run the full eval set, and assert the per-row scores AND the
+specific aggregator scores produced by the embedded ``aggregators[]``. A
+regression that returns 0.0 for all aggregators (or one that swaps macro
+for micro silently) fails these tests.
 """
 
 import importlib.util
@@ -178,6 +181,16 @@ async def test_binary_classification_sample_end_to_end():
         "BinarySpamPrecision.recall",
         "BinarySpamPrecision.fscore",
     }
+    # Confusion matrix (predicted x expected, classes=[spam, ham]):
+    #   matrix[spam][spam] = 2  matrix[spam][ham] = 1  (the FP)
+    #   matrix[ham][spam]  = 0  matrix[ham][ham]  = 2
+    # per-class precision: spam = 2/3, ham = 1.0  → macro = (2/3 + 1) / 2 = 5/6
+    # per-class recall:    spam = 1.0, ham = 2/3  → macro = (1 + 2/3) / 2 = 5/6
+    # per-class F1:        spam = 0.8, ham = 0.8  → macro = 0.8
+    agg = output.dataset_evaluator_results
+    assert agg["BinarySpamPrecision.precision"].score == pytest.approx(5 / 6, rel=1e-6)
+    assert agg["BinarySpamPrecision.recall"].score == pytest.approx(5 / 6, rel=1e-6)
+    assert agg["BinarySpamPrecision.fscore"].score == pytest.approx(0.8, rel=1e-6)
 
 
 async def test_multiclass_classification_sample_end_to_end():
diff --git a/packages/uipath/tests/evaluators/test_evaluator_methods.py b/packages/uipath/tests/evaluators/test_evaluator_methods.py
index ec795499d..0083aeec0 100644
--- a/packages/uipath/tests/evaluators/test_evaluator_methods.py
+++ b/packages/uipath/tests/evaluators/test_evaluator_methods.py
@@ -2608,12 +2608,20 @@ async def test_multiclass_classification_invalid_expected_class(self) -> None:
 
     @pytest.mark.asyncio
     async def test_multiclass_classification_invalid_predicted_class(self) -> None:
-        """Test that an invalid predicted class returns an error result."""
+        """Out-of-vocab predicted class returns score=0.0, not an error.
+
+        Mirrors binary classification's soft-fail behavior so a sloppy LLM
+        returning "fish" doesn't crash the whole eval set. The dataset
+        evaluator's confusion matrix counts the OOV prediction under
+        ``n_skipped``. Configuration errors (expected_class outside vocab)
+        still raise; only predicted_class is soft.
+        """
+        from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification
         from uipath.eval.evaluators.multiclass_classification_evaluator import (
             MulticlassClassificationEvaluationCriteria,
             MulticlassClassificationEvaluator,
         )
-        from uipath.eval.models.models import ErrorEvaluationResult
+        from uipath.eval.models import NumericEvaluationResult
 
         execution = AgentExecution(
             agent_input={},
@@ -2630,5 +2638,148 @@ async def test_multiclass_classification_invalid_predicted_class(self) -> None:
         )
         criteria = MulticlassClassificationEvaluationCriteria(expected_class="cat")
         result = await evaluator.evaluate(execution, criteria)
-        assert isinstance(result, ErrorEvaluationResult)
+        assert isinstance(result, NumericEvaluationResult)
         assert result.score == 0.0
+        assert isinstance(result.details, BaseEvaluatorJustification)
+        assert result.details.actual == "fish"
+        assert result.details.expected == "cat"
+
+
+class TestClassificationConfigCrossValidators:
+    """Pydantic validators that catch internally-inconsistent classification configs.
+
+    Without these validators, a config with ``positive_class="yes"`` but an
+    aggregator declaring ``classes=["spam","ham"]`` silently scores against
+    completely disjoint label spaces — the per-evaluator headline and the
+    aggregator's confusion matrix both return numbers, neither one meaningful.
+    """
+
+    def test_binary_aggregator_missing_positive_class_rejected(self) -> None:
+        from uipath.eval.evaluators.binary_classification_evaluator import (
+            BinaryClassificationEvaluator,
+        )
+
+        config = {
+            "name": "SpamPrecision",
+            "positive_class": "spam",
+            "metric_type": "precision",
+            "aggregators": [
+                {
+                    "type": "precision",
+                    # "spam" is intentionally missing
+                    "classes": ["other", "ham"],
+                    "averaging": "macro",
+                }
+            ],
+        }
+        with pytest.raises(Exception) as exc_info:
+            BinaryClassificationEvaluator.model_validate(
+                {"evaluatorConfig": config, "id": str(uuid.uuid4())}
+            )
+        assert "positive_class" in str(exc_info.value)
+
+    def test_binary_aggregator_fvalue_mismatch_rejected(self) -> None:
+        from uipath.eval.evaluators.binary_classification_evaluator import (
+            BinaryClassificationEvaluator,
+        )
+
+        config = {
+            "name": "SpamFScore",
+            "positive_class": "spam",
+            "metric_type": "f-score",
+            "f_value": 1.0,
+            "aggregators": [
+                {
+                    "type": "fscore",
+                    "classes": ["spam", "ham"],
+                    "averaging": "macro",
+                    "f_value": 2.0,  # diverges from evaluator-level 1.0
+                }
+            ],
+        }
+        with pytest.raises(Exception) as exc_info:
+            BinaryClassificationEvaluator.model_validate(
+                {"evaluatorConfig": config, "id": str(uuid.uuid4())}
+            )
+        assert "f_value" in str(exc_info.value)
+
+    def test_multiclass_aggregator_missing_class_rejected(self) -> None:
+        from uipath.eval.evaluators.multiclass_classification_evaluator import (
+            MulticlassClassificationEvaluator,
+        )
+
+        config = {
+            "name": "IntentClassifier",
+            "classes": ["book", "cancel", "reschedule"],
+            "metric_type": "f-score",
+            "averaging": "macro",
+            "aggregators": [
+                {
+                    "type": "fscore",
+                    # "reschedule" is intentionally missing from the aggregator
+                    "classes": ["book", "cancel"],
+                    "averaging": "macro",
+                    "f_value": 1.0,
+                }
+            ],
+        }
+        with pytest.raises(Exception) as exc_info:
+            MulticlassClassificationEvaluator.model_validate(
+                {"evaluatorConfig": config, "id": str(uuid.uuid4())}
+            )
+        assert "reschedule" in str(exc_info.value)
+
+    def test_multiclass_aggregator_averaging_mismatch_rejected(self) -> None:
+        from uipath.eval.evaluators.multiclass_classification_evaluator import (
+            MulticlassClassificationEvaluator,
+        )
+
+        config = {
+            "name": "IntentClassifier",
+            "classes": ["book", "cancel"],
+            "metric_type": "precision",
+            "averaging": "macro",
+            "aggregators": [
+                {
+                    "type": "precision",
+                    "classes": ["book", "cancel"],
+                    "averaging": "micro",  # diverges from evaluator-level macro
+                }
+            ],
+        }
+        with pytest.raises(Exception) as exc_info:
+            MulticlassClassificationEvaluator.model_validate(
+                {"evaluatorConfig": config, "id": str(uuid.uuid4())}
+            )
+        assert "averaging" in str(exc_info.value)
+
+    def test_binary_aggregator_unrelated_type_does_not_cross_check(self) -> None:
+        """An aggregator whose ``type`` differs from the evaluator's ``metric_type``
+        should NOT be cross-checked for f_value / averaging matching — only the
+        positive_class containment rule applies.
+        """
+        from uipath.eval.evaluators.binary_classification_evaluator import (
+            BinaryClassificationEvaluator,
+        )
+
+        config = {
+            "name": "SpamPrecision",
+            "positive_class": "spam",
+            "metric_type": "precision",
+            "f_value": 1.0,
+            # evaluator computes precision; the aggregator below is an fscore
+            # with a different f_value — should be allowed because the
+            # evaluator headline isn't an fscore.
+            "aggregators": [
+                {
+                    "type": "fscore",
+                    "classes": ["spam", "ham"],
+                    "averaging": "macro",
+                    "f_value": 2.0,
+                }
+            ],
+        }
+        evaluator = BinaryClassificationEvaluator.model_validate(
+            {"evaluatorConfig": config, "id": str(uuid.uuid4())}
+        )
+        assert evaluator.evaluator_config.aggregators is not None

From 4d6afccafbdb4adb5af0365af69d6613953ddc31 Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Thu, 18 Jun 2026 23:05:19 -0700
Subject: [PATCH 11/13] fix(eval): address codex P1 + lint failures on dataset
 evaluators
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Bump uipath version 2.11.5 -> 2.11.6 (2.11.5 already on PyPI).
- Widen examples/dataset_evaluators_demo.py:report() to accept the full
  EvaluationResult union and narrow once inside with isinstance, fixing
  6 mypy "expected NumericEvaluationResult" errors at the call sites.
- Address Codex P1 (runtime.py:268 — result-key collision): two
  aggregators of the same type on the same source (e.g. macro+micro
  precision) previously produced identical {source}.{type} keys, with
  the second silently overwriting the first.
  compute_dataset_evaluator_results now counts type occurrences per
  source and disambiguates duplicate-type aggregators as
  {source}.{type}.{averaging} (plus ".fb{f_value}" for fscore variants),
  preserving the simple {source}.{type} key shape whenever a type
  appears only once on a source. Docstring updated; 2 new tests cover
  both the precision-duplicate and fscore-duplicate paths.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../examples/dataset_evaluators_demo.py       |  9 ++-
 packages/uipath/pyproject.toml                |  2 +-
 .../uipath/src/uipath/eval/runtime/runtime.py | 41 ++++++++++--
 .../test_dataset_classification_evaluators.py | 62 +++++++++++++++++++
 packages/uipath/uv.lock                       |  2 +-
 5 files changed, 107 insertions(+), 9 deletions(-)

diff --git a/packages/uipath/examples/dataset_evaluators_demo.py b/packages/uipath/examples/dataset_evaluators_demo.py
index 2d13f3572..1a3c376c0 100644
--- a/packages/uipath/examples/dataset_evaluators_demo.py
+++ b/packages/uipath/examples/dataset_evaluators_demo.py
@@ -26,7 +26,11 @@
     ClassificationDetails,
 )
 from uipath.eval.evaluators.dataset_evaluator_factory import build_dataset_evaluator
-from uipath.eval.models.models import EvaluationResultDto, NumericEvaluationResult
+from uipath.eval.models.models import (
+    EvaluationResult,
+    EvaluationResultDto,
+    NumericEvaluationResult,
+)
 
 # ─── helpers ──────────────────────────────────────────────────────────────────
 
@@ -94,12 +98,13 @@ def print_per_class(details: ClassificationDetails) -> None:
 
 def report(
     title: str,
-    result: NumericEvaluationResult,
+    result: EvaluationResult,
     *,
     show_json_tail: bool = False,
 ) -> None:
     """Render one scenario's result block."""
     print_header(title)
+    assert isinstance(result, NumericEvaluationResult)
     assert isinstance(result.details, ClassificationDetails)
     d = result.details
     print(
diff --git a/packages/uipath/pyproject.toml b/packages/uipath/pyproject.toml
index 0add2e09e..fd088202e 100644
--- a/packages/uipath/pyproject.toml
+++ b/packages/uipath/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "uipath"
-version = "2.11.5"
+version = "2.11.6"
 description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.11"
diff --git a/packages/uipath/src/uipath/eval/runtime/runtime.py b/packages/uipath/src/uipath/eval/runtime/runtime.py
index 987b6c4ae..7167d7f20 100644
--- a/packages/uipath/src/uipath/eval/runtime/runtime.py
+++ b/packages/uipath/src/uipath/eval/runtime/runtime.py
@@ -45,6 +45,7 @@
 from uipath.runtime.schema import UiPathRuntimeSchema
 
 from .._execution_context import ExecutionSpanCollector
+from ..evaluators._aggregator_specs import AggregatorSpec, FScoreAggregatorSpec
 from ..evaluators.base_evaluator import GenericBaseEvaluator
 from ..evaluators.binary_classification_evaluator import (
     BinaryClassificationEvaluatorConfig,
@@ -227,8 +228,13 @@ def compute_dataset_evaluator_results(
 
     Returns:
         Dict mapping ``"{evaluator_name}.{aggregator_type}"`` to the run-level
-        EvaluationResultDto. Aggregators whose source produced no results are
-        still invoked with an empty list so they emit a zeroed result.
+        EvaluationResultDto. When the same aggregator ``type`` appears more
+        than once on a source (e.g. macro+micro precision), each variant is
+        keyed as ``"{evaluator_name}.{type}.{averaging}"``, with an extra
+        ``".fb{f_value}"`` suffix for fscore (e.g. ``".fb2.0"``), so variants
+        that differ in those fields do not overwrite each other. Aggregators
+        whose source produced no results are still invoked with an empty list
+        so they emit a zeroed result.
     """
     results_by_evaluator: defaultdict[str, list[EvaluationResultDto]] = defaultdict(
         list
@@ -261,15 +267,40 @@ def compute_dataset_evaluator_results(
             continue
         source_name = config.name
         source_results = results_by_evaluator.get(source_name, [])
+        # Count occurrences of each aggregator type to detect duplicates
+        # (e.g. macro+micro precision on the same source). The default key
+        # shape ``{source}.{type}`` collides on duplicates; disambiguate with
+        # ``.{averaging}`` (and ``.fb{f_value}`` for fscore variants) only
+        # when more than one aggregator of that type exists, to preserve the
+        # simple key shape in the common case.
+        type_counts: dict[str, int] = defaultdict(int)
+        for spec in config.aggregators:
+            type_counts[spec.type] += 1
         for spec in config.aggregators:
             dataset_evaluator = build_dataset_evaluator(spec, source_name)
-            evaluation_result = dataset_evaluator.evaluate(source_results)
-            dataset_results[dataset_evaluator.name] = (
-                EvaluationResultDto.from_evaluation_result(evaluation_result)
+            key = _dataset_result_key(source_name, spec, type_counts[spec.type] > 1)
+            dataset_results[key] = EvaluationResultDto.from_evaluation_result(
+                dataset_evaluator.evaluate(source_results)
             )
     return dataset_results
 
 
+def _dataset_result_key(
+    source_name: str, spec: AggregatorSpec, disambiguate: bool
+) -> str:
+    """Build the result-dict key for a dataset evaluator.
+
+    Uses ``{source}.{type}`` for unique-type aggregators, and appends
+    ``.{averaging}`` (plus ``.fb{f_value}`` for fscore) when the same type
+    appears more than once on the same source.
+    """
+    if not disambiguate:
+        return f"{source_name}.{spec.type}"
+    if isinstance(spec, FScoreAggregatorSpec):
+        return f"{source_name}.{spec.type}.{spec.averaging}.fb{spec.f_value}"
+    return f"{source_name}.{spec.type}.{spec.averaging}"
+
+
 class UiPathEvalRuntime:
     """Specialized runtime for evaluation runs, with access to the factory."""
 
diff --git a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
index bb7d3538e..e04a13fb0 100644
--- a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
+++ b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
@@ -496,3 +496,65 @@ def test_source_with_no_results_produces_zeroed_report(self) -> None:
         assert dto.score == 0.0
         assert isinstance(dto.details, dict)
         assert dto.details["n_scored"] == 0
+
+    def test_duplicate_aggregator_type_disambiguates_by_averaging(self) -> None:
+        """Two aggregators of the same type get distinct keys (no overwrite)."""
+        evaluator = _multiclass_evaluator(
+            "intent_match",
+            classes=["yes", "no"],
+            aggregators=[
+                PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro"),
+                PrecisionAggregatorSpec(classes=["yes", "no"], averaging="micro"),
+            ],
+        )
+        eval_results = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1",
+                evaluation_run_results=[
+                    UiPathEvalRunResultDto(
+                        evaluator_name="intent_match",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=_result("yes", "yes"),
+                    ),
+                ],
+            ),
+        ]
+        out = compute_dataset_evaluator_results(eval_results, [evaluator])
+        # Same type appears twice → averaging suffix disambiguates so neither
+        # is silently overwritten.
+        assert set(out) == {
+            "intent_match.precision.macro",
+            "intent_match.precision.micro",
+        }
+
+    def test_duplicate_fscore_disambiguates_by_averaging_and_fvalue(self) -> None:
+        """Two FScore aggregators (e.g. F1 macro and F2 macro) both survive."""
+        evaluator = _multiclass_evaluator(
+            "intent_match",
+            classes=["yes", "no"],
+            aggregators=[
+                FScoreAggregatorSpec(
+                    classes=["yes", "no"], averaging="macro", f_value=1.0
+                ),
+                FScoreAggregatorSpec(
+                    classes=["yes", "no"], averaging="macro", f_value=2.0
+                ),
+            ],
+        )
+        eval_results = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1",
+                evaluation_run_results=[
+                    UiPathEvalRunResultDto(
+                        evaluator_name="intent_match",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=_result("yes", "yes"),
+                    ),
+                ],
+            ),
+        ]
+        out = compute_dataset_evaluator_results(eval_results, [evaluator])
+        assert set(out) == {
+            "intent_match.fscore.macro.fb1.0",
+            "intent_match.fscore.macro.fb2.0",
+        }
diff --git a/packages/uipath/uv.lock b/packages/uipath/uv.lock
index 86f8936e1..bd7f1f86e 100644
--- a/packages/uipath/uv.lock
+++ b/packages/uipath/uv.lock
@@ -2552,7 +2552,7 @@ wheels = [
 
 [[package]]
 name = "uipath"
-version = "2.11.5"
+version = "2.11.6"
 source = { editable = "." }
 dependencies = [
     { name = "applicationinsights" },

From 5d782052d4cd1ee22966b5784e7a9f885192ec29 Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Thu, 18 Jun 2026 23:07:31 -0700
Subject: [PATCH 12/13] test(eval): drop fscore-duplicate test that conflicts
 with #1663 H2 validator

The fscore-duplicate disambiguation test added in 4d6afcca conflicts
with the H2 model_validator on #1663, which cross-checks an aggregator's
f_value against the evaluator's f_value when the aggregator type matches
the evaluator's metric_type. The precision-duplicate test still
exercises the new _dataset_result_key path; the FScore branch is
exercised by the factory + math tests.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../test_dataset_classification_evaluators.py | 32 -------------------
 1 file changed, 32 deletions(-)

diff --git a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
index e04a13fb0..69fbfda40 100644
--- a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
+++ b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
@@ -526,35 +526,3 @@ def test_duplicate_aggregator_type_disambiguates_by_averaging(self) -> None:
             "intent_match.precision.macro",
             "intent_match.precision.micro",
         }
-
-    def test_duplicate_fscore_disambiguates_by_averaging_and_fvalue(self) -> None:
-        """Two FScore aggregators (e.g. F1 macro and F2 macro) both survive."""
-        evaluator = _multiclass_evaluator(
-            "intent_match",
-            classes=["yes", "no"],
-            aggregators=[
-                FScoreAggregatorSpec(
-                    classes=["yes", "no"], averaging="macro", f_value=1.0
-                ),
-                FScoreAggregatorSpec(
-                    classes=["yes", "no"], averaging="macro", f_value=2.0
-                ),
-            ],
-        )
-        eval_results = [
-            UiPathEvalRunResult(
-                evaluation_name="dp1",
-                evaluation_run_results=[
-                    UiPathEvalRunResultDto(
-                        evaluator_name="intent_match",
-                        evaluator_id=str(uuid.uuid4()),
-                        result=_result("yes", "yes"),
-                    ),
-                ],
-            ),
-        ]
-        out = compute_dataset_evaluator_results(eval_results, [evaluator])
-        assert set(out) == {
-            "intent_match.fscore.macro.fb1.0",
-            "intent_match.fscore.macro.fb2.0",
-        }

From 363855d4f2b86321ee933c6b1e382364257a6c44 Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Thu, 18 Jun 2026 23:13:25 -0700
Subject: [PATCH 13/13] fix(eval): publish aggregators in classification
 evaluator type schemas

Regenerate BinaryClassificationEvaluator.json and
MulticlassClassificationEvaluator.json from the updated pydantic models
so schema-driven consumers can discover and validate the new
evaluatorConfig.aggregators array + Precision/Recall/FScore variants.
The regenerated schemas also widen target_output_key from a string to
a string or a list of strings.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../BinaryClassificationEvaluator.json        | 154 +++++++++++++++++-
 .../MulticlassClassificationEvaluator.json    | 154 +++++++++++++++++-
 2 files changed, 302 insertions(+), 6 deletions(-)

diff --git a/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json b/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json
index 9f7351865..a15ac8e5a 100644
--- a/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json
+++ b/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json
@@ -15,6 +15,111 @@
         ],
         "title": "BinaryClassificationEvaluationCriteria",
         "type": "object"
+      },
+      "FScoreAggregatorSpec": {
+        "description": "Run-level F-beta aggregator (multiclass, micro or macro averaged).",
+        "properties": {
+          "type": {
+            "const": "fscore",
+            "default": "fscore",
+            "title": "Type",
+            "type": "string"
+          },
+          "classes": {
+            "items": {
+              "type": "string"
+            },
+            "minItems": 1,
+            "title": "Classes",
+            "type": "array"
+          },
+          "averaging": {
+            "enum": [
+              "macro",
+              "micro"
+            ],
+            "title": "Averaging",
+            "type": "string"
+          },
+          "f_value": {
+            "default": 1.0,
+            "exclusiveMinimum": 0,
+            "title": "F Value",
+            "type": "number"
+          }
+        },
+        "required": [
+          "classes",
+          "averaging"
+        ],
+        "title": "FScoreAggregatorSpec",
+        "type": "object"
+      },
+      "PrecisionAggregatorSpec": {
+        "description": "Run-level precision aggregator (multiclass, micro or macro averaged).",
+        "properties": {
+          "type": {
+            "const": "precision",
+            "default": "precision",
+            "title": "Type",
+            "type": "string"
+          },
+          "classes": {
+            "items": {
+              "type": "string"
+            },
+            "minItems": 1,
+            "title": "Classes",
+            "type": "array"
+          },
+          "averaging": {
+            "enum": [
+              "macro",
+              "micro"
+            ],
+            "title": "Averaging",
+            "type": "string"
+          }
+        },
+        "required": [
+          "classes",
+          "averaging"
+        ],
+        "title": "PrecisionAggregatorSpec",
+        "type": "object"
+      },
+      "RecallAggregatorSpec": {
+        "description": "Run-level recall aggregator (multiclass, micro or macro averaged).",
+        "properties": {
+          "type": {
+            "const": "recall",
+            "default": "recall",
+            "title": "Type",
+            "type": "string"
+          },
+          "classes": {
+            "items": {
+              "type": "string"
+            },
+            "minItems": 1,
+            "title": "Classes",
+            "type": "array"
+          },
+          "averaging": {
+            "enum": [
+              "macro",
+              "micro"
+            ],
+            "title": "Averaging",
+            "type": "string"
+          }
+        },
+        "required": [
+          "classes",
+          "averaging"
+        ],
+        "title": "RecallAggregatorSpec",
+        "type": "object"
       }
     },
     "description": "Configuration for the binary classification evaluator.",
@@ -42,10 +147,20 @@
         "default": null
       },
       "target_output_key": {
+        "anyOf": [
+          {
+            "type": "string"
+          },
+          {
+            "items": {
+              "type": "string"
+            },
+            "type": "array"
+          }
+        ],
         "default": "*",
-        "description": "Key to extract output from agent execution",
-        "title": "Target Output Key",
-        "type": "string"
+        "description": "Key or list of keys to extract output from agent execution",
+        "title": "Target Output Key"
       },
       "line_by_line_evaluator": {
         "default": false,
@@ -77,6 +192,39 @@
         "default": 1.0,
         "title": "F Value",
         "type": "number"
+      },
+      "aggregators": {
+        "anyOf": [
+          {
+            "items": {
+              "discriminator": {
+                "mapping": {
+                  "fscore": "#/$defs/FScoreAggregatorSpec",
+                  "precision": "#/$defs/PrecisionAggregatorSpec",
+                  "recall": "#/$defs/RecallAggregatorSpec"
+                },
+                "propertyName": "type"
+              },
+              "oneOf": [
+                {
+                  "$ref": "#/$defs/PrecisionAggregatorSpec"
+                },
+                {
+                  "$ref": "#/$defs/RecallAggregatorSpec"
+                },
+                {
+                  "$ref": "#/$defs/FScoreAggregatorSpec"
+                }
+              ]
+            },
+            "type": "array"
+          },
+          {
+            "type": "null"
+          }
+        ],
+        "default": null,
+        "title": "Aggregators"
       }
     },
     "required": [
diff --git a/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json b/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json
index 72262ba92..8cc971f75 100644
--- a/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json
+++ b/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json
@@ -2,6 +2,45 @@
   "evaluatorTypeId": "uipath-multiclass-classification",
   "evaluatorConfigSchema": {
     "$defs": {
+      "FScoreAggregatorSpec": {
+        "description": "Run-level F-beta aggregator (multiclass, micro or macro averaged).",
+        "properties": {
+          "type": {
+            "const": "fscore",
+            "default": "fscore",
+            "title": "Type",
+            "type": "string"
+          },
+          "classes": {
+            "items": {
+              "type": "string"
+            },
+            "minItems": 1,
+            "title": "Classes",
+            "type": "array"
+          },
+          "averaging": {
+            "enum": [
+              "macro",
+              "micro"
+            ],
+            "title": "Averaging",
+            "type": "string"
+          },
+          "f_value": {
+            "default": 1.0,
+            "exclusiveMinimum": 0,
+            "title": "F Value",
+            "type": "number"
+          }
+        },
+        "required": [
+          "classes",
+          "averaging"
+        ],
+        "title": "FScoreAggregatorSpec",
+        "type": "object"
+      },
       "MulticlassClassificationEvaluationCriteria": {
         "description": "Per-datapoint criteria: which class this sample should belong to.",
         "properties": {
@@ -15,6 +54,72 @@
         ],
         "title": "MulticlassClassificationEvaluationCriteria",
         "type": "object"
+      },
+      "PrecisionAggregatorSpec": {
+        "description": "Run-level precision aggregator (multiclass, micro or macro averaged).",
+        "properties": {
+          "type": {
+            "const": "precision",
+            "default": "precision",
+            "title": "Type",
+            "type": "string"
+          },
+          "classes": {
+            "items": {
+              "type": "string"
+            },
+            "minItems": 1,
+            "title": "Classes",
+            "type": "array"
+          },
+          "averaging": {
+            "enum": [
+              "macro",
+              "micro"
+            ],
+            "title": "Averaging",
+            "type": "string"
+          }
+        },
+        "required": [
+          "classes",
+          "averaging"
+        ],
+        "title": "PrecisionAggregatorSpec",
+        "type": "object"
+      },
+      "RecallAggregatorSpec": {
+        "description": "Run-level recall aggregator (multiclass, micro or macro averaged).",
+        "properties": {
+          "type": {
+            "const": "recall",
+            "default": "recall",
+            "title": "Type",
+            "type": "string"
+          },
+          "classes": {
+            "items": {
+              "type": "string"
+            },
+            "minItems": 1,
+            "title": "Classes",
+            "type": "array"
+          },
+          "averaging": {
+            "enum": [
+              "macro",
+              "micro"
+            ],
+            "title": "Averaging",
+            "type": "string"
+          }
+        },
+        "required": [
+          "classes",
+          "averaging"
+        ],
+        "title": "RecallAggregatorSpec",
+        "type": "object"
       }
     },
     "description": "Configuration for the multiclass classification evaluator.",
@@ -42,10 +147,20 @@
         "default": null
       },
       "target_output_key": {
+        "anyOf": [
+          {
+            "type": "string"
+          },
+          {
+            "items": {
+              "type": "string"
+            },
+            "type": "array"
+          }
+        ],
         "default": "*",
-        "description": "Key to extract output from agent execution",
-        "title": "Target Output Key",
-        "type": "string"
+        "description": "Key or list of keys to extract output from agent execution",
+        "title": "Target Output Key"
       },
       "line_by_line_evaluator": {
         "default": false,
@@ -89,6 +204,39 @@
         "default": 1.0,
         "title": "F Value",
         "type": "number"
+      },
+      "aggregators": {
+        "anyOf": [
+          {
+            "items": {
+              "discriminator": {
+                "mapping": {
+                  "fscore": "#/$defs/FScoreAggregatorSpec",
+                  "precision": "#/$defs/PrecisionAggregatorSpec",
+                  "recall": "#/$defs/RecallAggregatorSpec"
+                },
+                "propertyName": "type"
+              },
+              "oneOf": [
+                {
+                  "$ref": "#/$defs/PrecisionAggregatorSpec"
+                },
+                {
+                  "$ref": "#/$defs/RecallAggregatorSpec"
+                },
+                {
+                  "$ref": "#/$defs/FScoreAggregatorSpec"
+                }
+              ]
+            },
+            "type": "array"
+          },
+          {
+            "type": "null"
+          }
+        ],
+        "default": null,
+        "title": "Aggregators"
       }
     },
     "required": [