diff --git a/packages/uipath/examples/dataset_evaluators_demo.py b/packages/uipath/examples/dataset_evaluators_demo.py
new file mode 100644
index 000000000..bc88ec94a
--- /dev/null
+++ b/packages/uipath/examples/dataset_evaluators_demo.py
@@ -0,0 +1,237 @@
+"""Runnable proof that the dataset-level evaluators work on realistic data.
+
+Five scenarios exercise the framework end-to-end at the SDK layer (no
+worker, no backend). Each prints the headline score plus a confusion
+matrix table, so the math is inspectable rather than a passing-test
+binary signal.
+
+Run::
+
+    cd packages/uipath
+    uv run python examples/dataset_evaluators_demo.py
+"""
+
+from __future__ import annotations
+
+from typing import Iterable
+
+from uipath.eval.evaluators._aggregator_specs import (
+    FScoreAggregatorSpec,
+    PrecisionAggregatorSpec,
+    RecallAggregatorSpec,
+)
+from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification
+from uipath.eval.evaluators.classification_dataset_evaluators import (
+    ClassificationDetails,
+)
+from uipath.eval.evaluators.dataset_evaluator_factory import build_dataset_evaluator
+from uipath.eval.models.models import (
+    EvaluationResult,
+    EvaluationResultDto,
+    NumericEvaluationResult,
+)
+
+# ─── helpers ──────────────────────────────────────────────────────────────────
+
+
+def make_result(expected: str, actual: str) -> EvaluationResultDto:
+    """Simulate what an upstream classification evaluator emits for one datapoint.
+
+    The score is 1.0 when the two labels are equal ignoring case and 0.0
+    otherwise. The labels themselves are carried, unmodified, in ``details``
+    as a dumped ``BaseEvaluatorJustification``.
+    """
+    labels_agree = expected.lower() == actual.lower()
+    payload = BaseEvaluatorJustification(expected=expected, actual=actual).model_dump()
+    return EvaluationResultDto(score=1.0 if labels_agree else 0.0, details=payload)
+
+
+def materialize_pairs(pairs: Iterable[tuple[str, str]]) -> list[EvaluationResultDto]:
+    """Convert (expected, actual) label pairs into per-datapoint results, in order."""
+    return [make_result(want, got) for want, got in pairs]
+
+
+def print_header(title: str) -> None:
+    """Emit a blank line, then ``title`` framed by two 78-character rules."""
+    rule = "═" * 78
+    # One print call with newline separators yields the same four output
+    # lines as printing each piece on its own.
+    print("", rule, f" {title}", rule, sep="\n")
+
+
+def report(
+    title: str,
+    result: EvaluationResult,
+    *,
+    show_json_tail: bool = False,  # accepted for call-site compat; has no effect
+) -> None:
+    """Print a scenario banner, the headline score, and the details as JSON."""
+    del show_json_tail  # intentionally unused: the payload is always emitted
+    print_header(title)
+    assert isinstance(result, NumericEvaluationResult)
+    assert isinstance(result.details, ClassificationDetails)
+    print(f"  headline score = {result.score:.4f}")
+    print(result.details.model_dump_json(indent=2, by_alias=True))
+
+
+# ─── scenarios ────────────────────────────────────────────────────────────────
+
+
+def scenario_1_balanced_three_class() -> None:
+    """Run macro precision over a symmetric three-intent dataset.
+
+    Every intent is predicted right twice and confused once with the next one.
+    """
+    intents = ["book", "cancel", "reschedule"]
+    # (expected, actual) rows grouped by expected intent: two hits, then one
+    # miss that lands on the following intent (wrapping around at the end).
+    pairs = [
+        (intent, predicted)
+        for position, intent in enumerate(intents)
+        for predicted in (intent, intent, intents[(position + 1) % len(intents)])
+    ]
+    evaluator = build_dataset_evaluator(
+        PrecisionAggregatorSpec(classes=intents, averaging="macro"),
+        source_evaluator="intent_match",
+    )
+    outcome = evaluator.evaluate(materialize_pairs(pairs))
+    title = (
+        "Scenario 1 — Balanced 3-class (intent recognition)\n"
+        "  Each class: 2 TP, 1 FP, 1 FN. Symmetric setup → macro = micro = 2/3."
+    )
+    report(title, outcome, show_json_tail=True)
+
+
+def scenario_2_imbalanced_two_class() -> None:
+    """Contrast macro and micro precision when the positive class is rare."""
+    pairs = (
+        [("negative", "negative")] * 13
+        + [("negative", "positive")] * 3
+        + [("positive", "positive")] * 2
+        + [("positive", "negative")] * 2
+    )
+    classes = ["positive", "negative"]
+    results = materialize_pairs(pairs)
+    # Same data and class list for both evaluators; only the averaging differs.
+    macro_precision = build_dataset_evaluator(
+        PrecisionAggregatorSpec(classes=classes, averaging="macro"),
+        source_evaluator="positive_match",
+    )
+    micro_precision = build_dataset_evaluator(
+        PrecisionAggregatorSpec(classes=classes, averaging="micro"),
+        source_evaluator="positive_match",
+    )
+    report(
+        "Scenario 2a — Imbalanced 2-class, MACRO precision\n"
+        "  Rare positive class. Macro averages per-class, so the rare class\n"
+        "  having precision = 2/(2+3) = 0.40 drags the score down.",
+        macro_precision.evaluate(results),
+    )
+    report(
+        "Scenario 2b — Same data, MICRO precision\n"
+        "  Pools TP/FP across classes. In a 2-class case this equals accuracy.",
+        micro_precision.evaluate(results),
+    )
+
+
+def scenario_3_precision_vs_recall_vs_f() -> None:
+    """Score one asymmetric yes/no dataset with precision, recall, F1 and F2."""
+    pairs = [
+        ("yes", "yes"),
+        ("yes", "yes"),
+        ("no", "yes"),
+        ("no", "yes"),
+        ("no", "no"),
+        ("no", "no"),
+        ("yes", "no"),  # the one missed 'yes', so 'yes' recall is 2/3 rather than 1.0
+    ]
+    results = materialize_pairs(pairs)
+    classes = ["yes", "no"]
+    # Titles double as dict keys; insertion order is the order of the printout.
+    evaluators = {
+        "Scenario 3a — Precision on a recall-favourable dataset": build_dataset_evaluator(
+            PrecisionAggregatorSpec(classes=classes, averaging="macro"),
+            source_evaluator="yes_match",
+        ),
+        "Scenario 3b — Recall (same data — note 'yes' recall is 2/3)": build_dataset_evaluator(
+            RecallAggregatorSpec(classes=classes, averaging="macro"),
+            source_evaluator="yes_match",
+        ),
+        "Scenario 3c — F1 (harmonic mean of P and R)": build_dataset_evaluator(
+            FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=1.0),
+            source_evaluator="yes_match",
+        ),
+        "Scenario 3d — F2 (β=2 weighs recall higher — score moves toward recall)": build_dataset_evaluator(
+            FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=2.0),
+            source_evaluator="yes_match",
+        ),
+    }
+    for title, evaluator in evaluators.items():
+        report(title, evaluator.evaluate(results))
+
+
+def scenario_4_skipped_datapoints() -> None:
+    """Feed out-of-vocabulary and malformed results to a cat/dog precision run."""
+    # Four labelled pairs come first; the last two each mention a label
+    # ('platypus', 'zebra') that is not in the evaluator's class list.
+    results = materialize_pairs(
+        [("cat", "cat"), ("dog", "dog"), ("cat", "platypus"), ("zebra", "cat")]
+    )
+    # Then two results whose ``details`` are not a justification payload.
+    results += [
+        EvaluationResultDto(score=1.0, details="bare string — no justification"),
+        EvaluationResultDto(score=0.0, details={"unrelated": "shape"}),
+    ]
+    spec = PrecisionAggregatorSpec(classes=["cat", "dog"], averaging="macro")
+    evaluator = build_dataset_evaluator(spec, source_evaluator="any_match")
+    report(
+        "Scenario 4 — Skipped datapoints (out-of-vocab + malformed details)\n"
+        "  6 datapoints in, 2 scored, 4 skipped. Skip counts surface in the\n"
+        "  report so you can tell whether a low score is a real signal or\n"
+        "  just sparse data.",
+        evaluator.evaluate(results),
+    )
+
+
+def scenario_5_realistic_intent_classifier() -> None:
+    """Report macro F1 on a 25-datapoint, four-intent dataset with uneven accuracy."""
+    # (expected, actual) rows grouped by expected intent. Repeated rows are
+    # written as list multiplication; singletons are listed explicitly.
+    pairs = (
+        [("book", "book")] * 10
+        + [("book", "cancel")]
+        + [("cancel", "cancel")] * 6
+        + [("cancel", "book"), ("cancel", "modify")]
+        + [("reschedule", "reschedule")] * 2
+        + [("reschedule", "modify")] * 2
+        + [("modify", "modify"), ("modify", "reschedule")]
+    )
+    results = materialize_pairs(pairs)
+    classes = ["book", "cancel", "reschedule", "modify"]
+    # f_value=1.0 selects plain F1; macro averaging weights the four
+    # intents equally regardless of how many rows each one has.
+    macro_f1 = build_dataset_evaluator(
+        FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=1.0),
+        source_evaluator="intent_match",
+    )
+    report(
+        "Scenario 5 — Realistic 4-class intent classifier\n"
+        "  Uneven per-class performance. Macro F1 surfaces 'reschedule' and\n"
+        "  'modify' weakness; micro F1 would have hidden it under 'book' wins.",
+        macro_f1.evaluate(results),
+    )
+
+
+def main() -> None:
+    """Execute scenarios 1 through 5 in order and print a closing message."""
+    scenario_1_balanced_three_class()
+    scenario_2_imbalanced_two_class()
+    scenario_3_precision_vs_recall_vs_f()
+    scenario_4_skipped_datapoints()
+    scenario_5_realistic_intent_classifier()
+    # Blank separator line followed by the sign-off, emitted in a single call.
+    print("", "Done. All scenarios computed from real evaluator code.", sep="\n")
+
+
+if __name__ == "__main__":  # run the demo only when executed as a script
+    main()
diff --git a/packages/uipath/pyproject.toml b/packages/uipath/pyproject.toml
index 0add2e09e..fd088202e 100644
--- a/packages/uipath/pyproject.toml
+++ b/packages/uipath/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "uipath"
-version = "2.11.5"
+version = "2.11.6"
 description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.11"
diff --git a/packages/uipath/samples/binary_classification_agent/bindings.json b/packages/uipath/samples/binary_classification_agent/bindings.json
new file mode 100644
index 000000000..5e9beeb01
--- /dev/null
+++ b/packages/uipath/samples/binary_classification_agent/bindings.json
@@ -0,0 +1,4 @@
+{
+  "version": "2.0",
+  "resources": []
+}
diff --git a/packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json b/packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json
new file mode 100644
index 000000000..f47cd25b8
--- /dev/null
+++ b/packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json
@@ -0,0 +1,63 @@
+{
+  "version": "1.0",
+  "id": "SpamBinaryEval",
+  "name": "Binary spam classifier — precision",
+  "evaluatorRefs": ["BinarySpamPrecision"],
+  "evaluations": [
+    {
+      "id": "spam-prize",
+      "name": "Spam: prize giveaway",
+      "inputs": {
+        "email_subject": "You won a FREE iPhone!!!",
+        "email_body": "Congratulations! Click here to claim your prize now."
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "spam" }
+      }
+    },
+    {
+      "id": "spam-promo",
+      "name": "Spam: unsolicited promo",
+      "inputs": {
+        "email_subject": "Winner of the monthly drawing",
+        "email_body": "You've been selected. Click here to redeem."
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "spam" }
+      }
+    },
+    {
+      "id": "ham-invoice",
+      "name": "Ham: legitimate invoice",
+      "inputs": {
+        "email_subject": "Your March invoice is ready",
+        "email_body": "Your monthly invoice of $45.99 is attached. Payment is due March 15."
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "ham" }
+      }
+    },
+    {
+      "id": "ham-meeting",
+      "name": "Ham: meeting request",
+      "inputs": {
+        "email_subject": "Sync on Q2 planning",
+        "email_body": "Can we meet Wednesday at 2pm to align on next quarter's roadmap?"
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "ham" }
+      }
+    },
+    {
+      "id": "ham-mislabeled",
+      "name": "Ham mislabeled as spam (forces a false positive)",
+      "inputs": {
+        "email_subject": "Free coffee in the break room!!!",
+        "email_body": "Just a heads up — the new espresso machine is set up."
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "ham" }
+      }
+    }
+  ]
+}
diff --git a/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json b/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json
new file mode 100644
index 000000000..d2cc64b71
--- /dev/null
+++ b/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json
@@ -0,0 +1,34 @@
+{
+  "version": "1.0",
+  "id": "BinarySpamPrecision",
+  "description": "Precision on the 'spam' positive class, plus run-level aggregators",
+  "evaluatorTypeId": "uipath-binary-classification",
+  "evaluatorConfig": {
+    "name": "BinarySpamPrecision",
+    "targetOutputKey": "category",
+    "positiveClass": "spam",
+    "metricType": "precision",
+    "fValue": 1.0,
+    "defaultEvaluationCriteria": {
+      "expectedClass": "ham"
+    },
+    "aggregators": [
+      {
+        "type": "precision",
+        "classes": ["spam", "ham"],
+        "averaging": "macro"
+      },
+      {
+        "type": "recall",
+        "classes": ["spam", "ham"],
+        "averaging": "macro"
+      },
+      {
+        "type": "fscore",
+        "classes": ["spam", "ham"],
+        "averaging": "macro",
+        "fValue": 1.0
+      }
+    ]
+  }
+}
diff --git a/packages/uipath/samples/binary_classification_agent/main.py b/packages/uipath/samples/binary_classification_agent/main.py
new file mode 100644
index 000000000..1df5dea15
--- /dev/null
+++ b/packages/uipath/samples/binary_classification_agent/main.py
@@ -0,0 +1,39 @@
+"""Rule-based spam/ham classifier demonstrating the binary classification evaluator."""
+
+from dataclasses import dataclass
+
+from uipath.tracing import traced
+
+SPAMMY_TOKENS = {  # lowercase substrings; classify_email flags any email containing one
+    "free",
+    "winner",
+    "congratulations",
+    "click here",
+    "prize",
+    "!!!",
+}
+
+
+@dataclass
+class EmailInput:  # input payload accepted by ``main``
+    email_subject: str  # subject line of the email to classify
+    email_body: str  # body text of the email to classify
+
+
+@dataclass
+class Classification:  # output payload returned by ``main``
+    category: str  # label produced by ``classify_email``: 'spam' or 'ham'
+
+
+@traced(name="classify_email", span_type="tool")
+def classify_email(subject: str, body: str) -> str:
+    """Return 'spam' if the lowercased email text holds any spam token, else 'ham'."""
+    lowered = f"{subject} {body}".lower()
+    return "ham" if not any(tok in lowered for tok in SPAMMY_TOKENS) else "spam"
+
+
+@traced()
+async def main(input: EmailInput) -> Classification:
+    """Run the rule-based classifier on one email and wrap the resulting label."""
+    label = classify_email(input.email_subject, input.email_body)
+    return Classification(category=label)
diff --git a/packages/uipath/samples/binary_classification_agent/pyproject.toml b/packages/uipath/samples/binary_classification_agent/pyproject.toml
new file mode 100644
index 000000000..7d81d251a
--- /dev/null
+++ b/packages/uipath/samples/binary_classification_agent/pyproject.toml
@@ -0,0 +1,9 @@
+[project]
+name = "binary-classification-agent"
+version = "0.0.1"
+description = "Rule-based spam/ham classifier demonstrating the binary classification evaluator"
+requires-python = ">=3.11"
+dependencies = ["uipath"]
+
+[dependency-groups]
+dev = ["uipath-dev"]
diff --git a/packages/uipath/samples/binary_classification_agent/uipath.json b/packages/uipath/samples/binary_classification_agent/uipath.json
new file mode 100644
index 000000000..9b02c2654
--- /dev/null
+++ b/packages/uipath/samples/binary_classification_agent/uipath.json
@@ -0,0 +1,5 @@
+{
+  "functions": {
+    "main": "main.py:main"
+  }
+}
diff --git a/packages/uipath/samples/multiclass_classification_simple/bindings.json b/packages/uipath/samples/multiclass_classification_simple/bindings.json
new file mode 100644
index 000000000..5e9beeb01
--- /dev/null
+++ b/packages/uipath/samples/multiclass_classification_simple/bindings.json
@@ -0,0 +1,4 @@
+{
+  "version": "2.0",
+  "resources": []
+}
diff --git a/packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json b/packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json
new file mode 100644
index 000000000..27e66c25d
--- /dev/null
+++ b/packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json
@@ -0,0 +1,85 @@
+{
+  "version": "1.0",
+  "id": "EmailMulticlassEval",
+  "name": "3-class email router — macro F1",
+  "evaluatorRefs": ["EmailMulticlassFScore"],
+  "evaluations": [
+    {
+      "id": "pay-invoice",
+      "name": "Payments: invoice reminder",
+      "inputs": {
+        "email_subject": "Your March invoice is ready",
+        "email_body": "Your monthly invoice of $45.99 is now available. Payment is due March 15."
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "payments" }
+      }
+    },
+    {
+      "id": "pay-refund",
+      "name": "Payments: refund request",
+      "inputs": {
+        "email_subject": "Refund for last month's charge",
+        "email_body": "I was charged twice for the same service. Please process a refund."
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "payments" }
+      }
+    },
+    {
+      "id": "support-broken",
+      "name": "Support: feature broken",
+      "inputs": {
+        "email_subject": "Login is broken",
+        "email_body": "I'm getting an error when trying to sign in. Need help."
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "support" }
+      }
+    },
+    {
+      "id": "support-question",
+      "name": "Support: how-to question",
+      "inputs": {
+        "email_subject": "How do I export my data?",
+        "email_body": "Can you help me figure out where the export button is?"
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "support" }
+      }
+    },
+    {
+      "id": "spam-prize",
+      "name": "Spam: prize giveaway",
+      "inputs": {
+        "email_subject": "You won a FREE iPhone!!!",
+        "email_body": "Congratulations! Click here to claim your prize."
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "spam" }
+      }
+    },
+    {
+      "id": "spam-promo",
+      "name": "Spam: marketing winner",
+      "inputs": {
+        "email_subject": "Winner of the monthly drawing",
+        "email_body": "Congratulations, click here to redeem your reward."
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "spam" }
+      }
+    },
+    {
+      "id": "support-misrouted-by-payment-word",
+      "name": "Support email accidentally routed to payments (forces an FP for payments)",
+      "inputs": {
+        "email_subject": "Question about my billing portal access",
+        "email_body": "I cannot log into the billing portal. The page just spins. Can you help?"
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "support" }
+      }
+    }
+  ]
+}
diff --git a/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json b/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json
new file mode 100644
index 000000000..871afbc21
--- /dev/null
+++ b/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json
@@ -0,0 +1,35 @@
+{
+  "version": "1.0",
+  "id": "EmailMulticlassFScore",
+  "description": "Macro-averaged F1 across payments / support / spam, plus run-level aggregators",
+  "evaluatorTypeId": "uipath-multiclass-classification",
+  "evaluatorConfig": {
+    "name": "EmailMulticlassFScore",
+    "targetOutputKey": "category",
+    "classes": ["payments", "support", "spam"],
+    "metricType": "f-score",
+    "averaging": "macro",
+    "fValue": 1.0,
+    "defaultEvaluationCriteria": {
+      "expectedClass": "support"
+    },
+    "aggregators": [
+      {
+        "type": "precision",
+        "classes": ["payments", "support", "spam"],
+        "averaging": "macro"
+      },
+      {
+        "type": "recall",
+        "classes": ["payments", "support", "spam"],
+        "averaging": "macro"
+      },
+      {
+        "type": "fscore",
+        "classes": ["payments", "support", "spam"],
+        "averaging": "macro",
+        "fValue": 1.0
+      }
+    ]
+  }
+}
diff --git a/packages/uipath/samples/multiclass_classification_simple/main.py b/packages/uipath/samples/multiclass_classification_simple/main.py
new file mode 100644
index 000000000..3ab684298
--- /dev/null
+++ b/packages/uipath/samples/multiclass_classification_simple/main.py
@@ -0,0 +1,51 @@
+"""Rule-based 3-class email router demonstrating the multiclass classification evaluator."""
+
+from dataclasses import dataclass
+
+from uipath.tracing import traced
+
+SPAM_TOKENS = {"free", "winner", "congratulations", "click here", "prize", "!!!"}
+PAYMENT_TOKENS = {"invoice", "payment", "refund", "charge", "billing", "$"}
+SUPPORT_TOKENS = {  # NOTE(review): never read by classify_email; 'support' is its fallback
+    "help",
+    "support",
+    "issue",
+    "error",
+    "ticket",
+    "broken",
+    "not working",
+}
+
+
+@dataclass
+class EmailInput:  # input payload accepted by ``main``
+    email_subject: str  # subject line of the email to route
+    email_body: str  # body text of the email to route
+
+
+@dataclass
+class Classification:  # output payload returned by ``main``
+    category: str  # label from ``classify_email``: 'spam', 'payments' or 'support'
+
+
+@traced(name="classify_email", span_type="tool")
+def classify_email(subject: str, body: str) -> str:
+    """Route an email to 'spam', 'payments', or 'support' by first matching rule.
+
+    The lowercased subject and body are scanned for spam tokens first, so a
+    promo that also mentions billing words is still spam, and for payment
+    tokens second. Anything that matches neither falls through to 'support'.
+    """
+    text = f"{subject} {body}".lower()
+    # Ordered by priority: the first token set with a hit decides the label.
+    for label, tokens in (("spam", SPAM_TOKENS), ("payments", PAYMENT_TOKENS)):
+        if any(token in text for token in tokens):
+            return label
+    return "support"
+
+
+@traced()
+async def main(input: EmailInput) -> Classification:
+    """Run the priority-rule router on one email and wrap the chosen queue."""
+    queue = classify_email(input.email_subject, input.email_body)
+    return Classification(category=queue)
diff --git a/packages/uipath/samples/multiclass_classification_simple/pyproject.toml b/packages/uipath/samples/multiclass_classification_simple/pyproject.toml
new file mode 100644
index 000000000..e803a2a43
--- /dev/null
+++ b/packages/uipath/samples/multiclass_classification_simple/pyproject.toml
@@ -0,0 +1,9 @@
+[project]
+name = "multiclass-classification-simple"
+version = "0.0.1"
+description = "Rule-based 3-class email router demonstrating the multiclass classification evaluator with macro-averaged F1"
+requires-python = ">=3.11"
+dependencies = ["uipath"]
+
+[dependency-groups]
+dev = ["uipath-dev"]
diff --git a/packages/uipath/samples/multiclass_classification_simple/uipath.json b/packages/uipath/samples/multiclass_classification_simple/uipath.json
new file mode 100644
index 000000000..9b02c2654
--- /dev/null
+++ b/packages/uipath/samples/multiclass_classification_simple/uipath.json
@@ -0,0 +1,5 @@
+{
+  "functions": {
+    "main": "main.py:main"
+  }
+}
diff --git a/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py b/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py
new file mode 100644
index 000000000..6c0b2b880
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py
@@ -0,0 +1,53 @@
+"""Aggregator specs embedded in per-datapoint classification evaluator configs.
+
+Each aggregator is a self-contained run-level metric (precision / recall /
+f-score) attached to a classification evaluator. Specs do not share any
+properties — each variant declares its own ``classes``, ``averaging``, and
+(for fscore) ``f_value`` independently. This keeps each aggregator's contract
+explicit at the JSON level: nothing is hoisted up to the evaluator and silently
+applied to siblings.
+"""
+
+from __future__ import annotations
+
+from typing import Annotated, Literal, Union
+
+from pydantic import BaseModel, ConfigDict, Field
+from pydantic.alias_generators import to_camel
+
+
+class _AggregatorSpecBase(BaseModel):
+    """Shared pydantic config: camelCase aliases, with field names also accepted."""
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+
+class PrecisionAggregatorSpec(_AggregatorSpecBase):
+    """Run-level precision aggregator (multiclass, micro or macro averaged)."""
+
+    type: Literal["precision"] = "precision"  # discriminator value for AggregatorSpec
+    classes: list[str] = Field(..., min_length=1)  # required; at least one label
+    averaging: Literal["macro", "micro"]  # required; no default
+
+
+class RecallAggregatorSpec(_AggregatorSpecBase):
+    """Run-level recall aggregator (multiclass, micro or macro averaged)."""
+
+    type: Literal["recall"] = "recall"  # discriminator value for AggregatorSpec
+    classes: list[str] = Field(..., min_length=1)  # required; at least one label
+    averaging: Literal["macro", "micro"]  # required; no default
+
+
+class FScoreAggregatorSpec(_AggregatorSpecBase):
+    """Run-level F-beta aggregator (multiclass, micro or macro averaged)."""
+
+    type: Literal["fscore"] = "fscore"  # discriminator value for AggregatorSpec
+    classes: list[str] = Field(..., min_length=1)  # required; at least one label
+    averaging: Literal["macro", "micro"]  # required; no default
+    f_value: float = Field(default=1.0, gt=0)  # must be > 0; aliased as "fValue"
+
+
+AggregatorSpec = Annotated[  # tagged union: the "type" field selects the variant
+    Union[PrecisionAggregatorSpec, RecallAggregatorSpec, FScoreAggregatorSpec],
+    Field(discriminator="type"),
+]
diff --git a/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py
new file mode 100644
index 000000000..c00eb666a
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py
@@ -0,0 +1,56 @@
+"""Base abstractions for dataset-level evaluators.
+
+A dataset-level evaluator runs once per evaluation set, after all per-datapoint
+evaluators have produced their results. It consumes the per-datapoint
+EvaluationResultDto values from one named source evaluator and emits a single
+EvaluationResult that summarizes the dataset.
+
+Unlike the earlier pointer-style design, dataset evaluators no longer carry
+their own JSON config or a ``source_evaluator`` field. They are constructed by
+the factory directly from an :class:`AggregatorSpec` embedded in a per-datapoint
+classification evaluator's config, together with the source evaluator's name
+which is supplied externally by the runtime when walking those configs.
+
+Concretely distinct from GenericBaseEvaluator: different evaluate() signature,
+different lifecycle. Kept as a parallel hierarchy rather than a subclass so the
+runtime cannot accidentally dispatch a dataset evaluator through the
+per-datapoint loop.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Generic, TypeVar
+
+from ..models.models import EvaluationResult, EvaluationResultDto
+from ._aggregator_specs import AggregatorSpec
+
+SpecT = TypeVar("SpecT", bound="AggregatorSpec")
+
+
+class BaseDatasetEvaluator(ABC, Generic[SpecT]):
+    """Abstract base for dataset-level evaluators.
+
+    Constructed from an :class:`AggregatorSpec` and the name of the source
+    per-datapoint evaluator whose results this aggregator consumes. Results
+    are keyed by :attr:`name`, ``"{source_evaluator}.{spec.type}"``, so
+    aggregators of different types on one source stay distinct. NOTE: two
+    aggregators of the same type on the same source share a key.
+    """
+
+    spec: SpecT  # aggregator configuration this evaluator was built from
+    source_evaluator: str  # name of the per-datapoint evaluator being aggregated
+
+    def __init__(self, spec: SpecT, source_evaluator: str) -> None:
+        """Store the aggregator spec and the source evaluator name."""
+        self.spec = spec
+        self.source_evaluator = source_evaluator
+
+    @property
+    def name(self) -> str:
+        """Result key: ``{source_evaluator}.{spec.type}`` (same-type specs share it)."""
+        return f"{self.source_evaluator}.{self.spec.type}"
+
+    @abstractmethod
+    def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
+        """Reduce per-datapoint results into a single run-level EvaluationResult."""
diff --git a/packages/uipath/src/uipath/eval/evaluators/base_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/base_evaluator.py
index 73fac46c6..285a022f4 100644
--- a/packages/uipath/src/uipath/eval/evaluators/base_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/base_evaluator.py
@@ -47,6 +47,25 @@ class BaseEvaluatorJustification(BaseModel):
     expected: str
     actual: str
 
+    @classmethod
+    def try_from(cls, details: object) -> "BaseEvaluatorJustification | None":
+        """Return ``details`` as a justification when possible, otherwise ``None``.
+
+        An instance of this class is handed back unchanged and a dict is run
+        through ``model_validate``. Any other input, or a dict that fails
+        validation, yields ``None`` instead of raising, sparing callers their
+        own isinstance/try/except handling.
+        """
+        if isinstance(details, cls):
+            return details
+        if not isinstance(details, dict):
+            return None
+        try:
+            parsed = cls.model_validate(details)
+        except Exception:  # deliberately broad: any parse failure means "no match"
+            return None
+        return parsed
+
 
 # Additional type variables for Config and Justification
 # Note: C must be BaseEvaluatorConfig[T] to ensure type consistency
diff --git a/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py
index d56509228..44a795d90 100644
--- a/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py
@@ -8,6 +8,8 @@
 
 from typing import Literal
 
+from pydantic import model_validator
+
 from ..models import (
     AgentExecution,
     EvaluationResult,
@@ -19,12 +21,22 @@
     UiPathEvaluationError,
     UiPathEvaluationErrorCategory,
 )
+from ._aggregator_specs import AggregatorSpec, FScoreAggregatorSpec
 from .base_evaluator import BaseEvaluationCriteria, BaseEvaluatorJustification
 from .output_evaluator import (
     BaseOutputEvaluator,
     OutputEvaluatorConfig,
 )
 
+# Maps the evaluator-level ``metric_type`` strings to the corresponding
+# aggregator-spec ``type`` values. Only the F-score spelling differs:
+# the evaluator uses "f-score" (hyphen), the aggregator uses "fscore".
+_METRIC_TYPE_TO_AGGREGATOR_TYPE: dict[str, str] = {
+    "precision": "precision",
+    "recall": "recall",
+    "f-score": "fscore",
+}
+
 
 class BinaryClassificationEvaluationCriteria(BaseEvaluationCriteria):
     """Per-datapoint criteria: which class this sample should belong to."""
@@ -41,6 +53,58 @@ class BinaryClassificationEvaluatorConfig(
     positive_class: str
     metric_type: Literal["precision", "recall", "f-score"] = "precision"
     f_value: float = 1.0
+    # Optional run-level aggregators (precision / recall / fscore). Each is a
+    # self-contained spec carrying its own ``classes``, ``averaging``, and
+    # (for fscore) ``f_value``. The dataset-evaluator runtime walks this list
+    # after all per-datapoint evaluators complete and emits one structured
+    # result per aggregator keyed by ``{evaluator_name}.{aggregator.type}``.
+    aggregators: list[AggregatorSpec] | None = None
+
+    @model_validator(mode="after")
+    def _validate_aggregators_against_evaluator_config(
+        self,
+    ) -> "BinaryClassificationEvaluatorConfig":
+        """Fail fast when an aggregator disagrees with this evaluator's config.
+
+        Runs after field validation. A config without aggregators passes
+        untouched; otherwise two rules are enforced:
+          * Every aggregator's ``classes`` must contain ``positive_class``
+            (compared case-insensitively); otherwise the per-datapoint
+            headline and the aggregator's confusion matrix would describe
+            disjoint label spaces.
+          * An F-score aggregator whose ``type`` corresponds to the evaluator's
+            ``metric_type`` (see :data:`_METRIC_TYPE_TO_AGGREGATOR_TYPE`) must
+            carry the evaluator's ``f_value``; otherwise the ``reduce_scores``
+            headline and the aggregator's score would silently diverge.
+
+        Raises:
+            ValueError: If either rule is violated.
+        """
+        if not self.aggregators:
+            return self
+        # An empty positive_class switches the membership check off.
+        wanted = self.positive_class.lower() if self.positive_class else ""
+        headline_type = _METRIC_TYPE_TO_AGGREGATOR_TYPE.get(self.metric_type)
+        for agg in self.aggregators:
+            # Rule 1: the positive class must be one of the aggregator's labels.
+            if wanted and wanted not in {label.lower() for label in agg.classes}:
+                raise ValueError(
+                    f"Aggregator '{agg.type}' on evaluator '{self.name}' "
+                    f"declares classes={agg.classes!r} but positive_class="
+                    f"{self.positive_class!r} is not in that list. Add the "
+                    "positive class to the aggregator's classes or remove it."
+                )
+            # Rule 2 concerns only the F-score aggregator behind the headline.
+            if agg.type != headline_type or not isinstance(agg, FScoreAggregatorSpec):
+                continue
+            if agg.f_value != self.f_value:
+                raise ValueError(
+                    f"Aggregator 'fscore' on evaluator '{self.name}' has "
+                    f"f_value={agg.f_value} but the evaluator's f_value="
+                    f"{self.f_value}. The per-evaluator headline and the "
+                    "aggregator would compute different F-beta scores."
+                )
+        return self
 
 
 class BinaryClassificationEvaluator(
@@ -98,14 +162,8 @@ def reduce_scores(self, results: list[EvaluationResultDto]) -> float:
         tp = fp = fn = 0
 
         for r in results:
-            if isinstance(r.details, BaseEvaluatorJustification):
-                details = r.details
-            elif isinstance(r.details, dict):
-                try:
-                    details = BaseEvaluatorJustification.model_validate(r.details)
-                except Exception:
-                    continue
-            else:
+            details = BaseEvaluatorJustification.try_from(r.details)
+            if details is None:
                 continue
             pred = details.actual
             exp = details.expected
diff --git a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
new file mode 100644
index 000000000..7f2ca2519
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
@@ -0,0 +1,199 @@
+"""Dataset-level classification evaluators: Precision, Recall, F-score.
+
+All three share the same internal machinery — a k x k confusion matrix built
+from each per-datapoint result's BaseEvaluatorJustification (expected, actual)
+strings. They differ only in the final formula and (for F-score) the beta
+parameter. The headline ``score`` is the micro or macro average per the
+embedded :class:`AggregatorSpec`; ``details`` carries the full per-class
+breakdown plus the confusion matrix.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from pydantic import BaseModel, ConfigDict, Field
+from pydantic.alias_generators import to_camel
+
+from ..models.models import (
+    EvaluationResult,
+    EvaluationResultDto,
+    NumericEvaluationResult,
+)
+from ._aggregator_specs import AggregatorSpec, FScoreAggregatorSpec
+from .base_dataset_evaluator import BaseDatasetEvaluator
+from .base_evaluator import BaseEvaluatorJustification
+
+
+class PerClassMetrics(BaseModel):
+    """One-vs-rest confusion counts for a single class, plus its metric value."""
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    tp: int  # predicted this class and expected this class
+    tn: int  # scored datapoints that are neither tp, fp, nor fn for this class
+    fp: int  # predicted this class, expected another
+    fn: int  # expected this class, predicted another
+    support: int  # tp + fn: scored datapoints whose expected label is this class
+    value: float  # precision, recall, or F-beta for this class, per the spec type
+
+
+class ClassificationDetails(BaseModel):
+    """Structured details payload emitted by the classification dataset evaluator."""
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    metric: str  # aggregator type: "precision", "recall", or "fscore"
+    average: str  # averaging mode from the spec; selects the headline score
+    classes: list[str]  # class labels, in matrix row/column order
+    confusion_matrix: list[list[int]] = Field(
+        ...,
+        description=(
+            "k x k confusion matrix indexed as "
+            "``confusion_matrix[predicted_idx][expected_idx]`` "
+            "(rows are predicted classes, columns are expected). "
+            "This is the transpose of sklearn's convention "
+            "(``[true][predicted]``); UI / consumer code must use the "
+            "orientation documented here."
+        ),
+    )
+    per_class: dict[str, PerClassMetrics]  # keyed by class label
+    micro: float  # metric computed from counts summed over all classes
+    macro: float  # unweighted mean of the per-class metric values
+    n_total: int  # number of per-datapoint results received
+    n_scored: int  # results counted in the confusion matrix
+    n_skipped: int  # results left out (unparseable details or label not in classes)
+
+
+@dataclass(slots=True)
+class _ConfusionData:
+    """Internal: confusion matrix and tallies produced by ``_build_confusion``."""
+
+    classes: list[str]  # labels in the caller-supplied casing and order
+    matrix: list[list[int]]  # k x k counts indexed [predicted_idx][expected_idx]
+    n_total: int  # all results seen
+    n_scored: int  # results added to the matrix
+    n_skipped: int  # results omitted from the matrix
+
+
+def _build_confusion(
+    results: list[EvaluationResultDto],
+    classes: list[str],
+) -> _ConfusionData:
+    """Tally per-datapoint (expected, actual) label pairs into a k x k matrix.
+
+    A result is skipped (counted in ``n_skipped`` and left out of the matrix)
+    when its details do not parse as a justification, or when its expected or
+    actual label is not one of ``classes``. Labels are compared in lowercase,
+    so a classifier answering "Book" still matches a configured "book". The
+    returned ``_ConfusionData.classes`` keep the caller's casing so downstream
+    output (per_class keys, UI labels) shows what the user typed. Cells are
+    addressed as ``matrix[predicted_idx][expected_idx]``.
+    """
+    # Lowercased label -> matrix index, giving case-insensitive lookups.
+    position = {label.lower(): idx for idx, label in enumerate(classes)}
+    size = len(classes)
+    matrix = [[0] * size for _ in range(size)]
+    scored = 0
+    skipped = 0
+
+    for result in results:
+        justification = BaseEvaluatorJustification.try_from(result.details)
+        if justification is None:
+            skipped += 1
+            continue
+        # Out-of-vocabulary labels resolve to None and are skipped below.
+        column = position.get(justification.expected.lower())
+        row = position.get(justification.actual.lower())
+        if row is None or column is None:
+            skipped += 1
+            continue
+        matrix[row][column] += 1
+        scored += 1
+
+    return _ConfusionData(
+        classes=list(classes),
+        matrix=matrix,
+        n_total=len(results),
+        n_scored=scored,
+        n_skipped=skipped,
+    )
+
+
+class ClassificationDatasetEvaluator(BaseDatasetEvaluator[AggregatorSpec]):
+    """Dataset-level precision, recall, and F-beta over one confusion matrix.
+
+    A single class serves all three aggregator types: ``self.spec.type``
+    selects the per-class formula, while the confusion-matrix build, the
+    per-class counts, and the micro/macro averaging are shared.
+    """
+
+    def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
+        """Build the metric report and return its micro or macro headline as score."""
+        data = _build_confusion(results, self.spec.classes)
+        labels = data.classes
+        k = len(labels)
+        kind = self.spec.type
+        # beta_sq feeds the F-beta formula only; it stays 0.0 for other metrics.
+        beta_sq = 0.0
+        if isinstance(self.spec, FScoreAggregatorSpec):
+            beta_sq = self.spec.f_value * self.spec.f_value
+
+        # Rows hold predicted classes and columns hold expected classes.
+        predicted_totals = [sum(row) for row in data.matrix]
+        expected_totals = [sum(column) for column in zip(*data.matrix)]
+
+        per_class: dict[str, PerClassMetrics] = {}
+        sum_tp = sum_fp = sum_fn = 0
+        for idx, label in enumerate(labels):
+            tp = data.matrix[idx][idx]
+            fp = predicted_totals[idx] - tp
+            fn = expected_totals[idx] - tp
+            sum_tp += tp
+            sum_fp += fp
+            sum_fn += fn
+            per_class[label] = PerClassMetrics(
+                tp=tp,
+                tn=data.n_scored - tp - fp - fn,
+                fp=fp,
+                fn=fn,
+                support=tp + fn,
+                value=_metric(kind, tp, fp, fn, beta_sq),
+            )
+
+        micro = _metric(kind, sum_tp, sum_fp, sum_fn, beta_sq)
+        # AggregatorSpec.classes has min_length=1, so k >= 1 always.
+        macro = sum(per_class[label].value for label in labels) / k
+
+        details = ClassificationDetails(
+            metric=kind,
+            average=self.spec.averaging,
+            classes=labels,
+            confusion_matrix=data.matrix,
+            per_class=per_class,
+            micro=micro,
+            macro=macro,
+            n_total=data.n_total,
+            n_scored=data.n_scored,
+            n_skipped=data.n_skipped,
+        )
+
+        score = micro if self.spec.averaging == "micro" else macro
+        return NumericEvaluationResult(score=score, details=details)
+
+
+def _metric(metric_type: str, tp: int, fp: int, fn: int, beta_sq: float) -> float:
+    """Return precision, recall, or F-beta from raw counts (0.0 when undefined)."""
+    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+    if metric_type == "precision":
+        return precision
+    if metric_type == "recall":
+        return recall
+    if metric_type == "fscore":
+        weighted = beta_sq * precision + recall
+        return (1 + beta_sq) * precision * recall / weighted if weighted > 0 else 0.0
+    raise ValueError(
+        f"Unknown metric_type: {metric_type!r}. "
+        "Expected one of: precision, recall, fscore."
+    )
diff --git a/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py
new file mode 100644
index 000000000..9cd895ad2
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py
@@ -0,0 +1,27 @@
+"""Factory that instantiates dataset-level evaluators from aggregator specs.
+
+Dataset evaluators are built from a self-contained :class:`AggregatorSpec`
+embedded in a per-datapoint classification evaluator's config, plus the source
+evaluator's name (supplied by the runtime when walking those configs). All
+three aggregator types share a single :class:`ClassificationDatasetEvaluator`
+implementation that dispatches on ``spec.type`` internally.
+"""
+
+from __future__ import annotations
+
+from ._aggregator_specs import AggregatorSpec
+from .classification_dataset_evaluators import ClassificationDatasetEvaluator
+
+
+def build_dataset_evaluator(
+    spec: AggregatorSpec,
+    source_evaluator: str,
+) -> ClassificationDatasetEvaluator:
+    """Create the dataset evaluator that scores ``spec`` over a whole run.
+
+    Args:
+        spec: A validated :class:`AggregatorSpec` (precision / recall / fscore).
+        source_evaluator: Name of the per-datapoint evaluator being aggregated.
+    """
+    evaluator = ClassificationDatasetEvaluator(spec, source_evaluator)
+    return evaluator
diff --git a/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py
index 69790c3aa..1799323ac 100644
--- a/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py
@@ -9,6 +9,8 @@
 
 from typing import Literal
 
+from pydantic import model_validator
+
 from ..models import (
     AgentExecution,
     EvaluationResult,
@@ -20,12 +22,22 @@
     UiPathEvaluationError,
     UiPathEvaluationErrorCategory,
 )
+from ._aggregator_specs import AggregatorSpec, FScoreAggregatorSpec
 from .base_evaluator import BaseEvaluationCriteria, BaseEvaluatorJustification
 from .output_evaluator import (
     BaseOutputEvaluator,
     OutputEvaluatorConfig,
 )
 
+# Translates the evaluator-level ``metric_type`` spelling into the matching
+# aggregator-spec ``type`` value. Only F-score differs, for historical reasons:
+# the evaluator config says "f-score" (hyphen), the aggregator says "fscore".
+_METRIC_TYPE_TO_AGGREGATOR_TYPE = {
+    "precision": "precision",
+    "recall": "recall",
+    "f-score": "fscore",
+}
+
 
 class MulticlassClassificationEvaluationCriteria(BaseEvaluationCriteria):
     """Per-datapoint criteria: which class this sample should belong to."""
@@ -43,6 +55,67 @@ class MulticlassClassificationEvaluatorConfig(
     metric_type: Literal["precision", "recall", "f-score"] = "f-score"
     averaging: Literal["micro", "macro"] = "macro"
     f_value: float = 1.0
+    # Optional run-level aggregators (precision / recall / fscore). Each is a
+    # self-contained spec carrying its own ``classes``, ``averaging``, and
+    # (for fscore) ``f_value``. The dataset-evaluator runtime walks this list
+    # after all per-datapoint evaluators complete and emits one structured
+    # result per aggregator keyed by ``{evaluator_name}.{aggregator.type}``.
+    aggregators: list[AggregatorSpec] | None = None
+
+    @model_validator(mode="after")
+    def _validate_aggregators_against_evaluator_config(
+        self,
+    ) -> "MulticlassClassificationEvaluatorConfig":
+        """Fail fast when an aggregator disagrees with this evaluator's config.
+
+        Runs after field validation. A config without aggregators passes
+        untouched; otherwise two rules are enforced:
+          * Each aggregator's ``classes`` must include every evaluator-level
+            class (compared case-insensitively); otherwise the per-datapoint
+            and aggregator paths would score disjoint label spaces.
+          * An aggregator whose ``type`` corresponds to the evaluator's
+            ``metric_type`` (see :data:`_METRIC_TYPE_TO_AGGREGATOR_TYPE`) must
+            use the evaluator's ``averaging`` and, for ``fscore``, its
+            ``f_value``; otherwise the per-evaluator headline and the
+            aggregator's score would silently diverge.
+
+        Raises:
+            ValueError: If either rule is violated.
+        """
+        if not self.aggregators:
+            return self
+        required = {label.lower() for label in self.classes}
+        headline_type = _METRIC_TYPE_TO_AGGREGATOR_TYPE.get(self.metric_type)
+        for agg in self.aggregators:
+            # Rule 1: the aggregator must cover the evaluator's full class space.
+            missing = required - {label.lower() for label in agg.classes}
+            if missing:
+                raise ValueError(
+                    f"Aggregator '{agg.type}' on evaluator '{self.name}' "
+                    f"declares classes={agg.classes!r} but the evaluator's "
+                    f"classes={self.classes!r} include {sorted(missing)!r} "
+                    "that the aggregator does not. Aggregators must cover "
+                    "the evaluator's full class space."
+                )
+            # Rule 2 concerns only the aggregator mirroring the headline metric.
+            if agg.type != headline_type:
+                continue
+            if agg.averaging != self.averaging:
+                raise ValueError(
+                    f"Aggregator '{agg.type}' on evaluator '{self.name}' "
+                    f"has averaging={agg.averaging!r} but the evaluator's "
+                    f"averaging={self.averaging!r}. The per-evaluator "
+                    "headline and the aggregator would compute different "
+                    "scores."
+                )
+            if isinstance(agg, FScoreAggregatorSpec) and agg.f_value != self.f_value:
+                raise ValueError(
+                    f"Aggregator 'fscore' on evaluator '{self.name}' has "
+                    f"f_value={agg.f_value} but the evaluator's f_value="
+                    f"{self.f_value}. The per-evaluator headline and the "
+                    "aggregator would compute different F-beta scores."
+                )
+        return self
 
 
 class MulticlassClassificationEvaluator(
@@ -69,7 +142,16 @@ async def evaluate(
         agent_execution: AgentExecution,
         evaluation_criteria: MulticlassClassificationEvaluationCriteria,
     ) -> EvaluationResult:
-        """Evaluate multiclass classification by comparing predicted vs expected class."""
+        """Evaluate multiclass classification by comparing predicted vs expected class.
+
+        Configuration errors (e.g. ``expected_class`` not in the configured
+        ``classes``) raise — that's a setup mistake the user must fix. But a
+        predicted class outside the vocabulary (a sloppy LLM returning
+        "unknown", garbage, or an unconfigured label) returns a 0.0 score with
+        the OOV label preserved in the justification, mirroring the binary
+        evaluator's behavior. The dataset evaluator's confusion matrix
+        accounts for these via ``n_skipped``.
+        """
         predicted_class = str(self._get_actual_output(agent_execution)).lower()
         expected_class = evaluation_criteria.expected_class.lower()
         classes = [c.lower() for c in self.evaluator_config.classes]
@@ -82,14 +164,6 @@ async def evaluate(
                 category=UiPathEvaluationErrorCategory.USER,
             )
 
-        if predicted_class not in classes:
-            raise UiPathEvaluationError(
-                code="INVALID_PREDICTED_CLASS",
-                title="Predicted class not in configured classes",
-                detail=f"Predicted class '{predicted_class}' is not in the configured classes: {classes}",
-                category=UiPathEvaluationErrorCategory.USER,
-            )
-
         score = 1.0 if predicted_class == expected_class else 0.0
 
         justification = self.validate_justification(
@@ -114,14 +188,8 @@ def reduce_scores(self, results: list[EvaluationResultDto]) -> float:
         # Reconstruct confusion matrix: confusion[pred_idx][exp_idx]
         confusion = [[0] * k for _ in range(k)]
         for r in results:
-            if isinstance(r.details, BaseEvaluatorJustification):
-                details = r.details
-            elif isinstance(r.details, dict):
-                try:
-                    details = BaseEvaluatorJustification.model_validate(r.details)
-                except Exception:
-                    continue
-            else:
+            details = BaseEvaluatorJustification.try_from(r.details)
+            if details is None:
                 continue
             pred = details.actual
             exp = details.expected
diff --git a/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json b/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json
new file mode 100644
index 000000000..a15ac8e5a
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json
@@ -0,0 +1,269 @@
+{
+  "evaluatorTypeId": "uipath-binary-classification",
+  "evaluatorConfigSchema": {
+    "$defs": {
+      "BinaryClassificationEvaluationCriteria": {
+        "description": "Per-datapoint criteria: which class this sample should belong to.",
+        "properties": {
+          "expected_class": {
+            "title": "Expected Class",
+            "type": "string"
+          }
+        },
+        "required": [
+          "expected_class"
+        ],
+        "title": "BinaryClassificationEvaluationCriteria",
+        "type": "object"
+      },
+      "FScoreAggregatorSpec": {
+        "description": "Run-level F-beta aggregator (multiclass, micro or macro averaged).",
+        "properties": {
+          "type": {
+            "const": "fscore",
+            "default": "fscore",
+            "title": "Type",
+            "type": "string"
+          },
+          "classes": {
+            "items": {
+              "type": "string"
+            },
+            "minItems": 1,
+            "title": "Classes",
+            "type": "array"
+          },
+          "averaging": {
+            "enum": [
+              "macro",
+              "micro"
+            ],
+            "title": "Averaging",
+            "type": "string"
+          },
+          "f_value": {
+            "default": 1.0,
+            "exclusiveMinimum": 0,
+            "title": "F Value",
+            "type": "number"
+          }
+        },
+        "required": [
+          "classes",
+          "averaging"
+        ],
+        "title": "FScoreAggregatorSpec",
+        "type": "object"
+      },
+      "PrecisionAggregatorSpec": {
+        "description": "Run-level precision aggregator (multiclass, micro or macro averaged).",
+        "properties": {
+          "type": {
+            "const": "precision",
+            "default": "precision",
+            "title": "Type",
+            "type": "string"
+          },
+          "classes": {
+            "items": {
+              "type": "string"
+            },
+            "minItems": 1,
+            "title": "Classes",
+            "type": "array"
+          },
+          "averaging": {
+            "enum": [
+              "macro",
+              "micro"
+            ],
+            "title": "Averaging",
+            "type": "string"
+          }
+        },
+        "required": [
+          "classes",
+          "averaging"
+        ],
+        "title": "PrecisionAggregatorSpec",
+        "type": "object"
+      },
+      "RecallAggregatorSpec": {
+        "description": "Run-level recall aggregator (multiclass, micro or macro averaged).",
+        "properties": {
+          "type": {
+            "const": "recall",
+            "default": "recall",
+            "title": "Type",
+            "type": "string"
+          },
+          "classes": {
+            "items": {
+              "type": "string"
+            },
+            "minItems": 1,
+            "title": "Classes",
+            "type": "array"
+          },
+          "averaging": {
+            "enum": [
+              "macro",
+              "micro"
+            ],
+            "title": "Averaging",
+            "type": "string"
+          }
+        },
+        "required": [
+          "classes",
+          "averaging"
+        ],
+        "title": "RecallAggregatorSpec",
+        "type": "object"
+      }
+    },
+    "description": "Configuration for the binary classification evaluator.",
+    "properties": {
+      "name": {
+        "default": "BinaryClassificationEvaluator",
+        "title": "Name",
+        "type": "string"
+      },
+      "description": {
+        "default": "",
+        "description": "The description of the evaluator",
+        "title": "Description",
+        "type": "string"
+      },
+      "default_evaluation_criteria": {
+        "anyOf": [
+          {
+            "$ref": "#/$defs/BinaryClassificationEvaluationCriteria"
+          },
+          {
+            "type": "null"
+          }
+        ],
+        "default": null
+      },
+      "target_output_key": {
+        "anyOf": [
+          {
+            "type": "string"
+          },
+          {
+            "items": {
+              "type": "string"
+            },
+            "type": "array"
+          }
+        ],
+        "default": "*",
+        "description": "Key or list of keys to extract output from agent execution",
+        "title": "Target Output Key"
+      },
+      "line_by_line_evaluator": {
+        "default": false,
+        "description": "If True, split output by delimiter and evaluate each line separately",
+        "title": "Line By Line Evaluator",
+        "type": "boolean"
+      },
+      "line_delimiter": {
+        "default": "\n",
+        "description": "Delimiter to split output when line_by_line_evaluator is True",
+        "title": "Line Delimiter",
+        "type": "string"
+      },
+      "positive_class": {
+        "title": "Positive Class",
+        "type": "string"
+      },
+      "metric_type": {
+        "default": "precision",
+        "enum": [
+          "precision",
+          "recall",
+          "f-score"
+        ],
+        "title": "Metric Type",
+        "type": "string"
+      },
+      "f_value": {
+        "default": 1.0,
+        "title": "F Value",
+        "type": "number"
+      },
+      "aggregators": {
+        "anyOf": [
+          {
+            "items": {
+              "discriminator": {
+                "mapping": {
+                  "fscore": "#/$defs/FScoreAggregatorSpec",
+                  "precision": "#/$defs/PrecisionAggregatorSpec",
+                  "recall": "#/$defs/RecallAggregatorSpec"
+                },
+                "propertyName": "type"
+              },
+              "oneOf": [
+                {
+                  "$ref": "#/$defs/PrecisionAggregatorSpec"
+                },
+                {
+                  "$ref": "#/$defs/RecallAggregatorSpec"
+                },
+                {
+                  "$ref": "#/$defs/FScoreAggregatorSpec"
+                }
+              ]
+            },
+            "type": "array"
+          },
+          {
+            "type": "null"
+          }
+        ],
+        "default": null,
+        "title": "Aggregators"
+      }
+    },
+    "required": [
+      "positive_class"
+    ],
+    "title": "BinaryClassificationEvaluatorConfig",
+    "type": "object"
+  },
+  "evaluationCriteriaSchema": {
+    "description": "Per-datapoint criteria: which class this sample should belong to.",
+    "properties": {
+      "expected_class": {
+        "title": "Expected Class",
+        "type": "string"
+      }
+    },
+    "required": [
+      "expected_class"
+    ],
+    "title": "BinaryClassificationEvaluationCriteria",
+    "type": "object"
+  },
+  "justificationSchema": {
+    "description": "Base class for all evaluator justifications.",
+    "properties": {
+      "expected": {
+        "title": "Expected",
+        "type": "string"
+      },
+      "actual": {
+        "title": "Actual",
+        "type": "string"
+      }
+    },
+    "required": [
+      "expected",
+      "actual"
+    ],
+    "title": "BaseEvaluatorJustification",
+    "type": "object"
+  }
+}
\ No newline at end of file
diff --git a/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json b/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json
new file mode 100644
index 000000000..8cc971f75
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json
@@ -0,0 +1,281 @@
+{
+  "evaluatorTypeId": "uipath-multiclass-classification",
+  "evaluatorConfigSchema": {
+    "$defs": {
+      "FScoreAggregatorSpec": {
+        "description": "Run-level F-beta aggregator (multiclass, micro or macro averaged).",
+        "properties": {
+          "type": {
+            "const": "fscore",
+            "default": "fscore",
+            "title": "Type",
+            "type": "string"
+          },
+          "classes": {
+            "items": {
+              "type": "string"
+            },
+            "minItems": 1,
+            "title": "Classes",
+            "type": "array"
+          },
+          "averaging": {
+            "enum": [
+              "macro",
+              "micro"
+            ],
+            "title": "Averaging",
+            "type": "string"
+          },
+          "f_value": {
+            "default": 1.0,
+            "exclusiveMinimum": 0,
+            "title": "F Value",
+            "type": "number"
+          }
+        },
+        "required": [
+          "classes",
+          "averaging"
+        ],
+        "title": "FScoreAggregatorSpec",
+        "type": "object"
+      },
+      "MulticlassClassificationEvaluationCriteria": {
+        "description": "Per-datapoint criteria: which class this sample should belong to.",
+        "properties": {
+          "expected_class": {
+            "title": "Expected Class",
+            "type": "string"
+          }
+        },
+        "required": [
+          "expected_class"
+        ],
+        "title": "MulticlassClassificationEvaluationCriteria",
+        "type": "object"
+      },
+      "PrecisionAggregatorSpec": {
+        "description": "Run-level precision aggregator (multiclass, micro or macro averaged).",
+        "properties": {
+          "type": {
+            "const": "precision",
+            "default": "precision",
+            "title": "Type",
+            "type": "string"
+          },
+          "classes": {
+            "items": {
+              "type": "string"
+            },
+            "minItems": 1,
+            "title": "Classes",
+            "type": "array"
+          },
+          "averaging": {
+            "enum": [
+              "macro",
+              "micro"
+            ],
+            "title": "Averaging",
+            "type": "string"
+          }
+        },
+        "required": [
+          "classes",
+          "averaging"
+        ],
+        "title": "PrecisionAggregatorSpec",
+        "type": "object"
+      },
+      "RecallAggregatorSpec": {
+        "description": "Run-level recall aggregator (multiclass, micro or macro averaged).",
+        "properties": {
+          "type": {
+            "const": "recall",
+            "default": "recall",
+            "title": "Type",
+            "type": "string"
+          },
+          "classes": {
+            "items": {
+              "type": "string"
+            },
+            "minItems": 1,
+            "title": "Classes",
+            "type": "array"
+          },
+          "averaging": {
+            "enum": [
+              "macro",
+              "micro"
+            ],
+            "title": "Averaging",
+            "type": "string"
+          }
+        },
+        "required": [
+          "classes",
+          "averaging"
+        ],
+        "title": "RecallAggregatorSpec",
+        "type": "object"
+      }
+    },
+    "description": "Configuration for the multiclass classification evaluator.",
+    "properties": {
+      "name": {
+        "default": "MulticlassClassificationEvaluator",
+        "title": "Name",
+        "type": "string"
+      },
+      "description": {
+        "default": "",
+        "description": "The description of the evaluator",
+        "title": "Description",
+        "type": "string"
+      },
+      "default_evaluation_criteria": {
+        "anyOf": [
+          {
+            "$ref": "#/$defs/MulticlassClassificationEvaluationCriteria"
+          },
+          {
+            "type": "null"
+          }
+        ],
+        "default": null
+      },
+      "target_output_key": {
+        "anyOf": [
+          {
+            "type": "string"
+          },
+          {
+            "items": {
+              "type": "string"
+            },
+            "type": "array"
+          }
+        ],
+        "default": "*",
+        "description": "Key or list of keys to extract output from agent execution",
+        "title": "Target Output Key"
+      },
+      "line_by_line_evaluator": {
+        "default": false,
+        "description": "If True, split output by delimiter and evaluate each line separately",
+        "title": "Line By Line Evaluator",
+        "type": "boolean"
+      },
+      "line_delimiter": {
+        "default": "\n",
+        "description": "Delimiter to split output when line_by_line_evaluator is True",
+        "title": "Line Delimiter",
+        "type": "string"
+      },
+      "classes": {
+        "items": {
+          "type": "string"
+        },
+        "title": "Classes",
+        "type": "array"
+      },
+      "metric_type": {
+        "default": "f-score",
+        "enum": [
+          "precision",
+          "recall",
+          "f-score"
+        ],
+        "title": "Metric Type",
+        "type": "string"
+      },
+      "averaging": {
+        "default": "macro",
+        "enum": [
+          "micro",
+          "macro"
+        ],
+        "title": "Averaging",
+        "type": "string"
+      },
+      "f_value": {
+        "default": 1.0,
+        "title": "F Value",
+        "type": "number"
+      },
+      "aggregators": {
+        "anyOf": [
+          {
+            "items": {
+              "discriminator": {
+                "mapping": {
+                  "fscore": "#/$defs/FScoreAggregatorSpec",
+                  "precision": "#/$defs/PrecisionAggregatorSpec",
+                  "recall": "#/$defs/RecallAggregatorSpec"
+                },
+                "propertyName": "type"
+              },
+              "oneOf": [
+                {
+                  "$ref": "#/$defs/PrecisionAggregatorSpec"
+                },
+                {
+                  "$ref": "#/$defs/RecallAggregatorSpec"
+                },
+                {
+                  "$ref": "#/$defs/FScoreAggregatorSpec"
+                }
+              ]
+            },
+            "type": "array"
+          },
+          {
+            "type": "null"
+          }
+        ],
+        "default": null,
+        "title": "Aggregators"
+      }
+    },
+    "required": [
+      "classes"
+    ],
+    "title": "MulticlassClassificationEvaluatorConfig",
+    "type": "object"
+  },
+  "evaluationCriteriaSchema": {
+    "description": "Per-datapoint criteria: which class this sample should belong to.",
+    "properties": {
+      "expected_class": {
+        "title": "Expected Class",
+        "type": "string"
+      }
+    },
+    "required": [
+      "expected_class"
+    ],
+    "title": "MulticlassClassificationEvaluationCriteria",
+    "type": "object"
+  },
+  "justificationSchema": {
+    "description": "Base class for all evaluator justifications.",
+    "properties": {
+      "expected": {
+        "title": "Expected",
+        "type": "string"
+      },
+      "actual": {
+        "title": "Actual",
+        "type": "string"
+      }
+    },
+    "required": [
+      "expected",
+      "actual"
+    ],
+    "title": "BaseEvaluatorJustification",
+    "type": "object"
+  }
+}
\ No newline at end of file
diff --git a/packages/uipath/src/uipath/eval/runtime/_types.py b/packages/uipath/src/uipath/eval/runtime/_types.py
index 2aee5e599..fa84f0d9e 100644
--- a/packages/uipath/src/uipath/eval/runtime/_types.py
+++ b/packages/uipath/src/uipath/eval/runtime/_types.py
@@ -1,7 +1,7 @@
 import logging
 
 from opentelemetry.sdk.trace import ReadableSpan
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field
 from pydantic.alias_generators import to_camel
 
 from uipath.runtime import UiPathRuntimeResult
@@ -78,6 +78,9 @@ class UiPathEvalOutput(BaseModel):
 
     evaluation_set_name: str
     evaluation_set_results: list[UiPathEvalRunResult]
+    dataset_evaluator_results: dict[str, EvaluationResultDto] = Field(
+        default_factory=dict
+    )
 
     @property
     def score(self) -> float:
diff --git a/packages/uipath/src/uipath/eval/runtime/runtime.py b/packages/uipath/src/uipath/eval/runtime/runtime.py
index 7f7614446..7167d7f20 100644
--- a/packages/uipath/src/uipath/eval/runtime/runtime.py
+++ b/packages/uipath/src/uipath/eval/runtime/runtime.py
@@ -45,7 +45,15 @@
 from uipath.runtime.schema import UiPathRuntimeSchema
 
 from .._execution_context import ExecutionSpanCollector
+from ..evaluators._aggregator_specs import AggregatorSpec, FScoreAggregatorSpec
 from ..evaluators.base_evaluator import GenericBaseEvaluator
+from ..evaluators.binary_classification_evaluator import (
+    BinaryClassificationEvaluatorConfig,
+)
+from ..evaluators.dataset_evaluator_factory import build_dataset_evaluator
+from ..evaluators.multiclass_classification_evaluator import (
+    MulticlassClassificationEvaluatorConfig,
+)
 from ..evaluators.output_evaluator import OutputEvaluationCriteria
 from ..helpers import get_agent_model
 from ..mocks._cache_manager import CacheManager
@@ -202,6 +210,97 @@ def compute_evaluator_scores(
     return final_score, agg_metrics_per_evaluator
 
 
+def compute_dataset_evaluator_results(
+    evaluation_set_results: list[UiPathEvalRunResult],
+    evaluators: Iterable[GenericBaseEvaluator[Any, Any, Any]],
+) -> dict[str, EvaluationResultDto]:
+    """Run any dataset-level aggregators embedded in per-datapoint evaluator configs.
+
+    Walks ``evaluators`` looking for any whose config carries an ``aggregators``
+    list (currently only Binary/Multiclass classification). For each aggregator
+    spec, builds the corresponding dataset evaluator via the factory and runs it
+    over the per-datapoint results that came from that source evaluator.
+
+    Args:
+        evaluation_set_results: Per-datapoint results from the run.
+        evaluators: Per-datapoint evaluator instances that ran during this eval
+            set. Their configs may carry ``aggregators`` lists.
+
+    Returns:
+        Dict mapping ``"{evaluator_name}.{aggregator_type}"`` to the run-level
+        EvaluationResultDto. When the same aggregator ``type`` appears more
+        than once on a source (e.g. macro+micro precision), each variant is
+        disambiguated as ``"{evaluator_name}.{type}.{averaging}"`` and, for
+        fscore, with the ``f_value`` suffix (``"...fbN"``), so a duplicate
+        type never overwrites a previous result. Aggregators whose source
+        produced no results are still invoked with an empty list so they emit
+        a zeroed result.
+    """
+    results_by_evaluator: defaultdict[str, list[EvaluationResultDto]] = defaultdict(
+        list
+    )
+    for eval_run_result in evaluation_set_results:
+        for eval_run_result_dto in eval_run_result.evaluation_run_results:
+            if eval_run_result_dto.is_line_result:
+                continue
+            results_by_evaluator[eval_run_result_dto.evaluator_name].append(
+                eval_run_result_dto.result
+            )
+
+    dataset_results: dict[str, EvaluationResultDto] = {}
+    for evaluator in evaluators:
+        # Aggregators currently only live on classification evaluator configs.
+        # ``GenericBaseEvaluator`` doesn't declare ``evaluator_config``, so we
+        # retrieve it via ``getattr`` and narrow with ``isinstance`` to a
+        # classification config type before reading ``aggregators``. Widen the
+        # tuple if a future evaluator type grows an ``aggregators`` field.
+        config = getattr(evaluator, "evaluator_config", None)
+        if not isinstance(
+            config,
+            (
+                BinaryClassificationEvaluatorConfig,
+                MulticlassClassificationEvaluatorConfig,
+            ),
+        ):
+            continue
+        if not config.aggregators:
+            continue
+        source_name = config.name
+        source_results = results_by_evaluator.get(source_name, [])
+        # Count occurrences of each aggregator type to detect duplicates
+        # (e.g. macro+micro precision on the same source). The default key
+        # shape ``{source}.{type}`` collides on duplicates; disambiguate with
+        # ``.{averaging}`` (and ``.fb{f_value}`` for fscore variants) only
+        # when more than one aggregator of that type exists, to preserve the
+        # simple key shape in the common case.
+        type_counts: dict[str, int] = defaultdict(int)
+        for spec in config.aggregators:
+            type_counts[spec.type] += 1
+        for spec in config.aggregators:
+            dataset_evaluator = build_dataset_evaluator(spec, source_name)
+            key = _dataset_result_key(source_name, spec, type_counts[spec.type] > 1)
+            dataset_results[key] = EvaluationResultDto.from_evaluation_result(
+                dataset_evaluator.evaluate(source_results)
+            )
+    return dataset_results
+
+
+def _dataset_result_key(
+    source_name: str, spec: AggregatorSpec, disambiguate: bool
+) -> str:
+    """Build the result-dict key for a dataset evaluator.
+
+    Uses ``{source}.{type}`` for unique-type aggregators, and appends
+    ``.{averaging}`` (plus ``.fb{f_value}`` for fscore) when the same type
+    appears more than once on the same source.
+    """
+    if not disambiguate:
+        return f"{source_name}.{spec.type}"
+    if isinstance(spec, FScoreAggregatorSpec):
+        return f"{source_name}.{spec.type}.{spec.averaging}.fb{spec.f_value}"
+    return f"{source_name}.{spec.type}.{spec.averaging}"
+
+
 class UiPathEvalRuntime:
     """Specialized runtime for evaluation runs, with access to the factory."""
 
@@ -381,6 +480,19 @@ async def execute(self) -> UiPathRuntimeResult:
                         evaluators,
                     )
 
+                    # Run any dataset-level aggregators embedded in per-datapoint
+                    # classification evaluator configs (the ``aggregators`` list).
+                    # Each aggregator consumes per-datapoint results from its
+                    # parent evaluator and emits one run-level EvaluationResultDto
+                    # keyed ``{evaluator_name}.{aggregator_type}`` on
+                    # UiPathEvalOutput.dataset_evaluator_results.
+                    results.dataset_evaluator_results = (
+                        compute_dataset_evaluator_results(
+                            results.evaluation_set_results,
+                            evaluators,
+                        )
+                    )
+
                     # Configure span with output and metadata
                     await configure_eval_set_run_span(
                         span=span,
diff --git a/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py b/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py
new file mode 100644
index 000000000..d87d9013e
--- /dev/null
+++ b/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py
@@ -0,0 +1,227 @@
+"""End-to-end tests that run the classification sample projects through evaluate().
+
+These tests double as integration coverage for the binary and multiclass
+classification evaluators added in #1397 plus the embedded dataset-level
+aggregators added in #1669 — they wire each sample's main.py into a stand-in
+runtime, run the full eval set, and assert the per-row scores AND the
+specific aggregator scores produced by the embedded ``aggregators[]``. A
+regression that returns 0.0 for all aggregators (or one that swaps macro
+for micro silently) fails these tests.
+"""
+
+import importlib.util
+import uuid
+from pathlib import Path
+from types import ModuleType
+from typing import Any, AsyncGenerator
+
+import pytest
+
+from uipath.core.events import EventBus
+from uipath.core.tracing import UiPathTraceManager
+from uipath.eval.helpers import EvalHelpers
+from uipath.eval.runtime import UiPathEvalContext, evaluate
+from uipath.eval.runtime._types import UiPathEvalOutput
+from uipath.eval.runtime.runtime import compute_evaluator_scores
+from uipath.runtime import (
+    UiPathExecuteOptions,
+    UiPathRuntimeEvent,
+    UiPathRuntimeFactorySettings,
+    UiPathRuntimeProtocol,
+    UiPathRuntimeResult,
+    UiPathRuntimeStatus,
+    UiPathRuntimeStorageProtocol,
+    UiPathStreamOptions,
+)
+from uipath.runtime.schema import UiPathRuntimeSchema
+
+SAMPLES_DIR = Path(__file__).resolve().parents[3] / "samples"
+
+
+def _load_sample_main(sample_dir: Path) -> ModuleType:
+    """Import a sample's main.py as an isolated module."""
+    module_name = f"_eval_sample_{sample_dir.name}"
+    spec = importlib.util.spec_from_file_location(module_name, sample_dir / "main.py")
+    assert spec is not None and spec.loader is not None
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+class _SampleRuntime:
+    """Runtime that delegates execution to the sample's `main` function."""
+
+    def __init__(self, sample_main: Any) -> None:
+        self._sample_main = sample_main
+
+    async def execute(
+        self,
+        input: dict[str, Any] | None = None,
+        options: UiPathExecuteOptions | None = None,
+    ) -> UiPathRuntimeResult:
+        input_model = self._sample_main.EmailInput(**(input or {}))
+        output = await self._sample_main.main(input_model)
+        return UiPathRuntimeResult(
+            output={"category": output.category},
+            status=UiPathRuntimeStatus.SUCCESSFUL,
+        )
+
+    async def stream(
+        self,
+        input: dict[str, Any] | None = None,
+        options: UiPathStreamOptions | None = None,
+    ) -> AsyncGenerator[UiPathRuntimeEvent, None]:
+        yield await self.execute(input, None)
+
+    async def get_schema(self) -> UiPathRuntimeSchema:
+        return UiPathRuntimeSchema(
+            filePath="main.py",
+            uniqueId="main",
+            type="agent",
+            input={
+                "type": "object",
+                "properties": {
+                    "email_subject": {"type": "string"},
+                    "email_body": {"type": "string"},
+                },
+            },
+            output={
+                "type": "object",
+                "properties": {"category": {"type": "string"}},
+            },
+        )
+
+    async def dispose(self) -> None:
+        pass
+
+
+class _SampleFactory:
+    def __init__(self, sample_main: Any) -> None:
+        self._sample_main = sample_main
+
+    def discover_entrypoints(self) -> list[str]:
+        return ["main"]
+
+    async def get_storage(self) -> UiPathRuntimeStorageProtocol | None:
+        return None
+
+    async def get_settings(self) -> UiPathRuntimeFactorySettings | None:
+        return None
+
+    async def new_runtime(
+        self, entrypoint: str, runtime_id: str, **kwargs: Any
+    ) -> UiPathRuntimeProtocol:
+        return _SampleRuntime(self._sample_main)
+
+    async def dispose(self) -> None:
+        pass
+
+
+async def _run_sample(sample_dir: Path) -> tuple[UiPathEvalOutput, dict[str, float]]:
+    """Run the sample's eval set and return (per-row output, evaluator_averages)."""
+    sample_main = _load_sample_main(sample_dir)
+    factory = _SampleFactory(sample_main)
+
+    eval_set_path = str(sample_dir / "evaluations" / "eval-sets" / "default.json")
+    evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path)
+    evaluators = await EvalHelpers.load_evaluators(
+        eval_set_path, evaluation_set, agent_model=None
+    )
+
+    runtime = await factory.new_runtime("main", "test-runtime-id")
+    runtime_schema = await runtime.get_schema()
+
+    context = UiPathEvalContext()
+    context.execution_id = str(uuid.uuid4())
+    context.evaluation_set = evaluation_set
+    context.runtime_schema = runtime_schema
+    context.evaluators = evaluators
+
+    result = await evaluate(
+        factory,
+        UiPathTraceManager(),
+        context,
+        EventBus(),
+    )
+
+    eval_output = UiPathEvalOutput.model_validate(result.output)
+    _, evaluator_averages = compute_evaluator_scores(
+        eval_output.evaluation_set_results, evaluators
+    )
+    return eval_output, evaluator_averages
+
+
+def _per_row_scores(output: UiPathEvalOutput) -> dict[str, float]:
+    return {
+        row.evaluation_name: row.evaluation_run_results[0].result.score
+        for row in output.evaluation_set_results
+    }
+
+
+async def test_binary_classification_sample_end_to_end():
+    """Binary spam classifier: 4/5 datapoints correct, but precision is 2/3 because of one FP."""
+    output, averages = await _run_sample(SAMPLES_DIR / "binary_classification_agent")
+
+    per_row = _per_row_scores(output)
+    assert per_row == {
+        "Spam: prize giveaway": 1.0,
+        "Spam: unsolicited promo": 1.0,
+        "Ham: legitimate invoice": 1.0,
+        "Ham: meeting request": 1.0,
+        "Ham mislabeled as spam (forces a false positive)": 0.0,
+    }
+    # Precision = TP / (TP + FP) = 2 / (2 + 1) = 0.6666...
+    assert averages["BinarySpamPrecision"] == pytest.approx(2 / 3, rel=1e-6)
+
+    # Dataset-level aggregators embedded on the evaluator config also fire.
+    # Each result keyed by "{evaluator_name}.{aggregator_type}".
+    keys = set(output.dataset_evaluator_results)
+    assert keys == {
+        "BinarySpamPrecision.precision",
+        "BinarySpamPrecision.recall",
+        "BinarySpamPrecision.fscore",
+    }
+    # Confusion matrix (predicted x expected, classes=[spam, ham]):
+    #   matrix[spam][spam] = 2  matrix[spam][ham] = 1  (the FP)
+    #   matrix[ham][spam]  = 0  matrix[ham][ham]  = 2
+    # per-class precision: spam = 2/3, ham = 1.0  → macro = (2/3 + 1) / 2 = 5/6
+    # per-class recall:    spam = 1.0, ham = 2/3  → macro = (1 + 2/3) / 2 = 5/6
+    # per-class F1:        spam = 0.8, ham = 0.8  → macro = 0.8
+    agg = output.dataset_evaluator_results
+    assert agg["BinarySpamPrecision.precision"].score == pytest.approx(5 / 6, rel=1e-6)
+    assert agg["BinarySpamPrecision.recall"].score == pytest.approx(5 / 6, rel=1e-6)
+    assert agg["BinarySpamPrecision.fscore"].score == pytest.approx(0.8, rel=1e-6)
+
+
+async def test_multiclass_classification_sample_end_to_end():
+    """Multiclass router: 6/7 correct, macro F1 = (0.8 + 0.8 + 1.0) / 3 = 0.8666..."""
+    output, averages = await _run_sample(
+        SAMPLES_DIR / "multiclass_classification_simple"
+    )
+
+    per_row = _per_row_scores(output)
+    assert per_row == {
+        "Payments: invoice reminder": 1.0,
+        "Payments: refund request": 1.0,
+        "Support: feature broken": 1.0,
+        "Support: how-to question": 1.0,
+        "Spam: prize giveaway": 1.0,
+        "Spam: marketing winner": 1.0,
+        "Support email accidentally routed to payments "
+        "(forces an FP for payments)": 0.0,
+    }
+    # payments F1=0.8 (P=2/3, R=1), support F1=0.8 (P=1, R=2/3), spam F1=1.0
+    # macro = mean = 2.6 / 3
+    assert averages["EmailMulticlassFScore"] == pytest.approx(2.6 / 3, rel=1e-6)
+
+    # Three embedded aggregators ran in addition to reduce_scores.
+    keys = set(output.dataset_evaluator_results)
+    assert keys == {
+        "EmailMulticlassFScore.precision",
+        "EmailMulticlassFScore.recall",
+        "EmailMulticlassFScore.fscore",
+    }
+    # The macro F1 computed by the embedded fscore aggregator should match
+    # reduce_scores' result (both walk the same confusion matrix).
+    fscore_result = output.dataset_evaluator_results["EmailMulticlassFScore.fscore"]
+    assert fscore_result.score == pytest.approx(2.6 / 3, rel=1e-6)
diff --git a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
new file mode 100644
index 000000000..69fbfda40
--- /dev/null
+++ b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
@@ -0,0 +1,528 @@
+"""Tests for dataset-level classification evaluators (Precision, Recall, FScore).
+
+Covers the math (2-class, 3-class, micro vs macro, F-beta), edge cases
+(empty input, out-of-vocab labels, malformed details), factory dispatch, and
+runtime-level routing where compute_dataset_evaluator_results walks
+per-datapoint evaluator configs' embedded ``aggregators`` lists.
+"""
+
+import uuid
+
+import pytest
+from pydantic import BaseModel
+
+from uipath.eval.evaluators._aggregator_specs import (
+    FScoreAggregatorSpec,
+    PrecisionAggregatorSpec,
+    RecallAggregatorSpec,
+)
+from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification
+from uipath.eval.evaluators.classification_dataset_evaluators import (
+    ClassificationDatasetEvaluator,
+    ClassificationDetails,
+)
+from uipath.eval.evaluators.dataset_evaluator_factory import build_dataset_evaluator
+from uipath.eval.evaluators.multiclass_classification_evaluator import (
+    MulticlassClassificationEvaluator,
+)
+from uipath.eval.models.models import (
+    EvaluationResultDto,
+    NumericEvaluationResult,
+)
+from uipath.eval.runtime._types import (
+    UiPathEvalRunResult,
+    UiPathEvalRunResultDto,
+)
+from uipath.eval.runtime.runtime import compute_dataset_evaluator_results
+
+
+def _result(
+    expected: str, actual: str, score: float | None = None
+) -> EvaluationResultDto:
+    """Build an EvaluationResultDto carrying an expected/actual justification."""
+    if score is None:
+        score = 1.0 if expected.lower() == actual.lower() else 0.0
+    justification = BaseEvaluatorJustification(expected=expected, actual=actual)
+    return EvaluationResultDto(
+        score=score,
+        details=justification.model_dump(),
+    )
+
+
+def _precision(
+    classes: list[str], averaging: str = "macro"
+) -> ClassificationDatasetEvaluator:
+    spec = PrecisionAggregatorSpec(classes=classes, averaging=averaging)  # type: ignore[arg-type]
+    return ClassificationDatasetEvaluator(spec, source_evaluator="intent_match")
+
+
+def _recall(
+    classes: list[str], averaging: str = "macro"
+) -> ClassificationDatasetEvaluator:
+    spec = RecallAggregatorSpec(classes=classes, averaging=averaging)  # type: ignore[arg-type]
+    return ClassificationDatasetEvaluator(spec, source_evaluator="intent_match")
+
+
+def _fscore(
+    classes: list[str], averaging: str = "macro", f_value: float = 1.0
+) -> ClassificationDatasetEvaluator:
+    spec = FScoreAggregatorSpec(
+        classes=classes,
+        averaging=averaging,  # type: ignore[arg-type]
+        f_value=f_value,
+    )
+    return ClassificationDatasetEvaluator(spec, source_evaluator="intent_match")
+
+
+def _details(result: object) -> ClassificationDetails:
+    """Type-narrowing helper for asserting on details."""
+    assert isinstance(result, NumericEvaluationResult)
+    assert isinstance(result.details, ClassificationDetails)
+    return result.details
+
+
+def _multiclass_evaluator(
+    name: str,
+    classes: list[str],
+    aggregators: list[BaseModel],
+) -> MulticlassClassificationEvaluator:
+    """Build a per-datapoint multiclass evaluator with embedded aggregators."""
+    return MulticlassClassificationEvaluator.model_validate(
+        {
+            "id": str(uuid.uuid4()),
+            "evaluatorConfig": {
+                "name": name,
+                "classes": classes,
+                "aggregators": [spec.model_dump(by_alias=True) for spec in aggregators],
+            },
+        }
+    )
+
+
+class TestPrecisionEvaluator:
+    def test_empty_input_returns_zeroed_result(self) -> None:
+        result = _precision(["cat", "dog"]).evaluate([])
+        assert isinstance(result, NumericEvaluationResult)
+        assert result.score == 0.0
+        d = _details(result)
+        assert d.n_total == 0 and d.n_scored == 0
+        assert d.confusion_matrix == [[0, 0], [0, 0]]
+        assert d.per_class["cat"].tp == 0
+        assert d.per_class["cat"].tn == 0
+
+    def test_confusion_matrix_is_predicted_by_expected(self) -> None:
+        # Pin the documented orientation: confusion_matrix[predicted][expected].
+        # Differs from sklearn's [true][predicted] convention.
+        results = [
+            _result("cat", "cat"),  # expected=cat, predicted=cat -> [cat][cat]
+            _result("cat", "dog"),  # expected=cat, predicted=dog -> [dog][cat]
+            _result("dog", "dog"),  # expected=dog, predicted=dog -> [dog][dog]
+            _result("dog", "dog"),
+        ]
+        d = _details(_precision(["cat", "dog"]).evaluate(results))
+        # classes -> index: cat=0, dog=1
+        # [predicted=cat][expected=cat] = 1
+        assert d.confusion_matrix[0][0] == 1
+        # [predicted=dog][expected=cat] = 1 (the FP for dog / FN for cat)
+        assert d.confusion_matrix[1][0] == 1
+        # [predicted=dog][expected=dog] = 2
+        assert d.confusion_matrix[1][1] == 2
+        # [predicted=cat][expected=dog] = 0
+        assert d.confusion_matrix[0][1] == 0
+
+    def test_precision_two_class_macro(self) -> None:
+        results = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("yes", "no"),
+            _result("no", "yes"),
+        ]
+        result = _precision(["yes", "no"], averaging="macro").evaluate(results)
+        d = _details(result)
+        # precision_yes = 2 / (2 + 1) = 2/3
+        # precision_no  = 0 / (0 + 1) = 0
+        # macro = (2/3 + 0) / 2 = 1/3
+        assert d.per_class["yes"].value == pytest.approx(2 / 3)
+        assert d.per_class["no"].value == pytest.approx(0.0)
+        assert d.macro == pytest.approx((2 / 3 + 0.0) / 2)
+        assert result.score == pytest.approx(d.macro)
+
+    def test_two_class_micro_equals_accuracy(self) -> None:
+        results = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("yes", "no"),
+            _result("no", "yes"),
+        ]
+        result = _precision(["yes", "no"], averaging="micro").evaluate(results)
+        d = _details(result)
+        assert d.micro == pytest.approx(0.5)
+        assert result.score == pytest.approx(0.5)
+
+    def test_three_class_macro(self) -> None:
+        pairs = [
+            ("cat", "cat"),
+            ("cat", "cat"),
+            ("cat", "dog"),
+            ("dog", "dog"),
+            ("dog", "dog"),
+            ("dog", "bird"),
+            ("bird", "bird"),
+            ("bird", "bird"),
+            ("bird", "cat"),
+        ]
+        result = _precision(["cat", "dog", "bird"], averaging="macro").evaluate(
+            [_result(e, a) for e, a in pairs]
+        )
+        d = _details(result)
+        for label in ("cat", "dog", "bird"):
+            m = d.per_class[label]
+            assert m.tp == 2 and m.fp == 1 and m.fn == 1 and m.tn == 5
+            assert m.value == pytest.approx(2 / 3)
+        assert d.macro == pytest.approx(2 / 3)
+        assert result.score == pytest.approx(2 / 3)
+
+
+class TestRecallEvaluator:
+    def test_recall_two_class_macro(self) -> None:
+        results = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("yes", "no"),
+            _result("no", "yes"),
+        ]
+        result = _recall(["yes", "no"], averaging="macro").evaluate(results)
+        d = _details(result)
+        assert d.per_class["yes"].value == pytest.approx(2 / 3)
+        assert d.per_class["no"].value == pytest.approx(0.0)
+        assert result.score == pytest.approx(1 / 3)
+
+    def test_recall_differs_from_precision(self) -> None:
+        results = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("no", "yes"),
+            _result("no", "yes"),
+            _result("no", "no"),
+        ]
+        p = _details(_precision(["yes", "no"], averaging="macro").evaluate(results))
+        r = _details(_recall(["yes", "no"], averaging="macro").evaluate(results))
+        assert p.per_class["yes"].value == pytest.approx(0.5)
+        assert p.per_class["no"].value == pytest.approx(1.0)
+        assert r.per_class["yes"].value == pytest.approx(1.0)
+        assert r.per_class["no"].value == pytest.approx(1 / 3)
+
+
+class TestFScoreEvaluator:
+    def test_f1_equals_harmonic_mean_of_p_and_r(self) -> None:
+        results = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("yes", "no"),
+            _result("no", "yes"),
+        ]
+        f = _details(
+            _fscore(["yes", "no"], averaging="macro", f_value=1.0).evaluate(results)
+        )
+        assert f.per_class["yes"].value == pytest.approx(2 / 3)
+        assert f.per_class["no"].value == pytest.approx(0.0)
+        assert f.macro == pytest.approx((2 / 3 + 0.0) / 2)
+
+    def test_f_beta_emphasizes_recall_when_beta_above_one(self) -> None:
+        results = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("no", "yes"),
+            _result("no", "yes"),
+            _result("no", "no"),
+        ]
+        f1 = _details(
+            _fscore(["yes", "no"], averaging="macro", f_value=1.0).evaluate(results)
+        )
+        f2 = _details(
+            _fscore(["yes", "no"], averaging="macro", f_value=2.0).evaluate(results)
+        )
+        assert f2.per_class["yes"].value > f1.per_class["yes"].value
+
+    def test_three_class_micro_pools_across_classes(self) -> None:
+        pairs = [
+            ("cat", "cat"),
+            ("cat", "cat"),
+            ("cat", "dog"),
+            ("dog", "dog"),
+            ("dog", "dog"),
+            ("dog", "bird"),
+            ("bird", "bird"),
+            ("bird", "bird"),
+            ("bird", "cat"),
+        ]
+        d = _details(
+            _fscore(["cat", "dog", "bird"], averaging="micro", f_value=1.0).evaluate(
+                [_result(e, a) for e, a in pairs]
+            )
+        )
+        assert d.micro == pytest.approx(6 / 9)
+
+
+class TestSkippingAndEdgeCases:
+    def test_out_of_vocab_labels_are_skipped(self) -> None:
+        results = [
+            _result("cat", "cat"),
+            _result("cat", "platypus"),
+            _result("zebra", "dog"),
+        ]
+        d = _details(_precision(["cat", "dog"]).evaluate(results))
+        assert d.n_total == 3 and d.n_scored == 1 and d.n_skipped == 2
+
+    def test_results_without_justification_are_skipped(self) -> None:
+        results = [
+            _result("cat", "cat"),
+            EvaluationResultDto(score=1.0, details="just a string"),
+            EvaluationResultDto(score=0.0, details={"unrelated": "shape"}),
+        ]
+        d = _details(_precision(["cat", "dog"]).evaluate(results))
+        assert d.n_total == 3 and d.n_scored == 1 and d.n_skipped == 2
+
+    def test_case_insensitive(self) -> None:
+        results = [_result("Cat", "CAT"), _result("DOG", "dog")]
+        d = _details(_precision(["cat", "dog"]).evaluate(results))
+        assert d.per_class["cat"].tp == 1
+        assert d.per_class["dog"].tp == 1
+
+
+class TestFactory:
+    """The factory now takes an AggregatorSpec instance + source name, not a dict."""
+
+    def test_builds_precision_from_spec(self) -> None:
+        spec = PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro")
+        evaluator = build_dataset_evaluator(spec, "intent_match")
+        assert isinstance(evaluator, ClassificationDatasetEvaluator)
+        assert evaluator.spec.type == "precision"
+        assert evaluator.source_evaluator == "intent_match"
+        assert evaluator.name == "intent_match.precision"
+
+    def test_builds_recall_from_spec(self) -> None:
+        spec = RecallAggregatorSpec(classes=["yes", "no"], averaging="micro")
+        evaluator = build_dataset_evaluator(spec, "intent_match")
+        assert isinstance(evaluator, ClassificationDatasetEvaluator)
+        assert evaluator.spec.type == "recall"
+        assert evaluator.name == "intent_match.recall"
+
+    def test_builds_fscore_from_spec(self) -> None:
+        spec = FScoreAggregatorSpec(
+            classes=["yes", "no"], averaging="macro", f_value=2.0
+        )
+        evaluator = build_dataset_evaluator(spec, "intent_match")
+        assert isinstance(evaluator, ClassificationDatasetEvaluator)
+        assert isinstance(evaluator.spec, FScoreAggregatorSpec)
+        assert evaluator.spec.f_value == 2.0
+
+
+class TestAggregatorSpecJsonRoundTrip:
+    """Pin the wire shape sent to the C# side."""
+
+    def test_precision_uses_self_contained_fields(self) -> None:
+        spec = PrecisionAggregatorSpec.model_validate(
+            {
+                "type": "precision",
+                "classes": ["book", "cancel", "reschedule"],
+                "averaging": "macro",
+            }
+        )
+        dumped = spec.model_dump(by_alias=True)
+        assert dumped == {
+            "type": "precision",
+            "classes": ["book", "cancel", "reschedule"],
+            "averaging": "macro",
+        }
+
+    def test_fscore_uses_camelcase_fvalue_on_wire(self) -> None:
+        spec = FScoreAggregatorSpec.model_validate(
+            {
+                "type": "fscore",
+                "classes": ["yes", "no"],
+                "averaging": "macro",
+                "fValue": 1.5,
+            }
+        )
+        assert spec.f_value == 1.5
+        dumped = spec.model_dump(by_alias=True)
+        assert dumped["fValue"] == 1.5
+        assert "f_value" not in dumped
+
+    def test_multiclass_evaluator_round_trips_aggregators(self) -> None:
+        """Per-datapoint evaluator config carries aggregators[]; survives dump+load."""
+        ev = _multiclass_evaluator(
+            "intent_classifier",
+            classes=["book", "cancel", "reschedule"],
+            aggregators=[
+                PrecisionAggregatorSpec(
+                    classes=["book", "cancel", "reschedule"], averaging="macro"
+                ),
+                FScoreAggregatorSpec(
+                    classes=["book", "cancel", "reschedule"],
+                    averaging="macro",
+                    f_value=1.0,
+                ),
+            ],
+        )
+        assert ev.evaluator_config.aggregators is not None
+        assert len(ev.evaluator_config.aggregators) == 2
+        assert ev.evaluator_config.aggregators[0].type == "precision"
+        assert ev.evaluator_config.aggregators[1].type == "fscore"
+
+
+class TestComputeDatasetEvaluatorResults:
+    """End-to-end: runtime walks evaluator configs' aggregators[]."""
+
+    def test_walks_aggregators_on_classification_evaluator(self) -> None:
+        """Each aggregator on the evaluator yields one `<evaluator>.<type>` result."""
+        source = _multiclass_evaluator(
+            "intent_match",
+            classes=["yes", "no"],
+            aggregators=[
+                PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro"),
+                RecallAggregatorSpec(classes=["yes", "no"], averaging="macro"),
+            ],
+        )
+
+        # dp1: one intent_match row plus a row from an evaluator we don't aggregate.
+        dp1 = UiPathEvalRunResult(
+            evaluation_name="dp1",
+            evaluation_run_results=[
+                UiPathEvalRunResultDto(
+                    evaluator_name="intent_match",
+                    evaluator_id=str(uuid.uuid4()),
+                    result=_result("yes", "yes"),
+                ),
+                UiPathEvalRunResultDto(
+                    evaluator_name="some_other_evaluator",
+                    evaluator_id=str(uuid.uuid4()),
+                    result=EvaluationResultDto(score=0.5),
+                ),
+            ],
+        )
+        # dp2: a single intent_match row whose two labels differ.
+        dp2 = UiPathEvalRunResult(
+            evaluation_name="dp2",
+            evaluation_run_results=[
+                UiPathEvalRunResultDto(
+                    evaluator_name="intent_match",
+                    evaluator_id=str(uuid.uuid4()),
+                    result=_result("yes", "no"),
+                ),
+            ],
+        )
+
+        aggregated = compute_dataset_evaluator_results([dp1, dp2], [source])
+        assert set(aggregated) == {"intent_match.precision", "intent_match.recall"}
+        precision = aggregated["intent_match.precision"]
+        assert isinstance(precision, EvaluationResultDto)
+        assert isinstance(precision.details, dict)
+        # Only the two intent_match rows count; the 0.5-score row is left out.
+        assert precision.details["n_scored"] == 2
+
+    def test_evaluator_without_aggregators_is_skipped(self) -> None:
+        """An evaluator with an empty aggregators list contributes no results."""
+        bare = _multiclass_evaluator(
+            "intent_match", classes=["yes", "no"], aggregators=[]
+        )
+        only_row = UiPathEvalRunResultDto(
+            evaluator_name="intent_match",
+            evaluator_id=str(uuid.uuid4()),
+            result=_result("yes", "yes"),
+        )
+        runs = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1", evaluation_run_results=[only_row]
+            )
+        ]
+        # Nothing to aggregate, so the mapping comes back empty.
+        aggregated = compute_dataset_evaluator_results(runs, [bare])
+        assert aggregated == {}
+
+    def test_line_by_line_subresults_are_excluded(self) -> None:
+        """Rows flagged ``is_line_result=True`` are not counted in ``n_scored``."""
+        source = _multiclass_evaluator(
+            "intent_match",
+            classes=["yes", "no"],
+            aggregators=[
+                PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro"),
+            ],
+        )
+        line_row = UiPathEvalRunResultDto(
+            evaluator_name="intent_match",
+            evaluator_id=str(uuid.uuid4()),
+            result=_result("yes", "yes"),
+            is_line_result=True,
+        )
+        plain_row = UiPathEvalRunResultDto(
+            evaluator_name="intent_match",
+            evaluator_id=str(uuid.uuid4()),
+            result=_result("no", "no"),
+        )
+        runs = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1", evaluation_run_results=[line_row, plain_row]
+            )
+        ]
+        aggregated = compute_dataset_evaluator_results(runs, [source])
+        details = aggregated["intent_match.precision"].details
+        assert isinstance(details, dict)
+        assert details["n_scored"] == 1
+
+    def test_source_with_no_results_produces_zeroed_report(self) -> None:
+        """No rows from the source evaluator gives a 0.0 score with n_scored == 0."""
+        source = _multiclass_evaluator(
+            "intent_match",
+            classes=["yes", "no"],
+            aggregators=[
+                PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro"),
+            ],
+        )
+        # The only recorded row belongs to a different evaluator.
+        foreign_row = UiPathEvalRunResultDto(
+            evaluator_name="some_other_evaluator",
+            evaluator_id=str(uuid.uuid4()),
+            result=EvaluationResultDto(score=1.0),
+        )
+        runs = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1", evaluation_run_results=[foreign_row]
+            )
+        ]
+        aggregated = compute_dataset_evaluator_results(runs, [source])
+        report = aggregated["intent_match.precision"]
+        assert report.score == 0.0
+        assert isinstance(report.details, dict)
+        assert report.details["n_scored"] == 0
+
+    def test_duplicate_aggregator_type_disambiguates_by_averaging(self) -> None:
+        """Repeated aggregator types are keyed with the averaging mode appended."""
+        macro = PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro")
+        micro = PrecisionAggregatorSpec(classes=["yes", "no"], averaging="micro")
+        source = _multiclass_evaluator(
+            "intent_match",
+            classes=["yes", "no"],
+            aggregators=[macro, micro],
+        )
+        single_row = UiPathEvalRunResultDto(
+            evaluator_name="intent_match",
+            evaluator_id=str(uuid.uuid4()),
+            result=_result("yes", "yes"),
+        )
+        runs = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1", evaluation_run_results=[single_row]
+            )
+        ]
+
+        aggregated = compute_dataset_evaluator_results(runs, [source])
+
+        # Both specs have type "precision"; the averaging suffix keeps their
+        # keys apart so neither result silently overwrites the other.
+        expected_keys = {
+            "intent_match.precision.macro",
+            "intent_match.precision.micro",
+        }
+        assert set(aggregated) == expected_keys
diff --git a/packages/uipath/tests/evaluators/test_evaluator_methods.py b/packages/uipath/tests/evaluators/test_evaluator_methods.py
index ec795499d..0083aeec0 100644
--- a/packages/uipath/tests/evaluators/test_evaluator_methods.py
+++ b/packages/uipath/tests/evaluators/test_evaluator_methods.py
@@ -2608,12 +2608,20 @@ async def test_multiclass_classification_invalid_expected_class(self) -> None:
 
     @pytest.mark.asyncio
     async def test_multiclass_classification_invalid_predicted_class(self) -> None:
-        """Test that an invalid predicted class returns an error result."""
+        """Out-of-vocab predicted class returns score=0.0, not an error.
+
+        Mirrors binary classification's soft-fail behavior so a sloppy LLM
+        returning "fish" doesn't crash the whole eval set. The dataset
+        evaluator's confusion matrix counts the OOV prediction under
+        ``n_skipped``. Configuration errors (expected_class outside vocab)
+        still raise; only predicted_class is soft.
+        """
+        from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification
         from uipath.eval.evaluators.multiclass_classification_evaluator import (
             MulticlassClassificationEvaluationCriteria,
             MulticlassClassificationEvaluator,
         )
-        from uipath.eval.models.models import ErrorEvaluationResult
+        from uipath.eval.models import NumericEvaluationResult
 
         execution = AgentExecution(
             agent_input={},
@@ -2630,5 +2638,148 @@ async def test_multiclass_classification_invalid_predicted_class(self) -> None:
         )
         criteria = MulticlassClassificationEvaluationCriteria(expected_class="cat")
         result = await evaluator.evaluate(execution, criteria)
-        assert isinstance(result, ErrorEvaluationResult)
+        assert isinstance(result, NumericEvaluationResult)
         assert result.score == 0.0
+        assert isinstance(result.details, BaseEvaluatorJustification)
+        assert result.details.actual == "fish"
+        assert result.details.expected == "cat"
+
+
+class TestClassificationConfigCrossValidators:
+    """Pydantic validators that catch internally-inconsistent classification configs.
+
+    Without these validators, a config with ``positive_class="yes"`` but an
+    aggregator declaring ``classes=["spam","ham"]`` silently scores against
+    completely disjoint label spaces — the per-evaluator headline and the
+    aggregator's confusion matrix both return numbers, neither one meaningful.
+    """
+
+    def test_binary_aggregator_missing_positive_class_rejected(self) -> None:
+        """Validation fails when an aggregator's classes omit ``positive_class``."""
+        from uipath.eval.evaluators.binary_classification_evaluator import (
+            BinaryClassificationEvaluator,
+        )
+
+        aggregator = {
+            "type": "precision",
+            "classes": ["other", "ham"],  # "spam" is intentionally missing
+            "averaging": "macro",
+        }
+        config = {
+            "name": "SpamPrecision",
+            "positive_class": "spam",
+            "metric_type": "precision",
+            "aggregators": [aggregator],
+        }
+        payload = {"evaluatorConfig": config, "id": str(uuid.uuid4())}
+
+        with pytest.raises(Exception) as raised:
+            BinaryClassificationEvaluator.model_validate(payload)
+        # The error text is expected to name the offending field.
+        assert "positive_class" in str(raised.value)
+
+    def test_binary_aggregator_fvalue_mismatch_rejected(self) -> None:
+        """An aggregator ``f_value`` that differs from the evaluator's is rejected."""
+        from uipath.eval.evaluators.binary_classification_evaluator import (
+            BinaryClassificationEvaluator,
+        )
+
+        # Evaluator-level f_value is 1.0; the fscore aggregator asks for 2.0.
+        aggregator = {
+            "type": "fscore",
+            "classes": ["spam", "ham"],
+            "averaging": "macro",
+            "f_value": 2.0,
+        }
+        config = {
+            "name": "SpamFScore",
+            "positive_class": "spam",
+            "metric_type": "f-score",
+            "f_value": 1.0,
+            "aggregators": [aggregator],
+        }
+        payload = {"evaluatorConfig": config, "id": str(uuid.uuid4())}
+        with pytest.raises(Exception) as raised:
+            BinaryClassificationEvaluator.model_validate(payload)
+        assert "f_value" in str(raised.value)
+
+    def test_multiclass_aggregator_missing_class_rejected(self) -> None:
+        """Validation fails when an aggregator leaves out an evaluator class."""
+        from uipath.eval.evaluators.multiclass_classification_evaluator import (
+            MulticlassClassificationEvaluator,
+        )
+
+        aggregator = {
+            "type": "fscore",
+            "classes": ["book", "cancel"],  # "reschedule" intentionally omitted
+            "averaging": "macro",
+            "f_value": 1.0,
+        }
+        config = {
+            "name": "IntentClassifier",
+            "classes": ["book", "cancel", "reschedule"],
+            "metric_type": "f-score",
+            "averaging": "macro",
+            "aggregators": [aggregator],
+        }
+        payload = {"evaluatorConfig": config, "id": str(uuid.uuid4())}
+
+        with pytest.raises(Exception) as raised:
+            MulticlassClassificationEvaluator.model_validate(payload)
+        # The error text is expected to name the class that was left out.
+        assert "reschedule" in str(raised.value)
+
+    def test_multiclass_aggregator_averaging_mismatch_rejected(self) -> None:
+        """A same-type aggregator with a different ``averaging`` is rejected."""
+        from uipath.eval.evaluators.multiclass_classification_evaluator import (
+            MulticlassClassificationEvaluator,
+        )
+
+        aggregator = {
+            "type": "precision",
+            "classes": ["book", "cancel"],
+            "averaging": "micro",  # evaluator-level averaging below is "macro"
+        }
+        config = {
+            "name": "IntentClassifier",
+            "classes": ["book", "cancel"],
+            "metric_type": "precision",
+            "averaging": "macro",
+            "aggregators": [aggregator],
+        }
+        payload = {"evaluatorConfig": config, "id": str(uuid.uuid4())}
+
+        with pytest.raises(Exception) as raised:
+            MulticlassClassificationEvaluator.model_validate(payload)
+        assert "averaging" in str(raised.value)
+
+    def test_binary_aggregator_unrelated_type_does_not_cross_check(self) -> None:
+        """An aggregator of a different ``type`` than ``metric_type`` is accepted.
+
+        Its f_value / averaging are not compared with the evaluator's; only the
+        positive_class containment rule applies.
+        """
+        from uipath.eval.evaluators.binary_classification_evaluator import (
+            BinaryClassificationEvaluator,
+        )
+
+        # The evaluator headline is precision, so an fscore aggregator with a
+        # different f_value (2.0 vs the evaluator's 1.0) should be allowed.
+        aggregator = {
+            "type": "fscore",
+            "classes": ["spam", "ham"],
+            "averaging": "macro",
+            "f_value": 2.0,
+        }
+        config = {
+            "name": "SpamPrecision",
+            "positive_class": "spam",
+            "metric_type": "precision",
+            "f_value": 1.0,
+            "aggregators": [aggregator],
+        }
+        payload = {"evaluatorConfig": config, "id": str(uuid.uuid4())}
+        evaluator = BinaryClassificationEvaluator.model_validate(payload)
+        # `is not None` alone would also pass for an empty list; require that
+        # the single configured aggregator was actually retained.
+        assert len(evaluator.evaluator_config.aggregators or []) == 1
diff --git a/packages/uipath/uv.lock b/packages/uipath/uv.lock
index 86f8936e1..bd7f1f86e 100644
--- a/packages/uipath/uv.lock
+++ b/packages/uipath/uv.lock
@@ -2552,7 +2552,7 @@ wheels = [
 
 [[package]]
 name = "uipath"
-version = "2.11.5"
+version = "2.11.6"
 source = { editable = "." }
 dependencies = [
     { name = "applicationinsights" },