UiPath · ajay-kesavan · May 20, 2026 · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/packages/uipath/examples/dataset_evaluators_demo.py b/packages/uipath/examples/dataset_evaluators_demo.py
@@ -0,0 +1,237 @@
+"""Runnable proof that the dataset-level evaluators work on realistic data.
+
+Five scenarios exercise the framework end-to-end at the SDK layer (no
+worker, no backend). Each prints the headline score plus the full
+details payload as JSON, so the math is inspectable rather than a
+binary pass/fail signal.
+
+Run::
+
+    cd packages/uipath
+    uv run python examples/dataset_evaluators_demo.py
+"""
+
+from __future__ import annotations
+
+from typing import Iterable
+
+from uipath.eval.evaluators._aggregator_specs import (
+    FScoreAggregatorSpec,
+    PrecisionAggregatorSpec,
+    RecallAggregatorSpec,
+)
+from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification
+from uipath.eval.evaluators.classification_dataset_evaluators import (
+    ClassificationDetails,
+)
+from uipath.eval.evaluators.dataset_evaluator_factory import build_dataset_evaluator
+from uipath.eval.models.models import (
+    EvaluationResult,
+    EvaluationResultDto,
+    NumericEvaluationResult,
+)
+
+# ─── helpers ──────────────────────────────────────────────────────────────────
+
+
+def make_result(expected: str, actual: str) -> EvaluationResultDto:
+    """Wrap one (expected, actual) label pair in an EvaluationResultDto.
+
+    The score is 1.0 when the two labels are equal after lower-casing and 0.0
+    otherwise; the labels themselves travel, unmodified, in ``details`` as a
+    dumped BaseEvaluatorJustification.
+    """
+    labels_match = expected.lower() == actual.lower()
+    details = BaseEvaluatorJustification(expected=expected, actual=actual).model_dump()
+    return EvaluationResultDto(score=1.0 if labels_match else 0.0, details=details)
+
+
+def materialize_pairs(pairs: Iterable[tuple[str, str]]) -> list[EvaluationResultDto]:
+    """Run make_result over every (expected, actual) pair, preserving order."""
+    return [make_result(expected, actual) for expected, actual in pairs]
+
+
+def print_header(title: str) -> None:
+    """Print a blank line, then *title* framed by two 78-character rules."""
+    rule = "═" * 78
+    print()
+    print(rule)
+    print(f" {title}", rule, sep="\n")
+
+
+def report(
+    title: str,
+    result: EvaluationResult,
+    *,
+    show_json_tail: bool = False,  # accepted for call-site compat but ignored; JSON always prints
+) -> None:
+    """Print the title banner, the 4-decimal score, and ``details`` as aliased JSON."""
+    _ = show_json_tail
+    print_header(title)
+    assert isinstance(result, NumericEvaluationResult)
+    assert isinstance(result.details, ClassificationDetails)
+    print(f"  headline score = {result.score:.4f}")
+    print(result.details.model_dump_json(indent=2, by_alias=True))
+
+
+# ─── scenarios ────────────────────────────────────────────────────────────────
+
+
+def scenario_1_balanced_three_class() -> None:
+    """Macro precision over 9 intents; every class is 2 right, 1 wrong."""
+    classes = ["book", "cancel", "reschedule"]
+    pairs = [
+        ("book", "book"),
+        ("book", "book"),
+        ("book", "cancel"),
+        ("cancel", "cancel"),
+        ("cancel", "cancel"),
+        ("cancel", "reschedule"),
+        ("reschedule", "reschedule"),
+        ("reschedule", "reschedule"),
+        ("reschedule", "book"),
+    ]
+    evaluator = build_dataset_evaluator(
+        PrecisionAggregatorSpec(classes=classes, averaging="macro"),
+        source_evaluator="intent_match",
+    )
+    title = (
+        "Scenario 1 — Balanced 3-class (intent recognition)\n"
+        "  Each class: 2 TP, 1 FP, 1 FN. Symmetric setup → macro = micro = 2/3."
+    )
+    report(title, evaluator.evaluate(materialize_pairs(pairs)), show_json_tail=True)
+
+
+def scenario_2_imbalanced_two_class() -> None:
+    """Score one rare-positive dataset with macro, then micro, precision."""
+    # 16 expected-negative rows (13 predicted negative, 3 predicted positive),
+    # then 4 expected-positive rows (2 predicted positive, 2 predicted negative).
+    pairs = (
+        [("negative", "negative")] * 13
+        + [("negative", "positive")] * 3
+        + [("positive", "positive")] * 2
+        + [("positive", "negative")] * 2
+    )
+    results = materialize_pairs(pairs)
+    labels = ["positive", "negative"]
+
+    # Same classes and source evaluator for both; only the averaging differs.
+    macro_spec = PrecisionAggregatorSpec(classes=labels, averaging="macro")
+    micro_spec = PrecisionAggregatorSpec(classes=labels, averaging="micro")
+    macro_eval = build_dataset_evaluator(macro_spec, source_evaluator="positive_match")
+    micro_eval = build_dataset_evaluator(micro_spec, source_evaluator="positive_match")
+
+    report(
+        "Scenario 2a — Imbalanced 2-class, MACRO precision\n"
+        "  Rare positive class. Macro averages per-class, so the rare class\n"
+        "  having precision = 2/(2+3) = 0.40 drags the score down.",
+        macro_eval.evaluate(results),
+    )
+    report(
+        "Scenario 2b — Same data, MICRO precision\n"
+        "  Pools TP/FP across classes. In a 2-class case this equals accuracy.",
+        micro_eval.evaluate(results),
+    )
+
+
+def scenario_3_precision_vs_recall_vs_f() -> None:
+    """Score one yes/no dataset with macro precision, recall, F1 and F2."""
+    # (expected, actual) counts under the standard definitions:
+    #   'yes': TP=2, FP=2, FN=1 → precision 1/2, recall 2/3
+    #   'no':  TP=2, FP=1, FN=2 → precision 2/3, recall 1/2
+    pairs = (
+        [("yes", "yes")] * 2
+        + [("no", "yes")] * 2
+        + [("no", "no")] * 2
+        + [("yes", "no")]
+    )
+    results = materialize_pairs(pairs)
+    classes = ["yes", "no"]
+
+    evaluators = {
+        "Scenario 3a — Precision on a recall-favourable dataset": build_dataset_evaluator(
+            PrecisionAggregatorSpec(classes=classes, averaging="macro"),
+            source_evaluator="yes_match",
+        ),
+        "Scenario 3b — Recall (same data — note 'yes' recall is 2/3)": build_dataset_evaluator(
+            RecallAggregatorSpec(classes=classes, averaging="macro"),
+            source_evaluator="yes_match",
+        ),
+        "Scenario 3c — F1 (harmonic mean of P and R)": build_dataset_evaluator(
+            FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=1.0),
+            source_evaluator="yes_match",
+        ),
+        "Scenario 3d — F2 (β=2 weighs recall higher — score moves toward recall)": build_dataset_evaluator(
+            FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=2.0),
+            source_evaluator="yes_match",
+        ),
+    }
+    for title, evaluator in evaluators.items():
+        report(title, evaluator.evaluate(results))
+
+
+def scenario_4_skipped_datapoints() -> None:
+    """Mix in-vocabulary matches with unknown labels and non-justification details."""
+    label_pairs = [
+        ("cat", "cat"),
+        ("dog", "dog"),
+        ("cat", "platypus"),  # actual label is not in classes
+        ("zebra", "cat"),  # expected label is not in classes
+    ]
+    results = materialize_pairs(label_pairs) + [
+        EvaluationResultDto(score=1.0, details="bare string — no justification"),
+        EvaluationResultDto(score=0.0, details={"unrelated": "shape"}),
+    ]
+    spec = PrecisionAggregatorSpec(classes=["cat", "dog"], averaging="macro")
+    evaluator = build_dataset_evaluator(spec, source_evaluator="any_match")
+    title = (
+        "Scenario 4 — Skipped datapoints (out-of-vocab + malformed details)\n"
+        "  6 datapoints in, 2 scored, 4 skipped. Skip counts surface in the\n"
+        "  report so you can tell whether a low score is a real signal or\n"
+        "  just sparse data."
+    )
+    report(title, evaluator.evaluate(results))
+
+
+def scenario_5_realistic_intent_classifier() -> None:
+    """Macro F1 over a four-class intent dataset with lopsided class sizes."""
+    classes = ["book", "cancel", "reschedule", "modify"]
+    # 25 (expected, actual) rows; expected-label counts are
+    # book 11, cancel 8, reschedule 4, modify 2.
+    pairs = (
+        [("book", "book")] * 10
+        + [("book", "cancel")]
+        + [("cancel", "cancel")] * 6
+        + [("cancel", "book")]
+        + [("cancel", "modify")]
+        + [("reschedule", "reschedule")] * 2
+        + [("reschedule", "modify")] * 2
+        + [("modify", "modify")]
+        + [("modify", "reschedule")]
+    )
+    results = materialize_pairs(pairs)
+
+    spec = FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=1.0)
+    evaluator = build_dataset_evaluator(spec, source_evaluator="intent_match")
+
+    title = (
+        "Scenario 5 — Realistic 4-class intent classifier\n"
+        "  Uneven per-class performance. Macro F1 surfaces 'reschedule' and\n"
+        "  'modify' weakness; micro F1 would have hidden it under 'book' wins."
+    )
+    report(title, evaluator.evaluate(results))
+
+
+def main() -> None:
+    """Execute scenarios 1 through 5 in order, then print a closing line."""
+    scenario_1_balanced_three_class()
+    scenario_2_imbalanced_two_class()
+    scenario_3_precision_vs_recall_vs_f()
+    scenario_4_skipped_datapoints()
+    scenario_5_realistic_intent_classifier()
+    # The empty first argument yields the blank line before the sign-off.
+    print("", "Done. All scenarios computed from real evaluator code.", sep="\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/packages/uipath/pyproject.toml b/packages/uipath/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "uipath"
-version = "2.11.5"
+version = "2.11.6"
 description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.11"

diff --git a/packages/uipath/samples/binary_classification_agent/bindings.json b/packages/uipath/samples/binary_classification_agent/bindings.json
@@ -0,0 +1,4 @@
+{
+  "version": "2.0",
+  "resources": []
+}
diff --git a/packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json b/packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json
@@ -0,0 +1,63 @@
+{
+  "version": "1.0",
+  "id": "SpamBinaryEval",
+  "name": "Binary spam classifier — precision",
+  "evaluatorRefs": ["BinarySpamPrecision"],
+  "evaluations": [
+    {
+      "id": "spam-prize",
+      "name": "Spam: prize giveaway",
+      "inputs": {
+        "email_subject": "You won a FREE iPhone!!!",
+        "email_body": "Congratulations! Click here to claim your prize now."
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "spam" }
+      }
+    },
+    {
+      "id": "spam-promo",
+      "name": "Spam: unsolicited promo",
+      "inputs": {
+        "email_subject": "Winner of the monthly drawing",
+        "email_body": "You've been selected. Click here to redeem."
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "spam" }
+      }
+    },
+    {
+      "id": "ham-invoice",
+      "name": "Ham: legitimate invoice",
+      "inputs": {
+        "email_subject": "Your March invoice is ready",
+        "email_body": "Your monthly invoice of $45.99 is attached. Payment is due March 15."
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "ham" }
+      }
+    },
+    {
+      "id": "ham-meeting",
+      "name": "Ham: meeting request",
+      "inputs": {
+        "email_subject": "Sync on Q2 planning",
+        "email_body": "Can we meet Wednesday at 2pm to align on next quarter's roadmap?"
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "ham" }
+      }
+    },
+    {
+      "id": "ham-mislabeled",
+      "name": "Ham misclassified as spam (forces a false positive)",
+      "inputs": {
+        "email_subject": "Free coffee in the break room!!!",
+        "email_body": "Just a heads up — the new espresso machine is set up."
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "ham" }
+      }
+    }
+  ]
+}
diff --git a/...ath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json b/...ath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json
@@ -0,0 +1,34 @@
+{
+  "version": "1.0",
+  "id": "BinarySpamPrecision",
+  "description": "Precision on the 'spam' positive class, plus run-level aggregators",
+  "evaluatorTypeId": "uipath-binary-classification",
+  "evaluatorConfig": {
+    "name": "BinarySpamPrecision",
+    "targetOutputKey": "category",
+    "positiveClass": "spam",
+    "metricType": "precision",
+    "fValue": 1.0,
+    "defaultEvaluationCriteria": {
+      "expectedClass": "ham"
+    },
+    "aggregators": [
+      {
+        "type": "precision",
+        "classes": ["spam", "ham"],
+        "averaging": "macro"
+      },
+      {
+        "type": "recall",
+        "classes": ["spam", "ham"],
+        "averaging": "macro"
+      },
+      {
+        "type": "fscore",
+        "classes": ["spam", "ham"],
+        "averaging": "macro",
+        "fValue": 1.0
+      }
+    ]
+  }
+}
diff --git a/packages/uipath/samples/binary_classification_agent/main.py b/packages/uipath/samples/binary_classification_agent/main.py
@@ -0,0 +1,39 @@
+"""Rule-based spam/ham classifier demonstrating the binary classification evaluator."""
+
+from dataclasses import dataclass
+
+from uipath.tracing import traced
+
+SPAMMY_TOKENS = {  # lower-case substrings that classify_email treats as spam markers
+    "free",
+    "winner",
+    "congratulations",
+    "click here",  # two words: matches only this exact substring, single space included
+    "prize",
+    "!!!",
+}
+
+
+@dataclass
+class EmailInput:  # payload main() receives; both fields are passed to classify_email
+    email_subject: str
+    email_body: str
+
+
+@dataclass
+class Classification:  # value returned by main()
+    category: str  # 'spam' or 'ham', as produced by classify_email
+
+
+@traced(name="classify_email", span_type="tool")
+def classify_email(subject: str, body: str) -> str:
+    """Return 'spam' when the lower-cased "subject body" text contains any token."""
+    lowered = f"{subject} {body}".lower()
+    return "ham" if all(marker not in lowered for marker in SPAMMY_TOKENS) else "spam"
+
+
+@traced()
+async def main(input: EmailInput) -> Classification:
+    """Run classify_email on the input's subject and body and wrap the label."""
+    subject, body = input.email_subject, input.email_body
+    return Classification(category=classify_email(subject, body))
diff --git a/packages/uipath/samples/binary_classification_agent/pyproject.toml b/packages/uipath/samples/binary_classification_agent/pyproject.toml
@@ -0,0 +1,9 @@
+[project]
+name = "binary-classification-agent"
+version = "0.0.1"
+description = "Rule-based spam/ham classifier demonstrating the binary classification evaluator"
+requires-python = ">=3.11"
+dependencies = ["uipath"]
+
+[dependency-groups]
+dev = ["uipath-dev"]