Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
6b11767
feat(eval): add evaluator type schemas for classification evaluators
ajay-kesavan May 20, 2026
037b60c
test(eval): add e2e tests + sample projects for classification evalua…
ajay-kesavan May 20, 2026
5e574f1
feat(eval): add dataset-level evaluator framework with precision/reca…
ajay-kesavan May 20, 2026
d6b7ab5
docs(eval): add runnable dataset evaluator demo + bump uv.lock for 2.…
ajay-kesavan May 20, 2026
e9ba8aa
Merge remote-tracking branch 'origin/main' into feat/eval-dataset-eva…
ajay-kesavan Jun 19, 2026
46c24e1
Merge remote-tracking branch 'origin/main' into feat/classification-e…
ajay-kesavan Jun 19, 2026
fb091e4
refactor(eval): embed aggregator specs in per-datapoint evaluator con…
ajay-kesavan Jun 19, 2026
d4e06b1
Merge branch 'feat/eval-dataset-evaluators' into feat/classification-…
ajay-kesavan Jun 19, 2026
77fcc10
feat(eval): wire sample classification evaluators to embedded aggrega…
ajay-kesavan Jun 19, 2026
c0436a3
refactor(eval): apply ponytail-review cleanup
ajay-kesavan Jun 19, 2026
05f6697
Merge branch 'feat/eval-dataset-evaluators' into feat/classification-…
ajay-kesavan Jun 19, 2026
50c64f4
refactor(eval): apply ponytail-review cleanup (justification + demo)
ajay-kesavan Jun 19, 2026
ad32c22
fix(eval): address adversarial-review feedback on dataset evaluators
ajay-kesavan Jun 19, 2026
cbbaf5f
Merge branch 'feat/eval-dataset-evaluators' into feat/classification-…
ajay-kesavan Jun 19, 2026
027901c
fix(eval): address adversarial-review feedback on classification samples
ajay-kesavan Jun 19, 2026
4d6afcc
fix(eval): address codex P1 + lint failures on dataset evaluators
ajay-kesavan Jun 19, 2026
c347fc7
Merge branch 'feat/eval-dataset-evaluators' into feat/classification-…
ajay-kesavan Jun 19, 2026
5d78205
test(eval): drop fscore-duplicate test that conflicts with #1663 H2 v…
ajay-kesavan Jun 19, 2026
363855d
fix(eval): publish aggregators in classification evaluator type schemas
ajay-kesavan Jun 19, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
237 changes: 237 additions & 0 deletions packages/uipath/examples/dataset_evaluators_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
"""Runnable proof that the dataset-level evaluators work on realistic data.

Five scenarios exercise the framework end-to-end at the SDK layer (no
worker, no backend). Each prints the headline score plus the full
``ClassificationDetails`` payload as JSON, so the math is inspectable
rather than a binary pass/fail signal.

Run::

    cd packages/uipath
    uv run python examples/dataset_evaluators_demo.py
"""

from __future__ import annotations

from typing import Iterable

from uipath.eval.evaluators._aggregator_specs import (
FScoreAggregatorSpec,
PrecisionAggregatorSpec,
RecallAggregatorSpec,
)
from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification
from uipath.eval.evaluators.classification_dataset_evaluators import (
ClassificationDetails,
)
from uipath.eval.evaluators.dataset_evaluator_factory import build_dataset_evaluator
from uipath.eval.models.models import (
EvaluationResult,
EvaluationResultDto,
NumericEvaluationResult,
)

# ─── helpers ──────────────────────────────────────────────────────────────────


def make_result(expected: str, actual: str) -> EvaluationResultDto:
    """Fabricate the per-datapoint result of an upstream classification evaluator.

    The score is 1.0 when the two labels are equal ignoring case and 0.0
    otherwise. Both labels are carried in ``details`` as a dumped
    ``BaseEvaluatorJustification``.
    """
    labels_agree = expected.lower() == actual.lower()
    details = BaseEvaluatorJustification(expected=expected, actual=actual).model_dump()
    return EvaluationResultDto(score=1.0 if labels_agree else 0.0, details=details)


def materialize_pairs(pairs: Iterable[tuple[str, str]]) -> list[EvaluationResultDto]:
    """Convert (expected, actual) label pairs into per-datapoint results, in order."""
    datapoints: list[EvaluationResultDto] = []
    for expected_label, actual_label in pairs:
        datapoints.append(make_result(expected_label, actual_label))
    return datapoints


def print_header(title: str) -> None:
    """Write a blank line, then *title* framed between two 78-character rules."""
    rule = "═" * 78
    # The leading empty string yields the blank separator line before the banner.
    for line in ("", rule, f" {title}", rule):
        print(line)


def report(
    title: str,
    result: EvaluationResult,
    *,
    show_json_tail: bool = False,  # kept for call-site compat; payload is always emitted
) -> None:
    """Print one scenario: banner, headline score, then the details as JSON.

    ``show_json_tail`` is accepted but has no effect; the JSON payload is
    printed unconditionally. The banner is printed before the type checks,
    so a failing check still leaves the title on stdout.
    """
    del show_json_tail  # intentionally unused
    print_header(title)
    assert isinstance(result, NumericEvaluationResult)
    details = result.details
    assert isinstance(details, ClassificationDetails)
    headline = f" headline score = {result.score:.4f}"
    print(headline)
    payload = details.model_dump_json(indent=2, by_alias=True)
    print(payload)


# ─── scenarios ────────────────────────────────────────────────────────────────


def scenario_1_balanced_three_class() -> None:
    """Macro precision on a symmetric book/cancel/reschedule intent dataset.

    Each expected class contributes two correct predictions followed by one
    prediction of a different class.
    """
    pairs = (
        [("book", "book")] * 2
        + [("book", "cancel")]
        + [("cancel", "cancel")] * 2
        + [("cancel", "reschedule")]
        + [("reschedule", "reschedule")] * 2
        + [("reschedule", "book")]
    )
    precision_spec = PrecisionAggregatorSpec(
        classes=["book", "cancel", "reschedule"], averaging="macro"
    )
    intent_evaluator = build_dataset_evaluator(
        precision_spec, source_evaluator="intent_match"
    )
    title = (
        "Scenario 1 — Balanced 3-class (intent recognition)\n"
        " Each class: 2 TP, 1 FP, 1 FN. Symmetric setup → macro = micro = 2/3."
    )
    outcome = intent_evaluator.evaluate(materialize_pairs(pairs))
    report(title, outcome, show_json_tail=True)


def scenario_2_imbalanced_two_class() -> None:
    """Contrast macro and micro precision on a dataset with a rare positive class."""
    pairs: list[tuple[str, str]] = (
        [("negative", "negative")] * 13
        + [("negative", "positive")] * 3
        + [("positive", "positive")] * 2
        + [("positive", "negative")] * 2
    )
    datapoints = materialize_pairs(pairs)
    labels = ["positive", "negative"]

    # Both evaluators are built up front and then fed the same datapoints.
    macro_precision = build_dataset_evaluator(
        PrecisionAggregatorSpec(classes=labels, averaging="macro"),
        source_evaluator="positive_match",
    )
    micro_precision = build_dataset_evaluator(
        PrecisionAggregatorSpec(classes=labels, averaging="micro"),
        source_evaluator="positive_match",
    )

    macro_title = (
        "Scenario 2a — Imbalanced 2-class, MACRO precision\n"
        " Rare positive class. Macro averages per-class, so the rare class\n"
        " having precision = 2/(2+3) = 0.40 drags the score down."
    )
    report(macro_title, macro_precision.evaluate(datapoints))

    micro_title = (
        "Scenario 2b — Same data, MICRO precision\n"
        " Pools TP/FP across classes. In a 2-class case this equals accuracy."
    )
    report(micro_title, micro_precision.evaluate(datapoints))


def scenario_3_precision_vs_recall_vs_f() -> None:
    """Run macro precision, recall, F1 and F2 over one shared dataset.

    Confusion counts implied by the pairs below, using the standard
    definitions (precision = TP / (TP + FP), recall = TP / (TP + FN)):

    * ``yes``: TP=2, FP=2, FN=1 → precision 1/2, recall 2/3
    * ``no``:  TP=2, FP=1, FN=2 → precision 2/3, recall 1/2

    The per-class figures therefore differ from metric to metric, which is
    what the printed details are meant to make visible.
    """
    pairs = [
        ("yes", "yes"),
        ("yes", "yes"),
        ("no", "yes"),
        ("no", "yes"),
        ("no", "no"),
        ("no", "no"),
        ("yes", "no"),  # the single missed 'yes' — keeps 'yes' recall below 1.0
    ]
    results = materialize_pairs(pairs)
    classes = ["yes", "no"]

    # Keyed by banner title; dict insertion order fixes the print order.
    evaluators = {
        "Scenario 3a — Precision on a recall-favourable dataset": build_dataset_evaluator(
            PrecisionAggregatorSpec(classes=classes, averaging="macro"),
            source_evaluator="yes_match",
        ),
        # Title previously claimed 'yes' recall is 1.0, which contradicts the
        # ("yes", "no") datapoint above: 2 of 3 expected 'yes' are recovered.
        "Scenario 3b — Recall (same data — note 'yes' recall is 2/3)": build_dataset_evaluator(
            RecallAggregatorSpec(classes=classes, averaging="macro"),
            source_evaluator="yes_match",
        ),
        "Scenario 3c — F1 (harmonic mean of P and R)": build_dataset_evaluator(
            FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=1.0),
            source_evaluator="yes_match",
        ),
        "Scenario 3d — F2 (β=2 weighs recall higher — score moves toward recall)": build_dataset_evaluator(
            FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=2.0),
            source_evaluator="yes_match",
        ),
    }
    for title, evaluator in evaluators.items():
        report(title, evaluator.evaluate(results))


def scenario_4_skipped_datapoints() -> None:
    """Feed the evaluator a mix of usable, out-of-vocabulary and malformed datapoints."""
    # Well-formed results first; two of them use labels outside ["cat", "dog"].
    labelled = materialize_pairs(
        [
            ("cat", "cat"),
            ("dog", "dog"),
            ("cat", "platypus"),
            ("zebra", "cat"),
        ]
    )
    # Results whose details are not an expected/actual justification payload.
    malformed = [
        EvaluationResultDto(score=1.0, details="bare string — no justification"),
        EvaluationResultDto(score=0.0, details={"unrelated": "shape"}),
    ]
    datapoints = labelled + malformed

    precision_spec = PrecisionAggregatorSpec(classes=["cat", "dog"], averaging="macro")
    evaluator = build_dataset_evaluator(precision_spec, source_evaluator="any_match")
    title = (
        "Scenario 4 — Skipped datapoints (out-of-vocab + malformed details)\n"
        " 6 datapoints in, 2 scored, 4 skipped. Skip counts surface in the\n"
        " report so you can tell whether a low score is a real signal or\n"
        " just sparse data."
    )
    report(title, evaluator.evaluate(datapoints))


def scenario_5_realistic_intent_classifier() -> None:
    """Macro F1 over a larger four-class intent dataset with uneven class sizes."""
    pairs = (
        [("book", "book")] * 10
        + [("book", "cancel")]
        + [("cancel", "cancel")] * 6
        + [("cancel", "book"), ("cancel", "modify")]
        + [("reschedule", "reschedule")] * 2
        + [("reschedule", "modify")] * 2
        + [("modify", "modify"), ("modify", "reschedule")]
    )
    datapoints = materialize_pairs(pairs)
    intents = ["book", "cancel", "reschedule", "modify"]
    f1_spec = FScoreAggregatorSpec(classes=intents, averaging="macro", f_value=1.0)
    f1_evaluator = build_dataset_evaluator(f1_spec, source_evaluator="intent_match")
    title = (
        "Scenario 5 — Realistic 4-class intent classifier\n"
        " Uneven per-class performance. Macro F1 surfaces 'reschedule' and\n"
        " 'modify' weakness; micro F1 would have hidden it under 'book' wins."
    )
    report(title, f1_evaluator.evaluate(datapoints))


def main() -> None:
    """Execute the five demo scenarios in order, then print a closing line."""
    scenarios = (
        scenario_1_balanced_three_class,
        scenario_2_imbalanced_two_class,
        scenario_3_precision_vs_recall_vs_f,
        scenario_4_skipped_datapoints,
        scenario_5_realistic_intent_classifier,
    )
    for run_scenario in scenarios:
        run_scenario()
    print()
    print("Done. All scenarios computed from real evaluator code.")


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion packages/uipath/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "uipath"
version = "2.11.5"
version = "2.11.6"
description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
readme = { file = "README.md", content-type = "text/markdown" }
requires-python = ">=3.11"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"version": "2.0",
"resources": []
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
{
"version": "1.0",
"id": "SpamBinaryEval",
"name": "Binary spam classifier — precision",
"evaluatorRefs": ["BinarySpamPrecision"],
"evaluations": [
{
"id": "spam-prize",
"name": "Spam: prize giveaway",
"inputs": {
"email_subject": "You won a FREE iPhone!!!",
"email_body": "Congratulations! Click here to claim your prize now."
},
"evaluationCriterias": {
"BinarySpamPrecision": { "expectedClass": "spam" }
}
},
{
"id": "spam-promo",
"name": "Spam: unsolicited promo",
"inputs": {
"email_subject": "Winner of the monthly drawing",
"email_body": "You've been selected. Click here to redeem."
},
"evaluationCriterias": {
"BinarySpamPrecision": { "expectedClass": "spam" }
}
},
{
"id": "ham-invoice",
"name": "Ham: legitimate invoice",
"inputs": {
"email_subject": "Your March invoice is ready",
"email_body": "Your monthly invoice of $45.99 is attached. Payment is due March 15."
},
"evaluationCriterias": {
"BinarySpamPrecision": { "expectedClass": "ham" }
}
},
{
"id": "ham-meeting",
"name": "Ham: meeting request",
"inputs": {
"email_subject": "Sync on Q2 planning",
"email_body": "Can we meet Wednesday at 2pm to align on next quarter's roadmap?"
},
"evaluationCriterias": {
"BinarySpamPrecision": { "expectedClass": "ham" }
}
},
{
"id": "ham-mislabeled",
"name": "Ham mislabeled as spam (forces a false positive)",
"inputs": {
"email_subject": "Free coffee in the break room!!!",
"email_body": "Just a heads up — the new espresso machine is set up."
},
"evaluationCriterias": {
"BinarySpamPrecision": { "expectedClass": "ham" }
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"version": "1.0",
"id": "BinarySpamPrecision",
"description": "Precision on the 'spam' positive class, plus run-level aggregators",
"evaluatorTypeId": "uipath-binary-classification",
"evaluatorConfig": {
"name": "BinarySpamPrecision",
"targetOutputKey": "category",
"positiveClass": "spam",
"metricType": "precision",
"fValue": 1.0,
"defaultEvaluationCriteria": {
"expectedClass": "ham"
},
"aggregators": [
{
"type": "precision",
"classes": ["spam", "ham"],
"averaging": "macro"
},
{
"type": "recall",
"classes": ["spam", "ham"],
"averaging": "macro"
},
{
"type": "fscore",
"classes": ["spam", "ham"],
"averaging": "macro",
"fValue": 1.0
}
]
}
}
39 changes: 39 additions & 0 deletions packages/uipath/samples/binary_classification_agent/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""Rule-based spam/ham classifier demonstrating the binary classification evaluator."""

from dataclasses import dataclass

from uipath.tracing import traced

# Lowercase spam indicators. classify_email tests each one with a substring
# check against the lowercased "subject body" text, so multi-word phrases and
# punctuation runs are valid entries.
SPAMMY_TOKENS = {
    "free",
    "winner",
    "congratulations",
    "click here",
    "prize",
    "!!!",
}


@dataclass
class EmailInput:
    """Input payload for ``main``: the email to classify."""

    # Subject line; passed to classify_email as ``subject``.
    email_subject: str
    # Message body; passed to classify_email as ``body``.
    email_body: str


@dataclass
class Classification:
    """Output payload of ``main``: the label assigned to the email."""

    # The string returned by classify_email: "spam" or "ham".
    category: str


@traced(name="classify_email", span_type="tool")
def classify_email(subject: str, body: str) -> str:
    """Label an email 'spam' when any SPAMMY_TOKENS entry occurs in it, else 'ham'.

    Matching is a case-insensitive substring test over the subject and body
    joined by a single space.
    """
    haystack = f"{subject} {body}".lower()
    for indicator in SPAMMY_TOKENS:
        if indicator in haystack:
            return "spam"
    return "ham"


@traced()
async def main(input: EmailInput) -> Classification:
    """Entry point: classify the given email and wrap the label in a Classification."""
    return Classification(
        category=classify_email(input.email_subject, input.email_body)
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[project]
name = "binary-classification-agent"
version = "0.0.1"
description = "Rule-based spam/ham classifier demonstrating the binary classification evaluator"
requires-python = ">=3.11"
dependencies = ["uipath"]

[dependency-groups]
dev = ["uipath-dev"]
Loading
Loading