"""
Benchmark tasks for evaluating reasoning under structured stochasticity.
Tasks are chosen to have:
- Exact algorithmic solutions (objectively verifiable)
- Scalable complexity (parameterized difficulty)
- Minimal heuristic shortcuts (tests true reasoning)
Tower of Hanoi is the flagship task because:
1. It has exact recursive structure
2. Complexity grows exponentially (2^n - 1 moves)
3. Early commitment to wrong approach is fatal
4. Solution is easily verifiable
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Optional
import re
[docs]
@dataclass
class TaskInstance:
"""A single instance of a task."""
prompt: str
complexity: int # Task-specific complexity measure
ground_truth: Optional[str] = None # Expected solution if known
metadata: dict = field(default_factory=dict)
[docs]
@dataclass
class TaskResult:
"""Result of attempting a task."""
response: str
is_correct: bool
complexity: int
error_message: Optional[str] = None
metadata: dict = field(default_factory=dict)
[docs]
class Task(ABC):
"""Abstract base class for benchmark tasks."""
name: str = "base_task"
[docs]
@abstractmethod
def generate_instance(self, complexity: int) -> TaskInstance:
"""Generate a task instance at given complexity level."""
pass
[docs]
@abstractmethod
def verify_solution(self, instance: TaskInstance, response: str) -> TaskResult:
"""Verify if a response correctly solves the task."""
pass
[docs]
def generate_batch(
self,
complexity_range: tuple[int, int],
instances_per_level: int = 1
) -> list[TaskInstance]:
"""Generate multiple instances across complexity levels."""
instances = []
for c in range(complexity_range[0], complexity_range[1] + 1):
for _ in range(instances_per_level):
instances.append(self.generate_instance(c))
return instances
[docs]
class TowerOfHanoi(Task):
"""
Tower of Hanoi puzzle.
The classic recursive puzzle: move n disks from peg A to peg C,
using peg B as auxiliary. Rules:
- Only one disk can be moved at a time
- A larger disk cannot be placed on a smaller disk
Optimal solution requires 2^n - 1 moves.
This task is ideal because:
- Solution is deterministic and verifiable
- Requires recursive planning
- No shortcuts - must actually solve it
- Complexity is well-defined (number of disks)
"""
name = "tower_of_hanoi"
[docs]
def __init__(self, prompt_style: str = "standard"):
"""
Args:
prompt_style: How to phrase the task
- "standard": Clear, direct instructions
- "minimal": Bare problem statement
- "detailed": Include rules explanation
"""
self.prompt_style = prompt_style
[docs]
def generate_instance(self, complexity: int) -> TaskInstance:
"""
Generate a Tower of Hanoi instance.
Args:
complexity: Number of disks (n)
"""
n = complexity
if self.prompt_style == "minimal":
prompt = f"Solve Tower of Hanoi with {n} disks. Move all disks from A to C."
elif self.prompt_style == "detailed":
prompt = f"""Tower of Hanoi Puzzle
You have {n} disks of different sizes stacked on peg A (largest at bottom, smallest at top).
Your goal is to move all disks to peg C.
Rules:
1. You can only move one disk at a time
2. You can only move the top disk from any peg
3. You cannot place a larger disk on top of a smaller disk
Pegs available: A, B, C
List each move in the format: "Move disk from X to Y"
Solve this step by step."""
else: # standard
prompt = f"""Solve the Tower of Hanoi puzzle with {n} disks.
Move all disks from peg A to peg C, using peg B as auxiliary.
Rules: Move one disk at a time. Never place a larger disk on a smaller one.
Provide the solution as a sequence of moves in format: "Move disk from X to Y"
"""
# Generate ground truth solution
ground_truth = self._solve(n, "A", "C", "B")
return TaskInstance(
prompt=prompt,
complexity=n,
ground_truth=ground_truth,
metadata={
"num_disks": n,
"optimal_moves": 2**n - 1,
"prompt_style": self.prompt_style
}
)
def _solve(self, n: int, source: str, target: str, auxiliary: str) -> str:
"""Generate optimal solution recursively."""
moves = []
self._solve_recursive(n, source, target, auxiliary, moves)
return "\n".join(moves)
def _solve_recursive(
self, n: int, source: str, target: str, auxiliary: str, moves: list
):
if n == 1:
moves.append(f"Move disk from {source} to {target}")
return
self._solve_recursive(n - 1, source, auxiliary, target, moves)
moves.append(f"Move disk from {source} to {target}")
self._solve_recursive(n - 1, auxiliary, target, source, moves)
[docs]
def verify_solution(self, instance: TaskInstance, response: str) -> TaskResult:
"""
Verify a Tower of Hanoi solution by simulation.
Parses moves from response and simulates execution,
checking all rules are followed and final state is correct.
"""
n = instance.metadata["num_disks"]
# Parse moves from response
moves = self._parse_moves(response)
if not moves:
return TaskResult(
response=response,
is_correct=False,
complexity=instance.complexity,
error_message="No valid moves found in response"
)
# Simulate
pegs = {
"A": list(range(n, 0, -1)), # [n, n-1, ..., 2, 1] (bottom to top)
"B": [],
"C": []
}
for i, (source, target) in enumerate(moves):
# Validate move
if source not in pegs or target not in pegs:
return TaskResult(
response=response,
is_correct=False,
complexity=instance.complexity,
error_message=f"Move {i+1}: Invalid peg '{source}' or '{target}'"
)
if not pegs[source]:
return TaskResult(
response=response,
is_correct=False,
complexity=instance.complexity,
error_message=f"Move {i+1}: Cannot move from empty peg {source}"
)
disk = pegs[source][-1]
if pegs[target] and pegs[target][-1] < disk:
return TaskResult(
response=response,
is_correct=False,
complexity=instance.complexity,
error_message=f"Move {i+1}: Cannot place disk {disk} on smaller disk {pegs[target][-1]}"
)
# Execute move
pegs[source].pop()
pegs[target].append(disk)
# Check final state
expected_c = list(range(n, 0, -1))
is_correct = pegs["A"] == [] and pegs["B"] == [] and pegs["C"] == expected_c
return TaskResult(
response=response,
is_correct=is_correct,
complexity=instance.complexity,
error_message=None if is_correct else "Final state incorrect: not all disks on peg C",
metadata={
"num_moves": len(moves),
"optimal_moves": 2**n - 1,
"efficiency": (2**n - 1) / len(moves) if moves else 0
}
)
def _parse_moves(self, response: str) -> list[tuple[str, str]]:
"""Extract moves from response text."""
moves = []
# Try various patterns
patterns = [
r"[Mm]ove\s+(?:disk\s+)?(?:from\s+)?([ABC])\s+(?:to\s+)?([ABC])",
r"([ABC])\s*(?:->|→|to)\s*([ABC])",
r"([ABC])\s+([ABC])", # Very lenient
]
for pattern in patterns:
found = re.findall(pattern, response.upper())
if found:
moves = [(m[0], m[1]) for m in found]
break
return moves
[docs]
class ArithmeticSequence(Task):
"""
Multi-step arithmetic task.
Generates a sequence of arithmetic operations that must be
performed in order. Tests sequential reasoning without shortcuts.
"""
name = "arithmetic_sequence"
[docs]
def __init__(self, operations: list[str] = None):
self.operations = operations or ["+", "-", "*"]
[docs]
def generate_instance(self, complexity: int) -> TaskInstance:
"""
Generate arithmetic sequence.
Args:
complexity: Number of operations
"""
import random
# Generate sequence
result = random.randint(1, 20)
expression_parts = [str(result)]
for _ in range(complexity):
op = random.choice(self.operations)
if op == "*":
operand = random.randint(2, 5)
elif op == "-":
operand = random.randint(1, min(result, 10))
else: # +
operand = random.randint(1, 20)
expression_parts.append(f" {op} {operand}")
if op == "+":
result += operand
elif op == "-":
result -= operand
else:
result *= operand
expression = "".join(expression_parts)
prompt = f"Calculate step by step: {expression}"
return TaskInstance(
prompt=prompt,
complexity=complexity,
ground_truth=str(result),
metadata={
"expression": expression,
"num_operations": complexity
}
)
[docs]
def verify_solution(self, instance: TaskInstance, response: str) -> TaskResult:
"""Check if response contains correct answer."""
expected = instance.ground_truth
# Look for the number in response
numbers = re.findall(r"-?\d+", response)
is_correct = expected in numbers
# Also check if final answer is stated
final_patterns = [
rf"(?:answer|result|equals|=)\s*{expected}",
rf"{expected}\s*$",
]
for pattern in final_patterns:
if re.search(pattern, response, re.IGNORECASE):
is_correct = True
break
return TaskResult(
response=response,
is_correct=is_correct,
complexity=instance.complexity,
metadata={"expected": expected, "found_numbers": numbers}
)
[docs]
class LogicalDeduction(Task):
"""
Logical deduction puzzles.
"A is taller than B. B is taller than C. Who is shortest?"
Complexity scales with number of entities and comparisons.
"""
name = "logical_deduction"
NAMES = ["Alice", "Bob", "Carol", "Dave", "Eve", "Frank", "Grace", "Henry"]
COMPARISONS = ["taller than", "shorter than", "older than", "younger than"]
[docs]
def generate_instance(self, complexity: int) -> TaskInstance:
"""
Generate a logical ordering puzzle.
Args:
complexity: Number of entities to order
"""
import random
n = min(complexity + 2, len(self.NAMES)) # complexity=1 -> 3 people
names = random.sample(self.NAMES, n)
# Create ground truth ordering
random.shuffle(names)
ordering = names.copy() # First is "most", last is "least"
comparison = random.choice(self.COMPARISONS)
# Generate clues (enough to determine ordering)
clues = []
for i in range(len(ordering) - 1):
clues.append(f"{ordering[i]} is {comparison} {ordering[i+1]}")
# Shuffle clues to make it harder
random.shuffle(clues)
# Determine question
if "taller" in comparison or "older" in comparison:
question = f"Who is the {'shortest' if 'taller' in comparison else 'youngest'}?"
answer = ordering[-1]
else:
question = f"Who is the {'tallest' if 'shorter' in comparison else 'oldest'}?"
answer = ordering[-1]
prompt = f"""Given the following facts:
{chr(10).join('- ' + c for c in clues)}
{question}"""
return TaskInstance(
prompt=prompt,
complexity=complexity,
ground_truth=answer,
metadata={
"num_entities": n,
"ordering": ordering,
"comparison_type": comparison
}
)
[docs]
def verify_solution(self, instance: TaskInstance, response: str) -> TaskResult:
"""Check if response contains correct name."""
expected = instance.ground_truth
is_correct = expected.lower() in response.lower()
return TaskResult(
response=response,
is_correct=is_correct,
complexity=instance.complexity
)
[docs]
class TaskFactory:
"""Factory for creating task instances."""
TASKS = {
"tower_of_hanoi": TowerOfHanoi,
"arithmetic": ArithmeticSequence,
"logical_deduction": LogicalDeduction,
}
[docs]
@classmethod
def create(cls, task_name: str, **kwargs) -> Task:
"""Create a task by name."""
if task_name not in cls.TASKS:
raise ValueError(f"Unknown task: {task_name}. Available: {list(cls.TASKS.keys())}")
return cls.TASKS[task_name](**kwargs)
[docs]
@classmethod
def list_tasks(cls) -> list[str]:
"""List available tasks."""
return list(cls.TASKS.keys())