Source code for structured_stochasticity.tasks

"""
Benchmark tasks for evaluating reasoning under structured stochasticity.

Tasks are chosen to have:
- Exact algorithmic solutions (objectively verifiable)
- Scalable complexity (parameterized difficulty)
- Minimal heuristic shortcuts (tests true reasoning)

Tower of Hanoi is the flagship task because:
1. It has exact recursive structure
2. Complexity grows exponentially (2^n - 1 moves)
3. Early commitment to wrong approach is fatal
4. Solution is easily verifiable
"""

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Optional
import re


[docs] @dataclass class TaskInstance: """A single instance of a task.""" prompt: str complexity: int # Task-specific complexity measure ground_truth: Optional[str] = None # Expected solution if known metadata: dict = field(default_factory=dict)
[docs] @dataclass class TaskResult: """Result of attempting a task.""" response: str is_correct: bool complexity: int error_message: Optional[str] = None metadata: dict = field(default_factory=dict)
[docs] class Task(ABC): """Abstract base class for benchmark tasks.""" name: str = "base_task"
[docs] @abstractmethod def generate_instance(self, complexity: int) -> TaskInstance: """Generate a task instance at given complexity level.""" pass
[docs] @abstractmethod def verify_solution(self, instance: TaskInstance, response: str) -> TaskResult: """Verify if a response correctly solves the task.""" pass
[docs] def generate_batch( self, complexity_range: tuple[int, int], instances_per_level: int = 1 ) -> list[TaskInstance]: """Generate multiple instances across complexity levels.""" instances = [] for c in range(complexity_range[0], complexity_range[1] + 1): for _ in range(instances_per_level): instances.append(self.generate_instance(c)) return instances
[docs] class TowerOfHanoi(Task): """ Tower of Hanoi puzzle. The classic recursive puzzle: move n disks from peg A to peg C, using peg B as auxiliary. Rules: - Only one disk can be moved at a time - A larger disk cannot be placed on a smaller disk Optimal solution requires 2^n - 1 moves. This task is ideal because: - Solution is deterministic and verifiable - Requires recursive planning - No shortcuts - must actually solve it - Complexity is well-defined (number of disks) """ name = "tower_of_hanoi"
[docs] def __init__(self, prompt_style: str = "standard"): """ Args: prompt_style: How to phrase the task - "standard": Clear, direct instructions - "minimal": Bare problem statement - "detailed": Include rules explanation """ self.prompt_style = prompt_style
[docs] def generate_instance(self, complexity: int) -> TaskInstance: """ Generate a Tower of Hanoi instance. Args: complexity: Number of disks (n) """ n = complexity if self.prompt_style == "minimal": prompt = f"Solve Tower of Hanoi with {n} disks. Move all disks from A to C." elif self.prompt_style == "detailed": prompt = f"""Tower of Hanoi Puzzle You have {n} disks of different sizes stacked on peg A (largest at bottom, smallest at top). Your goal is to move all disks to peg C. Rules: 1. You can only move one disk at a time 2. You can only move the top disk from any peg 3. You cannot place a larger disk on top of a smaller disk Pegs available: A, B, C List each move in the format: "Move disk from X to Y" Solve this step by step.""" else: # standard prompt = f"""Solve the Tower of Hanoi puzzle with {n} disks. Move all disks from peg A to peg C, using peg B as auxiliary. Rules: Move one disk at a time. Never place a larger disk on a smaller one. Provide the solution as a sequence of moves in format: "Move disk from X to Y" """ # Generate ground truth solution ground_truth = self._solve(n, "A", "C", "B") return TaskInstance( prompt=prompt, complexity=n, ground_truth=ground_truth, metadata={ "num_disks": n, "optimal_moves": 2**n - 1, "prompt_style": self.prompt_style } )
def _solve(self, n: int, source: str, target: str, auxiliary: str) -> str: """Generate optimal solution recursively.""" moves = [] self._solve_recursive(n, source, target, auxiliary, moves) return "\n".join(moves) def _solve_recursive( self, n: int, source: str, target: str, auxiliary: str, moves: list ): if n == 1: moves.append(f"Move disk from {source} to {target}") return self._solve_recursive(n - 1, source, auxiliary, target, moves) moves.append(f"Move disk from {source} to {target}") self._solve_recursive(n - 1, auxiliary, target, source, moves)
[docs] def verify_solution(self, instance: TaskInstance, response: str) -> TaskResult: """ Verify a Tower of Hanoi solution by simulation. Parses moves from response and simulates execution, checking all rules are followed and final state is correct. """ n = instance.metadata["num_disks"] # Parse moves from response moves = self._parse_moves(response) if not moves: return TaskResult( response=response, is_correct=False, complexity=instance.complexity, error_message="No valid moves found in response" ) # Simulate pegs = { "A": list(range(n, 0, -1)), # [n, n-1, ..., 2, 1] (bottom to top) "B": [], "C": [] } for i, (source, target) in enumerate(moves): # Validate move if source not in pegs or target not in pegs: return TaskResult( response=response, is_correct=False, complexity=instance.complexity, error_message=f"Move {i+1}: Invalid peg '{source}' or '{target}'" ) if not pegs[source]: return TaskResult( response=response, is_correct=False, complexity=instance.complexity, error_message=f"Move {i+1}: Cannot move from empty peg {source}" ) disk = pegs[source][-1] if pegs[target] and pegs[target][-1] < disk: return TaskResult( response=response, is_correct=False, complexity=instance.complexity, error_message=f"Move {i+1}: Cannot place disk {disk} on smaller disk {pegs[target][-1]}" ) # Execute move pegs[source].pop() pegs[target].append(disk) # Check final state expected_c = list(range(n, 0, -1)) is_correct = pegs["A"] == [] and pegs["B"] == [] and pegs["C"] == expected_c return TaskResult( response=response, is_correct=is_correct, complexity=instance.complexity, error_message=None if is_correct else "Final state incorrect: not all disks on peg C", metadata={ "num_moves": len(moves), "optimal_moves": 2**n - 1, "efficiency": (2**n - 1) / len(moves) if moves else 0 } )
def _parse_moves(self, response: str) -> list[tuple[str, str]]: """Extract moves from response text.""" moves = [] # Try various patterns patterns = [ r"[Mm]ove\s+(?:disk\s+)?(?:from\s+)?([ABC])\s+(?:to\s+)?([ABC])", r"([ABC])\s*(?:->|→|to)\s*([ABC])", r"([ABC])\s+([ABC])", # Very lenient ] for pattern in patterns: found = re.findall(pattern, response.upper()) if found: moves = [(m[0], m[1]) for m in found] break return moves
[docs] class ArithmeticSequence(Task): """ Multi-step arithmetic task. Generates a sequence of arithmetic operations that must be performed in order. Tests sequential reasoning without shortcuts. """ name = "arithmetic_sequence"
[docs] def __init__(self, operations: list[str] = None): self.operations = operations or ["+", "-", "*"]
[docs] def generate_instance(self, complexity: int) -> TaskInstance: """ Generate arithmetic sequence. Args: complexity: Number of operations """ import random # Generate sequence result = random.randint(1, 20) expression_parts = [str(result)] for _ in range(complexity): op = random.choice(self.operations) if op == "*": operand = random.randint(2, 5) elif op == "-": operand = random.randint(1, min(result, 10)) else: # + operand = random.randint(1, 20) expression_parts.append(f" {op} {operand}") if op == "+": result += operand elif op == "-": result -= operand else: result *= operand expression = "".join(expression_parts) prompt = f"Calculate step by step: {expression}" return TaskInstance( prompt=prompt, complexity=complexity, ground_truth=str(result), metadata={ "expression": expression, "num_operations": complexity } )
[docs] def verify_solution(self, instance: TaskInstance, response: str) -> TaskResult: """Check if response contains correct answer.""" expected = instance.ground_truth # Look for the number in response numbers = re.findall(r"-?\d+", response) is_correct = expected in numbers # Also check if final answer is stated final_patterns = [ rf"(?:answer|result|equals|=)\s*{expected}", rf"{expected}\s*$", ] for pattern in final_patterns: if re.search(pattern, response, re.IGNORECASE): is_correct = True break return TaskResult( response=response, is_correct=is_correct, complexity=instance.complexity, metadata={"expected": expected, "found_numbers": numbers} )
[docs] class LogicalDeduction(Task): """ Logical deduction puzzles. "A is taller than B. B is taller than C. Who is shortest?" Complexity scales with number of entities and comparisons. """ name = "logical_deduction" NAMES = ["Alice", "Bob", "Carol", "Dave", "Eve", "Frank", "Grace", "Henry"] COMPARISONS = ["taller than", "shorter than", "older than", "younger than"]
[docs] def generate_instance(self, complexity: int) -> TaskInstance: """ Generate a logical ordering puzzle. Args: complexity: Number of entities to order """ import random n = min(complexity + 2, len(self.NAMES)) # complexity=1 -> 3 people names = random.sample(self.NAMES, n) # Create ground truth ordering random.shuffle(names) ordering = names.copy() # First is "most", last is "least" comparison = random.choice(self.COMPARISONS) # Generate clues (enough to determine ordering) clues = [] for i in range(len(ordering) - 1): clues.append(f"{ordering[i]} is {comparison} {ordering[i+1]}") # Shuffle clues to make it harder random.shuffle(clues) # Determine question if "taller" in comparison or "older" in comparison: question = f"Who is the {'shortest' if 'taller' in comparison else 'youngest'}?" answer = ordering[-1] else: question = f"Who is the {'tallest' if 'shorter' in comparison else 'oldest'}?" answer = ordering[-1] prompt = f"""Given the following facts: {chr(10).join('- ' + c for c in clues)} {question}""" return TaskInstance( prompt=prompt, complexity=complexity, ground_truth=answer, metadata={ "num_entities": n, "ordering": ordering, "comparison_type": comparison } )
[docs] def verify_solution(self, instance: TaskInstance, response: str) -> TaskResult: """Check if response contains correct name.""" expected = instance.ground_truth is_correct = expected.lower() in response.lower() return TaskResult( response=response, is_correct=is_correct, complexity=instance.complexity )
[docs] class TaskFactory: """Factory for creating task instances.""" TASKS = { "tower_of_hanoi": TowerOfHanoi, "arithmetic": ArithmeticSequence, "logical_deduction": LogicalDeduction, }
[docs] @classmethod def create(cls, task_name: str, **kwargs) -> Task: """Create a task by name.""" if task_name not in cls.TASKS: raise ValueError(f"Unknown task: {task_name}. Available: {list(cls.TASKS.keys())}") return cls.TASKS[task_name](**kwargs)
[docs] @classmethod def list_tasks(cls) -> list[str]: """List available tasks.""" return list(cls.TASKS.keys())