Source code for structured_stochasticity.tasks

"""
Benchmark tasks for evaluating reasoning under structured stochasticity.

Tasks are chosen to have:
- Exact algorithmic solutions (objectively verifiable)
- Scalable complexity (parameterized difficulty)
- Minimal heuristic shortcuts (tests true reasoning)

Tower of Hanoi is the flagship task because:
1. It has exact recursive structure
2. Complexity grows exponentially (2^n - 1 moves)
3. Early commitment to wrong approach is fatal
4. Solution is easily verifiable
"""

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Optional
import re



[docs]
@dataclass
class TaskInstance:
    """A single instance of a task."""
    prompt: str
    complexity: int  # Task-specific complexity measure
    ground_truth: Optional[str] = None  # Expected solution if known
    metadata: dict = field(default_factory=dict)




[docs]
@dataclass 
class TaskResult:
    """Result of attempting a task."""
    response: str
    is_correct: bool
    complexity: int
    error_message: Optional[str] = None
    metadata: dict = field(default_factory=dict)




[docs]
class Task(ABC):
    """Abstract base class for benchmark tasks."""
    
    name: str = "base_task"
    

[docs]
    @abstractmethod
    def generate_instance(self, complexity: int) -> TaskInstance:
        """Generate a task instance at given complexity level."""
        pass

    

[docs]
    @abstractmethod
    def verify_solution(self, instance: TaskInstance, response: str) -> TaskResult:
        """Verify if a response correctly solves the task."""
        pass

    

[docs]
    def generate_batch(
        self, 
        complexity_range: tuple[int, int],
        instances_per_level: int = 1
    ) -> list[TaskInstance]:
        """Generate multiple instances across complexity levels."""
        instances = []
        for c in range(complexity_range[0], complexity_range[1] + 1):
            for _ in range(instances_per_level):
                instances.append(self.generate_instance(c))
        return instances





[docs]
class TowerOfHanoi(Task):
    """
    Tower of Hanoi puzzle.
    
    The classic recursive puzzle: move n disks from peg A to peg C,
    using peg B as auxiliary. Rules:
    - Only one disk can be moved at a time
    - A larger disk cannot be placed on a smaller disk
    
    Optimal solution requires 2^n - 1 moves.
    
    This task is ideal because:
    - Solution is deterministic and verifiable
    - Requires recursive planning
    - No shortcuts - must actually solve it
    - Complexity is well-defined (number of disks)
    """
    
    name = "tower_of_hanoi"
    

[docs]
    def __init__(self, prompt_style: str = "standard"):
        """
        Args:
            prompt_style: How to phrase the task
                - "standard": Clear, direct instructions
                - "minimal": Bare problem statement
                - "detailed": Include rules explanation
        """
        self.prompt_style = prompt_style

    

[docs]
    def generate_instance(self, complexity: int) -> TaskInstance:
        """
        Generate a Tower of Hanoi instance.
        
        Args:
            complexity: Number of disks (n)
        """
        n = complexity
        
        if self.prompt_style == "minimal":
            prompt = f"Solve Tower of Hanoi with {n} disks. Move all disks from A to C."
        
        elif self.prompt_style == "detailed":
            prompt = f"""Tower of Hanoi Puzzle

You have {n} disks of different sizes stacked on peg A (largest at bottom, smallest at top).
Your goal is to move all disks to peg C.

Rules:
1. You can only move one disk at a time
2. You can only move the top disk from any peg
3. You cannot place a larger disk on top of a smaller disk

Pegs available: A, B, C

List each move in the format: "Move disk from X to Y"
Solve this step by step."""
        
        else:  # standard
            prompt = f"""Solve the Tower of Hanoi puzzle with {n} disks.

Move all disks from peg A to peg C, using peg B as auxiliary.
Rules: Move one disk at a time. Never place a larger disk on a smaller one.

Provide the solution as a sequence of moves in format: "Move disk from X to Y"
"""
        
        # Generate ground truth solution
        ground_truth = self._solve(n, "A", "C", "B")
        
        return TaskInstance(
            prompt=prompt,
            complexity=n,
            ground_truth=ground_truth,
            metadata={
                "num_disks": n,
                "optimal_moves": 2**n - 1,
                "prompt_style": self.prompt_style
            }
        )

    
    def _solve(self, n: int, source: str, target: str, auxiliary: str) -> str:
        """Generate optimal solution recursively."""
        moves = []
        self._solve_recursive(n, source, target, auxiliary, moves)
        return "\n".join(moves)
    
    def _solve_recursive(
        self, n: int, source: str, target: str, auxiliary: str, moves: list
    ):
        if n == 1:
            moves.append(f"Move disk from {source} to {target}")
            return
        self._solve_recursive(n - 1, source, auxiliary, target, moves)
        moves.append(f"Move disk from {source} to {target}")
        self._solve_recursive(n - 1, auxiliary, target, source, moves)
    

[docs]
    def verify_solution(self, instance: TaskInstance, response: str) -> TaskResult:
        """
        Verify a Tower of Hanoi solution by simulation.
        
        Parses moves from response and simulates execution,
        checking all rules are followed and final state is correct.
        """
        n = instance.metadata["num_disks"]
        
        # Parse moves from response
        moves = self._parse_moves(response)
        
        if not moves:
            return TaskResult(
                response=response,
                is_correct=False,
                complexity=instance.complexity,
                error_message="No valid moves found in response"
            )
        
        # Simulate
        pegs = {
            "A": list(range(n, 0, -1)),  # [n, n-1, ..., 2, 1] (bottom to top)
            "B": [],
            "C": []
        }
        
        for i, (source, target) in enumerate(moves):
            # Validate move
            if source not in pegs or target not in pegs:
                return TaskResult(
                    response=response,
                    is_correct=False,
                    complexity=instance.complexity,
                    error_message=f"Move {i+1}: Invalid peg '{source}' or '{target}'"
                )
            
            if not pegs[source]:
                return TaskResult(
                    response=response,
                    is_correct=False,
                    complexity=instance.complexity,
                    error_message=f"Move {i+1}: Cannot move from empty peg {source}"
                )
            
            disk = pegs[source][-1]
            
            if pegs[target] and pegs[target][-1] < disk:
                return TaskResult(
                    response=response,
                    is_correct=False,
                    complexity=instance.complexity,
                    error_message=f"Move {i+1}: Cannot place disk {disk} on smaller disk {pegs[target][-1]}"
                )
            
            # Execute move
            pegs[source].pop()
            pegs[target].append(disk)
        
        # Check final state
        expected_c = list(range(n, 0, -1))
        is_correct = pegs["A"] == [] and pegs["B"] == [] and pegs["C"] == expected_c
        
        return TaskResult(
            response=response,
            is_correct=is_correct,
            complexity=instance.complexity,
            error_message=None if is_correct else "Final state incorrect: not all disks on peg C",
            metadata={
                "num_moves": len(moves),
                "optimal_moves": 2**n - 1,
                "efficiency": (2**n - 1) / len(moves) if moves else 0
            }
        )

    
    def _parse_moves(self, response: str) -> list[tuple[str, str]]:
        """Extract moves from response text."""
        moves = []
        
        # Try various patterns
        patterns = [
            r"[Mm]ove\s+(?:disk\s+)?(?:from\s+)?([ABC])\s+(?:to\s+)?([ABC])",
            r"([ABC])\s*(?:->|→|to)\s*([ABC])",
            r"([ABC])\s+([ABC])",  # Very lenient
        ]
        
        for pattern in patterns:
            found = re.findall(pattern, response.upper())
            if found:
                moves = [(m[0], m[1]) for m in found]
                break
        
        return moves




[docs]
class ArithmeticSequence(Task):
    """
    Multi-step arithmetic task.
    
    Generates a sequence of arithmetic operations that must be
    performed in order. Tests sequential reasoning without shortcuts.
    """
    
    name = "arithmetic_sequence"
    

[docs]
    def __init__(self, operations: list[str] = None):
        self.operations = operations or ["+", "-", "*"]

    

[docs]
    def generate_instance(self, complexity: int) -> TaskInstance:
        """
        Generate arithmetic sequence.
        
        Args:
            complexity: Number of operations
        """
        import random
        
        # Generate sequence
        result = random.randint(1, 20)
        expression_parts = [str(result)]
        
        for _ in range(complexity):
            op = random.choice(self.operations)
            
            if op == "*":
                operand = random.randint(2, 5)
            elif op == "-":
                operand = random.randint(1, min(result, 10))
            else:  # +
                operand = random.randint(1, 20)
            
            expression_parts.append(f" {op} {operand}")
            
            if op == "+":
                result += operand
            elif op == "-":
                result -= operand
            else:
                result *= operand
        
        expression = "".join(expression_parts)
        
        prompt = f"Calculate step by step: {expression}"
        
        return TaskInstance(
            prompt=prompt,
            complexity=complexity,
            ground_truth=str(result),
            metadata={
                "expression": expression,
                "num_operations": complexity
            }
        )

    

[docs]
    def verify_solution(self, instance: TaskInstance, response: str) -> TaskResult:
        """Check if response contains correct answer."""
        expected = instance.ground_truth
        
        # Look for the number in response
        numbers = re.findall(r"-?\d+", response)
        
        is_correct = expected in numbers
        
        # Also check if final answer is stated
        final_patterns = [
            rf"(?:answer|result|equals|=)\s*{expected}",
            rf"{expected}\s*$",
        ]
        
        for pattern in final_patterns:
            if re.search(pattern, response, re.IGNORECASE):
                is_correct = True
                break
        
        return TaskResult(
            response=response,
            is_correct=is_correct,
            complexity=instance.complexity,
            metadata={"expected": expected, "found_numbers": numbers}
        )





[docs]
class LogicalDeduction(Task):
    """
    Logical deduction puzzles.
    
    "A is taller than B. B is taller than C. Who is shortest?"
    
    Complexity scales with number of entities and comparisons.
    """
    
    name = "logical_deduction"
    
    NAMES = ["Alice", "Bob", "Carol", "Dave", "Eve", "Frank", "Grace", "Henry"]
    COMPARISONS = ["taller than", "shorter than", "older than", "younger than"]
    

[docs]
    def generate_instance(self, complexity: int) -> TaskInstance:
        """
        Generate a logical ordering puzzle.
        
        Args:
            complexity: Number of entities to order
        """
        import random
        
        n = min(complexity + 2, len(self.NAMES))  # complexity=1 -> 3 people
        names = random.sample(self.NAMES, n)
        
        # Create ground truth ordering
        random.shuffle(names)
        ordering = names.copy()  # First is "most", last is "least"
        
        comparison = random.choice(self.COMPARISONS)
        
        # Generate clues (enough to determine ordering)
        clues = []
        for i in range(len(ordering) - 1):
            clues.append(f"{ordering[i]} is {comparison} {ordering[i+1]}")
        
        # Shuffle clues to make it harder
        random.shuffle(clues)
        
        # Determine question
        if "taller" in comparison or "older" in comparison:
            question = f"Who is the {'shortest' if 'taller' in comparison else 'youngest'}?"
            answer = ordering[-1]
        else:
            question = f"Who is the {'tallest' if 'shorter' in comparison else 'oldest'}?"
            answer = ordering[-1]
        
        prompt = f"""Given the following facts:
{chr(10).join('- ' + c for c in clues)}

{question}"""
        
        return TaskInstance(
            prompt=prompt,
            complexity=complexity,
            ground_truth=answer,
            metadata={
                "num_entities": n,
                "ordering": ordering,
                "comparison_type": comparison
            }
        )

    

[docs]
    def verify_solution(self, instance: TaskInstance, response: str) -> TaskResult:
        """Check if response contains correct name."""
        expected = instance.ground_truth
        is_correct = expected.lower() in response.lower()
        
        return TaskResult(
            response=response,
            is_correct=is_correct,
            complexity=instance.complexity
        )





[docs]
class TaskFactory:
    """Factory for creating task instances."""
    
    TASKS = {
        "tower_of_hanoi": TowerOfHanoi,
        "arithmetic": ArithmeticSequence,
        "logical_deduction": LogicalDeduction,
    }
    

[docs]
    @classmethod
    def create(cls, task_name: str, **kwargs) -> Task:
        """Create a task by name."""
        if task_name not in cls.TASKS:
            raise ValueError(f"Unknown task: {task_name}. Available: {list(cls.TASKS.keys())}")
        return cls.TASKS[task_name](**kwargs)

    

[docs]
    @classmethod
    def list_tasks(cls) -> list[str]:
        """List available tasks."""
        return list(cls.TASKS.keys())
Structured Stochasticity

Navigation

Related Topics

Source code for structured_stochasticity.tasks