# Source code for structured_stochasticity.injection

"""
Noise injection strategies for structured stochasticity.

This module provides different approaches to injecting noise into hidden states:
- Gaussian: Standard normal noise scaled by a factor
- Uniform: Uniform noise in a range
- Annealed: Noise that decreases over the generation process

The key insight is that noise injection should enable "trajectory resampling" -
allowing the model to escape local optima in reasoning space, analogous to how
unexpected external stimuli can help humans break out of thinking loops.
"""

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn as nn


@dataclass
class InjectionState:
    """Tracks state across injection calls for stateful strategies."""

    # Number of injection steps taken so far in the current generation.
    step: int = 0
    # Optional total step budget; not consumed by the strategies in this module.
    total_steps: Optional[int] = None

    def advance(self):
        """Move the counter forward by one injection step."""
        self.step = self.step + 1

    def reset(self):
        """Return the counter to step zero (call between generations)."""
        self.step = 0
class NoiseInjector(ABC):
    """Abstract base class for noise injection strategies.

    Subclasses implement :meth:`sample`; this base class handles adding the
    sampled noise to hidden states and tracking the per-generation step state.
    """

    def __init__(self, scale: float = 0.1, device: str = "cuda"):
        # Base magnitude of the perturbation; interpretation is subclass-specific.
        self.scale = scale
        # Device on which noise tensors are allocated.
        self.device = device
        # Step counter used by stateful strategies (e.g. annealing).
        self.state = InjectionState()

    @abstractmethod
    def sample(self, shape: tuple[int, ...]) -> torch.Tensor:
        """Sample a noise tensor of the given shape."""

    def inject(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Add noise to hidden states and advance the step counter.

        Args:
            hidden_states: Tensor of shape (batch, seq_len, hidden_dim).

        Returns:
            Perturbed hidden states of the same shape.
        """
        perturbation = self.sample(hidden_states.shape)
        # Move the noise onto the same device as the input before adding,
        # in case self.device differs from where hidden_states live.
        perturbation = perturbation.to(hidden_states.device)
        self.state.advance()
        return hidden_states + perturbation

    def reset(self):
        """Reset internal state (call between generations)."""
        self.state.reset()

    def __repr__(self):
        return f"{self.__class__.__name__}(scale={self.scale})"
class GaussianNoiseInjector(NoiseInjector):
    """
    Injects Gaussian noise scaled by a constant factor.

    This is the simplest strategy: z ~ N(0, scale²). The scale parameter
    controls the magnitude of perturbation:

    - Too small: won't escape attractor basins.
    - Too large: destroys coherent reasoning.
    """

    def __init__(self, scale: float = 0.1, device: str = "cuda"):
        super().__init__(scale, device)

    def sample(self, shape: tuple[int, ...]) -> torch.Tensor:
        """Draw standard-normal noise and scale it by ``self.scale``."""
        unit_noise = torch.randn(shape, device=self.device)
        return unit_noise * self.scale
class UniformNoiseInjector(NoiseInjector):
    """
    Injects uniform noise in the range [-scale, scale].

    Uniform noise has bounded magnitude, which may be preferable when you
    want to guarantee perturbations stay within a range.
    """

    def __init__(self, scale: float = 0.1, device: str = "cuda"):
        super().__init__(scale, device)

    def sample(self, shape: tuple[int, ...]) -> torch.Tensor:
        """Map U(0, 1) draws onto [-scale, scale]."""
        unit = torch.rand(shape, device=self.device)
        # Shift U(0, 1) to U(-1, 1), then scale to the target range.
        return (unit * 2 - 1) * self.scale
class AnnealedNoiseInjector(NoiseInjector):
    """
    Injects Gaussian noise whose scale decays over the generation process.

    Motivation: strong perturbation early (when problem framing matters most)
    tapering to stability later (when the solution is crystallizing).

    Scale at step t: ``scale * anneal_factor ** t``, floored at ``min_scale``.

    This mirrors a natural intuition: you want to explore different framings
    early, then commit and execute once you've found a good path.
    """

    def __init__(
        self,
        scale: float = 0.1,
        anneal_factor: float = 0.95,
        min_scale: float = 0.01,
        device: str = "cuda",
    ):
        super().__init__(scale, device)
        # Multiplicative decay applied per injection step.
        self.anneal_factor = anneal_factor
        # Floor so the noise never vanishes entirely.
        self.min_scale = min_scale

    @property
    def current_scale(self) -> float:
        """Decayed scale for the current step, floored at ``min_scale``."""
        decayed = self.scale * self.anneal_factor ** self.state.step
        if decayed < self.min_scale:
            return self.min_scale
        return decayed

    def sample(self, shape: tuple[int, ...]) -> torch.Tensor:
        """Draw standard-normal noise scaled by the current annealed scale."""
        return self.current_scale * torch.randn(shape, device=self.device)
class LayerSelectiveInjector(NoiseInjector):
    """
    Applies different noise scales to different layers.

    This allows testing the hypothesis that early layers (problem framing)
    vs late layers (output realization) have different sensitivity to
    perturbation.

    Args:
        layer_scales: Dict mapping layer index to noise scale.
        default_scale: Scale for layers not present in ``layer_scales``.
    """

    def __init__(
        self,
        layer_scales: dict[int, float],
        default_scale: float = 0.0,
        device: str = "cuda",
    ):
        super().__init__(scale=default_scale, device=device)
        self.layer_scales = layer_scales
        # Layer currently being injected into; None means "use the default scale".
        self.current_layer: Optional[int] = None

    def set_layer(self, layer_idx: int):
        """Record which layer we're currently injecting into."""
        self.current_layer = layer_idx

    @property
    def current_scale(self) -> float:
        """Scale for the active layer, falling back to the default scale."""
        active = self.current_layer
        if active is None:
            return self.scale
        return self.layer_scales.get(active, self.scale)

    def sample(self, shape: tuple[int, ...]) -> torch.Tensor:
        """Draw standard-normal noise scaled for the currently active layer."""
        return self.current_scale * torch.randn(shape, device=self.device)
class OncePerGenerationInjector(NoiseInjector):
    """
    Samples noise once and reuses it for the entire generation.

    This corresponds to the formalism in the paper:

        z ~ P(z|X)   [sampled once]
        h = f_θ(X, z)

    The same z influences all tokens, creating a consistent "reasoning
    trajectory" rather than per-token perturbation.
    """

    def __init__(
        self,
        scale: float = 0.1,
        latent_dim: Optional[int] = None,
        device: str = "cuda",
    ):
        super().__init__(scale, device)
        # Dimensionality of a low-rank latent (currently unused; see sample()).
        self.latent_dim = latent_dim
        # Noise reused across calls until reset() or a shape change.
        self._cached_noise: Optional[torch.Tensor] = None
        self._cached_shape: Optional[tuple] = None

    def sample(self, shape: tuple[int, ...]) -> torch.Tensor:
        """Return the cached noise, resampling only if the shape changed."""
        cache_is_stale = self._cached_noise is None or self._cached_shape != shape
        if cache_is_stale:
            if self.latent_dim is not None:
                # Sample low-dim latent and project.
                # For now, simple approach: sample full shape.
                # TODO: Add learned projection from low-dim latent
                pass
            self._cached_noise = torch.randn(shape, device=self.device) * self.scale
            self._cached_shape = shape
        return self._cached_noise

    def reset(self):
        """Reset so a fresh noise sample is drawn on the next generation."""
        super().reset()
        self._cached_noise = None
        self._cached_shape = None
def create_injector(
    strategy: str,
    scale: float = 0.1,
    device: str = "cuda",
    **kwargs,
) -> NoiseInjector:
    """
    Factory function to create noise injectors.

    Args:
        strategy: One of "gaussian", "uniform", "annealed", "once",
            "layer_selective".
        scale: Base noise scale. For "layer_selective" it is forwarded as
            ``default_scale`` (unless ``default_scale`` is passed explicitly
            in **kwargs), since LayerSelectiveInjector has no ``scale``
            parameter.
        device: Torch device.
        **kwargs: Strategy-specific arguments (e.g. ``layer_scales`` for
            "layer_selective", ``anneal_factor`` for "annealed").

    Returns:
        Configured NoiseInjector instance.

    Raises:
        ValueError: If ``strategy`` is not one of the known strategy names.
    """
    strategies = {
        "gaussian": GaussianNoiseInjector,
        "uniform": UniformNoiseInjector,
        "annealed": AnnealedNoiseInjector,
        "once": OncePerGenerationInjector,
        "layer_selective": LayerSelectiveInjector,
    }
    if strategy not in strategies:
        raise ValueError(f"Unknown strategy: {strategy}. Choose from {list(strategies.keys())}")
    if strategy == "layer_selective":
        # Bug fix: LayerSelectiveInjector.__init__ accepts ``default_scale``,
        # not ``scale`` — passing scale= previously raised a TypeError here.
        kwargs.setdefault("default_scale", scale)
        return strategies[strategy](device=device, **kwargs)
    return strategies[strategy](scale=scale, device=device, **kwargs)