# Source code for structured_stochasticity.injection

"""
Noise injection strategies for structured stochasticity.

This module provides different approaches to injecting noise into hidden states:
- Gaussian: Standard normal noise scaled by a factor
- Uniform: Uniform noise in a range
- Annealed: Noise that decreases over the generation process

The key insight is that noise injection should enable "trajectory resampling" -
allowing the model to escape local optima in reasoning space, analogous to how
unexpected external stimuli can help humans break out of thinking loops.
"""

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn as nn


@dataclass
class InjectionState:
    """Tracks state across injection calls for stateful strategies."""

    # Number of injection steps taken so far in the current generation.
    step: int = 0
    # Optional total step budget; not consumed by the strategies in this module.
    total_steps: Optional[int] = None

    def advance(self):
        """Move the counter forward by one injection step."""
        self.step = self.step + 1

    def reset(self):
        """Return the counter to step zero (call between generations)."""
        self.step = 0
class NoiseInjector(ABC):
    """Abstract base class for noise injection strategies.

    Subclasses implement :meth:`sample`; this base class handles adding the
    sampled noise to hidden states and tracking the per-generation step state.
    """

    def __init__(self, scale: float = 0.1, device: str = "cuda"):
        # Base magnitude of the perturbation; interpretation is subclass-specific.
        self.scale = scale
        # Device on which noise tensors are allocated.
        self.device = device
        # Step counter used by stateful strategies (e.g. annealing).
        self.state = InjectionState()

    @abstractmethod
    def sample(self, shape: tuple[int, ...]) -> torch.Tensor:
        """Sample a noise tensor of the given shape."""

    def inject(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Add noise to hidden states and advance the step counter.

        Args:
            hidden_states: Tensor of shape (batch, seq_len, hidden_dim).

        Returns:
            Perturbed hidden states of the same shape.
        """
        perturbation = self.sample(hidden_states.shape)
        # Move the noise onto the same device as the input before adding,
        # in case self.device differs from where hidden_states live.
        perturbation = perturbation.to(hidden_states.device)
        self.state.advance()
        return hidden_states + perturbation

    def reset(self):
        """Reset internal state (call between generations)."""
        self.state.reset()

    def __repr__(self):
        return f"{self.__class__.__name__}(scale={self.scale})"
class GaussianNoiseInjector(NoiseInjector):
    """
    Injects Gaussian noise scaled by a constant factor.

    This is the simplest strategy: z ~ N(0, scale²). The scale parameter
    controls the magnitude of perturbation:

    - Too small: won't escape attractor basins.
    - Too large: destroys coherent reasoning.
    """

    def __init__(self, scale: float = 0.1, device: str = "cuda"):
        super().__init__(scale, device)

    def sample(self, shape: tuple[int, ...]) -> torch.Tensor:
        """Draw standard-normal noise and scale it by ``self.scale``."""
        unit_noise = torch.randn(shape, device=self.device)
        return unit_noise * self.scale
class UniformNoiseInjector(NoiseInjector):
    """
    Injects uniform noise in the range [-scale, scale].

    Uniform noise has bounded magnitude, which may be preferable when you
    want to guarantee perturbations stay within a range.
    """

    def __init__(self, scale: float = 0.1, device: str = "cuda"):
        super().__init__(scale, device)

    def sample(self, shape: tuple[int, ...]) -> torch.Tensor:
        """Map U(0, 1) draws onto [-scale, scale]."""
        unit = torch.rand(shape, device=self.device)
        # Shift U(0, 1) to U(-1, 1), then scale to the target range.
        return (unit * 2 - 1) * self.scale
class AnnealedNoiseInjector(NoiseInjector):
    """
    Injects Gaussian noise whose scale decays over the generation process.

    Motivation: strong perturbation early (when problem framing matters most)
    tapering to stability later (when the solution is crystallizing).

    Scale at step t: ``scale * anneal_factor ** t``, floored at ``min_scale``.

    This mirrors a natural intuition: you want to explore different framings
    early, then commit and execute once you've found a good path.
    """

    def __init__(
        self,
        scale: float = 0.1,
        anneal_factor: float = 0.95,
        min_scale: float = 0.01,
        device: str = "cuda",
    ):
        super().__init__(scale, device)
        # Multiplicative decay applied per injection step.
        self.anneal_factor = anneal_factor
        # Floor so the noise never vanishes entirely.
        self.min_scale = min_scale

    @property
    def current_scale(self) -> float:
        """Decayed scale for the current step, floored at ``min_scale``."""
        decayed = self.scale * self.anneal_factor ** self.state.step
        if decayed < self.min_scale:
            return self.min_scale
        return decayed

    def sample(self, shape: tuple[int, ...]) -> torch.Tensor:
        """Draw standard-normal noise scaled by the current annealed scale."""
        return self.current_scale * torch.randn(shape, device=self.device)
class LayerSelectiveInjector(NoiseInjector):
    """
    Applies different noise scales to different layers.

    This allows testing the hypothesis that early layers (problem framing)
    vs late layers (output realization) have different sensitivity to
    perturbation.

    Args:
        layer_scales: Dict mapping layer index to noise scale.
        default_scale: Scale for layers not present in ``layer_scales``.
    """

    def __init__(
        self,
        layer_scales: dict[int, float],
        default_scale: float = 0.0,
        device: str = "cuda",
    ):
        super().__init__(scale=default_scale, device=device)
        self.layer_scales = layer_scales
        # Layer currently being injected into; None means "use the default scale".
        self.current_layer: Optional[int] = None

    def set_layer(self, layer_idx: int):
        """Record which layer we're currently injecting into."""
        self.current_layer = layer_idx

    @property
    def current_scale(self) -> float:
        """Scale for the active layer, falling back to the default scale."""
        active = self.current_layer
        if active is None:
            return self.scale
        return self.layer_scales.get(active, self.scale)

    def sample(self, shape: tuple[int, ...]) -> torch.Tensor:
        """Draw standard-normal noise scaled for the currently active layer."""
        return self.current_scale * torch.randn(shape, device=self.device)
class OncePerGenerationInjector(NoiseInjector):
    """
    Samples noise once and reuses it for the entire generation.

    This corresponds to the formalism in the paper:

        z ~ P(z|X)   [sampled once]
        h = f_θ(X, z)

    The same z influences all tokens, creating a consistent "reasoning
    trajectory" rather than per-token perturbation.
    """

    def __init__(
        self,
        scale: float = 0.1,
        latent_dim: Optional[int] = None,
        device: str = "cuda",
    ):
        super().__init__(scale, device)
        # Dimensionality of a low-rank latent (currently unused; see sample()).
        self.latent_dim = latent_dim
        # Noise reused across calls until reset() or a shape change.
        self._cached_noise: Optional[torch.Tensor] = None
        self._cached_shape: Optional[tuple] = None

    def sample(self, shape: tuple[int, ...]) -> torch.Tensor:
        """Return the cached noise, resampling only if the shape changed."""
        cache_is_stale = self._cached_noise is None or self._cached_shape != shape
        if cache_is_stale:
            if self.latent_dim is not None:
                # Sample low-dim latent and project.
                # For now, simple approach: sample full shape.
                # TODO: Add learned projection from low-dim latent
                pass
            self._cached_noise = torch.randn(shape, device=self.device) * self.scale
            self._cached_shape = shape
        return self._cached_noise

    def reset(self):
        """Reset so a fresh noise sample is drawn on the next generation."""
        super().reset()
        self._cached_noise = None
        self._cached_shape = None
def create_injector(
    strategy: str,
    scale: float = 0.1,
    device: str = "cuda",
    **kwargs,
) -> NoiseInjector:
    """
    Factory function to create noise injectors.

    Args:
        strategy: One of "gaussian", "uniform", "annealed", "once",
            "layer_selective".
        scale: Base noise scale. For "layer_selective" it is forwarded as
            ``default_scale`` (unless ``default_scale`` is passed explicitly
            in **kwargs), since LayerSelectiveInjector has no ``scale``
            parameter.
        device: Torch device.
        **kwargs: Strategy-specific arguments (e.g. ``layer_scales`` for
            "layer_selective", ``anneal_factor`` for "annealed").

    Returns:
        Configured NoiseInjector instance.

    Raises:
        ValueError: If ``strategy`` is not one of the known strategy names.
    """
    strategies = {
        "gaussian": GaussianNoiseInjector,
        "uniform": UniformNoiseInjector,
        "annealed": AnnealedNoiseInjector,
        "once": OncePerGenerationInjector,
        "layer_selective": LayerSelectiveInjector,
    }
    if strategy not in strategies:
        raise ValueError(f"Unknown strategy: {strategy}. Choose from {list(strategies.keys())}")
    if strategy == "layer_selective":
        # Bug fix: LayerSelectiveInjector.__init__ accepts ``default_scale``,
        # not ``scale`` — passing scale= previously raised a TypeError here.
        kwargs.setdefault("default_scale", scale)
        return strategies[strategy](device=device, **kwargs)
    return strategies[strategy](scale=scale, device=device, **kwargs)