<?php

declare(strict_types=1);

namespace Drupal\ai_dropsolid\Service;

use Drupal\ai\Utility\TextChunker;
use Drupal\ai\Utility\TokenizerInterface;

/**
 * Provides a token-aware text chunker with separator-based splitting.
 *
 * This chunker implements an efficient token-budget strategy that minimizes
 * expensive tokenization API calls while ensuring chunks stay within token
 * limits. It uses a three-phase approach:
 *
 * 1. **Density Probing** (≤3 token calls): Samples 3 strategic text segments
 *    (beginning, middle, end) to estimate the maximum token-to-character
 *    density across the document. This accounts for variance in token density
 *    based on content type (e.g., URLs, emoji, multilingual text).
 *
 * 2. **Character-Budget Chunking** (0 token calls): Converts the token limit
 *    to a conservative character budget using the probed density. Splits text
 *    on natural boundaries (paragraphs, sentences, words) using a prioritized
 *    separator hierarchy: "\n\n" > "\n" > ". " > "\t" > " ".
 *
 * 3. **Risk-Based Validation** (≤1 token call): Optionally validates the
 *    single "riskiest" chunk (longest, most non-ASCII, most emoji) to catch
 *    edge cases. If validation fails, re-chunks with a tighter budget.
 *
 * **Total API Calls**: 3-4 tokenization calls per document (vs. N calls for
 * per-chunk validation where N = number of chunks).
 *
 * **Example Usage**:
 * @code
 * $chunker = \Drupal::service('ai_dropsolid.token_aware_text_chunker');
 * $chunker->setModel('text-embedding-3-small');
 *
 * $text = "Long document text...";
 * $chunks = $chunker->chunkText($text, 350, 70);
 * // Returns: ["chunk1...", "chunk2...", "chunk3..."]
 * // Each chunk ≤350 tokens with 70-token overlap between consecutive chunks.
 * @endcode
 *
 * **How It Works**:
 *
 * For a 350-token limit document:
 * - Probes 3 segments (10%, 50%, 90% positions) to find max density
 * - If max observed density is 0.34 tokens/char, applies 3% variance buffer
 * - Guard density becomes max(0.36, 0.34 * 1.03) = 0.36 tokens/char
 * - Effective token budget: (350 - 10 safety) * 0.99 fill = 337 tokens
 * - Character budget: 337 / 0.36 = 936 characters per chunk
 * - Splits on separators, snapping to paragraph/sentence boundaries
 * - Validates the riskiest chunk (e.g., one with URLs or emoji)
 * - If validation passes, returns chunks without further API calls
 *
 * **Why This Approach**:
 * - Traditional per-chunk validation costs N tokenization API calls
 * - Density-based estimation reduces this to 3-4 calls total
 * - Separator-aware splitting preserves semantic coherence
 * - Conservative safety margins (10 tokens) + variance buffers (3%) prevent
 *   overruns while still achieving 95%+ token budget utilization
 *
 * @see \Drupal\ai\Utility\TextChunker
 * @see \Drupal\ai_dropsolid\Service\TokenAwareTextChunkerInterface
 */
final class TokenAwareTextChunker extends TextChunker implements TokenAwareTextChunkerInterface {

  /**
   * Ordered list of preferred separators (highest priority first).
   *
   * Separator hierarchy ensures chunks break on natural boundaries:
   * - "\n\n": Paragraph breaks (strongest semantic boundary)
   * - "\n": Line breaks (structural boundary)
   * - ". ": Sentence endings with space (semantic unit completion)
   * - "\t": Tab characters (formatted content boundaries)
   * - " ": Whitespace (word boundaries, last resort)
   *
   * Note: Standalone "." removed to prevent splitting URLs (e.g., ".com").
   */
  private const DEFAULT_SEPARATORS = ["\n\n", "\n", '. ', "\t", ' '];

  /**
   * Probe centres expressed as ratios of the document length.
   */
  private const PROBE_CENTERS = [0.10, 0.50, 0.90];

  /**
   * Minimum probe window in characters.
   */
  private const MIN_PROBE_WINDOW = 1200;

  /**
   * Maximum probe window in characters.
   */
  private const MAX_PROBE_WINDOW = 2200;

  /**
   * Divider used to scale probe window against document length.
   */
  private const WINDOW_FRACTION = 8;

  /**
   * Variance multiplier applied to the observed density guard.
   *
   * Adds a 3% safety buffer to the observed maximum token density to account
   * for content heterogeneity. This prevents edge cases where a dense segment
   * (e.g., technical terms, URLs) appears after the probe points.
   *
   * Lower values = tighter budgets (more API calls if chunks exceed limits).
   * Higher values = looser budgets (fewer API calls but lower utilization).
   *
   * Tuned to 0.03 (3%) for 95%+ budget utilization with <1% validation rate.
   */
  private const VARIANCE_FACTOR = 0.03;

  /**
   * Calibrated upper bound for tokens per character (XLM-R baseline).
   *
   * Based on XLM-RoBERTa tokenizer characteristics, which averages ~0.25
   * tokens/char for English prose and peaks at ~0.36 for dense content.
   * This serves as a conservative ceiling when observed density is lower.
   *
   * Used as: guard_density = max(MODEL_MAX_DENSITY, observed_density * 1.03)
   */
  private const MODEL_MAX_DENSITY = 0.36;

  /**
   * Tokens kept in reserve to protect against occasional spikes.
   *
   * Reserves 10 tokens from the maximum to handle edge cases like:
   * - Tokenizer model differences (slight variations between models)
   * - Rounding errors in character/token conversion
   * - Unexpected dense segments (e.g., base64 strings, code blocks)
   *
   * For a 350-token limit, this reserves ~3% as safety margin.
   */
  private const TOKEN_SAFETY_MARGIN = 10;

  /**
   * Target fill ratio applied to the effective token limit.
   *
   * Aims for 99% utilization of the effective token budget (after safety
   * margin). This balances high utilization with minimal overage risk.
   *
   * Combined with TOKEN_SAFETY_MARGIN, yields ~96% of original limit:
   * For 350 tokens: (350 - 10) * 0.99 = 336 effective tokens (~96%).
   */
  private const TARGET_FILL = 0.99;

  /**
   * Maximum fraction of the chunk budget allowed for overlap.
   *
   * Caps overlap at 60% of the character budget to ensure meaningful new
   * content in each chunk. If user requests 70-token overlap but char budget
   * is 900 chars (mapping to 324 tokens), this caps overlap at 540 chars.
   *
   * Prevents: Chunks with 90%+ overlap and minimal new information.
   */
  private const MAX_OVERLAP_FRACTION = 0.60;

  /**
   * Minimum fraction of the chunk budget enforced as stride.
   *
   * Ensures each chunk advances at least 40% of the character budget,
   * preventing infinitesimal progress through the document.
   *
   * For 900-char budget: min stride = 360 chars (even with high overlap).
   * This guarantees at least 40% new content per chunk.
   */
  private const MIN_STRIDE_FRACTION = 0.40;

  /**
   * Minimum fraction of the budget remaining after snapping.
   *
   * If boundary snapping would shrink a chunk below 75% of the target size,
   * the original (non-snapped) position is used instead. This prevents
   * overly aggressive snapping that creates many small chunks.
   *
   * For 900-char budget: chunk must be ≥675 chars after snapping.
   * If snapping yields 600 chars, ignore snap and use original 900-char cut.
   */
  private const MIN_POST_SNAP_FRACTION = 0.75;

  /**
   * Maximum number of re-chunking attempts after validation.
   */
  private const MAX_VALIDATION_ATTEMPTS = 1;

  /**
   * Reduction applied to the character budget when re-chunking.
   */
  private const VALIDATION_REDUCTION_RATIO = 0.90;

  /**
   * Search window for snapping to boundaries (in characters).
   */
  private const BOUNDARY_SEARCH_WINDOW = 400;

  /**
   * Regex pattern used to detect emoji characters.
   */
  private const EMOJI_PATTERN = '/\p{Extended_Pictographic}/u';

  /**
   * Regex pattern used to detect non-ASCII characters.
   */
  private const NON_ASCII_PATTERN = '/[^\x20-\x7E]/u';

  /**
   * Constructs a new TokenAwareTextChunker.
   *
   * This constructor explicitly defines the tokenizer dependency to maintain
   * clarity in the service definition and ensure proper type documentation.
   *
   * @param \Drupal\ai\Utility\TokenizerInterface $tokenizer
   *   The tokenizer service for token counting operations.
   *
   * @phpcs:disable Generic.CodeAnalysis.UselessOverridingMethod.Found
   */
  public function __construct(TokenizerInterface $tokenizer) {
    parent::__construct($tokenizer);
  }

  /**
   * {@inheritdoc}
   */
  public function chunkText(string $text, int $maxSize, int $minOverlap): array {
    $this->validateChunkingParameters($maxSize, $minOverlap);

    $normalizedText = $this->normalizeTextForProcessing($text);
    if ($normalizedText === '') {
      return [];
    }

    $plan = NULL;
    $hardMaxChars = (int) floor(($maxSize - self::TOKEN_SAFETY_MARGIN) / max(self::MODEL_MAX_DENSITY, 1e-6));

    if ($hardMaxChars > 0 && mb_strlen($normalizedText) <= $hardMaxChars) {
      $chunks = [$normalizedText];
    }
    else {
      $plan = $this->buildChunkingPlan($normalizedText, $maxSize, $minOverlap);
      $charBudget = $plan['char_budget'];
      $overlapChars = $plan['overlap_chars'];
      $minStride = $plan['min_stride'];
      $guardDensity = $plan['guard_density'];
      $textLength = mb_strlen($normalizedText);

      $chunks = [];
      $remainingAttempts = self::MAX_VALIDATION_ATTEMPTS;

      while (TRUE) {
        $currentOverlap = min($overlapChars, (int) floor(self::MAX_OVERLAP_FRACTION * $charBudget));
        $currentMinStride = min($charBudget, max($minStride, (int) floor(self::MIN_STRIDE_FRACTION * $charBudget)));

        $chunks = $this->chunkWithSeparators($normalizedText, $charBudget, $currentOverlap, $currentMinStride);
        $validationChunk = $this->pickChunkForValidation($chunks);

        if ($validationChunk === NULL) {
          break;
        }

        $tokens = $this->countTokens($validationChunk);
        if ($tokens <= $maxSize) {
          break;
        }

        $chunkLength = mb_strlen($validationChunk);
        if ($chunkLength > 0) {
          $observedDensity = $tokens / $chunkLength;
          $guardDensity = max($guardDensity, $observedDensity * (1 + self::VARIANCE_FACTOR));
          $recalculatedPlan = $this->deriveBudgetsFromDensity($guardDensity, $maxSize, $minOverlap, $textLength);

          $tokenRatio = max($recalculatedPlan['effective_limit'] / $tokens, 0.01);
          $nextCharBudget = (int) floor(max(1, min(
            $recalculatedPlan['char_budget'],
            $charBudget * $tokenRatio
          )));

          if ($nextCharBudget >= $charBudget) {
            $nextCharBudget = max(1, (int) floor($charBudget * self::VALIDATION_REDUCTION_RATIO));
          }

          $charBudget = $nextCharBudget;
          $overlapChars = min($recalculatedPlan['overlap_chars'], (int) floor(self::MAX_OVERLAP_FRACTION * $charBudget));
          $minStride = min($charBudget, max(1, (int) floor(self::MIN_STRIDE_FRACTION * $charBudget)));
        }
        else {
          $charBudget = max(1, (int) floor($charBudget * self::VALIDATION_REDUCTION_RATIO));
          $overlapChars = min($overlapChars, (int) floor(self::MAX_OVERLAP_FRACTION * $charBudget));
          $minStride = max(1, (int) floor(self::MIN_STRIDE_FRACTION * $charBudget));
        }

        if ($remainingAttempts === 0) {
          break;
        }
        $remainingAttempts--;
      }
    }

    $chunks = $this->sanitizeChunks($chunks);

    return $chunks;
  }

  /**
   * {@inheritdoc}
   */
  public function countTokens(string $text): int {
    if ($text === '') {
      return 0;
    }

    try {
      return $this->tokenizer->countTokens($text);
    }
    catch (\Throwable $throwable) {
      throw new \RuntimeException('Failed to count tokens using the configured tokenizer.', 0, $throwable);
    }
  }

  /**
   * {@inheritdoc}
   */
  public function setModel(string $model): void {
    $this->tokenizer->setModel($model);
  }

  /**
   * Builds the chunking plan (budgets and guard density).
   *
   * This method converts the token-based limits into character-based budgets
   * using density probes. The guard density includes a variance buffer to
   * account for content heterogeneity (e.g., dense technical terms, URLs).
   *
   * **Math Breakdown**:
   * 1. Probe token density at 3 positions → max observed density
   * 2. Guard density = max(MODEL_MAX_DENSITY, observed * (1 + VARIANCE_FACTOR))
   *    - MODEL_MAX_DENSITY (0.36): XLM-RoBERTa baseline (conservative ceiling)
   *    - VARIANCE_FACTOR (0.03): 3% buffer for density spikes
   * 3. Effective token limit = (maxTokens - SAFETY_MARGIN) * TARGET_FILL
   *    - SAFETY_MARGIN (10): Reserve tokens for edge cases
   *    - TARGET_FILL (0.99): Aim for 99% utilization
   * 4. Character budget = effective tokens / guard density
   * 5. Overlap chars = minOverlap tokens / guard density
   * 6. Min stride = 40% of char budget (ensures meaningful progress)
   *
   * **Example** (350 token limit):
   * - Observed density: 0.34 tokens/char
   * - Guard density: max(0.36, 0.34 * 1.03) = 0.36
   * - Effective limit: (350 - 10) * 0.99 = 336 tokens
   * - Char budget: 336 / 0.36 = 933 chars
   * - Overlap (70 tokens): 70 / 0.36 = 194 chars
   * - Min stride: 933 * 0.40 = 373 chars
   *
   * @param string $text
   *   The normalized text.
   * @param int $maxTokens
   *   Maximum tokens allowed per chunk.
   * @param int $minOverlap
   *   Minimum overlap in tokens.
   *
   * @return array
   *   Array containing the character budget, overlap characters, minimum
   *   stride, and guard density.
   */
  private function buildChunkingPlan(string $text, int $maxTokens, int $minOverlap): array {
    $length = mb_strlen($text);
    $maxObserved = $this->determineTokenDensity($text, $length);

    $guardDensity = max(self::MODEL_MAX_DENSITY, $maxObserved * (1 + self::VARIANCE_FACTOR));
    $plan = $this->deriveBudgetsFromDensity($guardDensity, $maxTokens, $minOverlap, $length);
    $plan['guard_density'] = $guardDensity;

    return $plan;
  }

  /**
   * Derives character budgets using a specific guard density.
   *
   * @param float $guardDensity
   *   The guard density to use for conversions.
   * @param int $maxTokens
   *   Maximum tokens allowed per chunk.
   * @param int $minOverlap
   *   Minimum overlap in tokens.
   * @param int $length
   *   The total character length of the text.
   *
   * @return array
   *   Array containing the character budget, overlap characters, minimum
   *   stride, and effective limit.
   */
  private function deriveBudgetsFromDensity(float $guardDensity, int $maxTokens, int $minOverlap, int $length): array {
    $guardDensity = max($guardDensity, 1e-6);

    $effectiveLimit = max(1, (int) floor(($maxTokens - self::TOKEN_SAFETY_MARGIN) * self::TARGET_FILL));

    $charBudget = max(1, (int) floor($effectiveLimit / $guardDensity));
    $charBudget = min($charBudget, $length);

    $overlapChars = $minOverlap > 0 ? max(1, (int) ceil($minOverlap / $guardDensity)) : 0;
    $overlapChars = min($overlapChars, (int) floor(self::MAX_OVERLAP_FRACTION * $charBudget));

    $minStride = max(1, (int) floor(self::MIN_STRIDE_FRACTION * $charBudget));

    return [
      'effective_limit' => $effectiveLimit,
      'char_budget' => $charBudget,
      'overlap_chars' => $overlapChars,
      'min_stride' => $minStride,
    ];
  }

  /**
   * Determines the maximum observed token density using probes.
   *
   * Probes strategic text segments to estimate the worst-case token density.
   * This approach assumes density can vary across a document (e.g., technical
   * jargon at the start, prose in the middle, URLs at the end). By sampling
   * multiple locations, we capture the maximum density and use it as a
   * conservative guard for character budget calculations.
   *
   * **Probe Strategy**:
   * - Short texts (<1200 chars): Single probe at 50% position
   * - Longer texts: Three probes at 10%, 50%, 90% positions
   * - Probe window: 1200-2200 chars (adaptive based on document length)
   *
   * **Why This Works**:
   * - Token density varies by content type (ASCII: ~0.25, emoji: ~0.50)
   * - Probing captures local maxima without tokenizing entire document
   * - Three strategic probes balance accuracy vs. API call cost
   *
   * @param string $text
   *   The normalized text.
   * @param int $length
   *   The character length of the text.
   *
   * @return float
   *   The maximum observed tokens per character ratio.
   */
  private function determineTokenDensity(string $text, int $length): float {
    if ($length === 0) {
      return 0.0;
    }

    $segments = $this->buildProbeSegments($text, $length);
    $segments = array_values(array_unique($segments));

    $densities = [];

    foreach ($segments as $segment) {
      if ($segment === '') {
        continue;
      }
      $segmentLength = mb_strlen($segment);
      if ($segmentLength === 0) {
        continue;
      }
      $tokens = max(1, $this->countTokens($segment));
      $densities[] = $tokens / $segmentLength;
    }

    if (empty($densities)) {
      return 0.0;
    }

    return (float) max($densities);
  }

  /**
   * Builds the probe segments used to estimate token density.
   *
   * @param string $text
   *   The normalized text.
   * @param int $length
   *   The character length of the text.
   *
   * @return array
   *   Array of probe segments.
   */
  private function buildProbeSegments(string $text, int $length): array {
    $window = $this->calculateProbeWindow($length);
    $centers = $this->determineProbeCenters($length);

    $segments = [];
    foreach ($centers as $center) {
      [$start, $end] = $this->resolveProbeBounds($text, $length, $window, $center);
      if ($end > $start) {
        $segments[] = mb_substr($text, $start, $end - $start);
      }
    }

    if (empty($segments)) {
      $segments[] = $text;
    }

    return $segments;
  }

  /**
   * Calculates the preferred probe window length.
   *
   * @param int $length
   *   The character length of the document.
   *
   * @return int
   *   Window length in characters.
   */
  private function calculateProbeWindow(int $length): int {
    if ($length <= self::MIN_PROBE_WINDOW) {
      return $length;
    }

    $window = (int) ceil($length / self::WINDOW_FRACTION);
    $window = max(self::MIN_PROBE_WINDOW, $window);
    return min(self::MAX_PROBE_WINDOW, $window);
  }

  /**
   * Determines probe centres based on document length.
   *
   * @param int $length
   *   The character length of the document.
   *
   * @return array
   *   Array of probe centre ratios.
   */
  private function determineProbeCenters(int $length): array {
    if ($length <= self::MIN_PROBE_WINDOW) {
      return [0.50];
    }
    return self::PROBE_CENTERS;
  }

  /**
   * Calculates probe bounds for a given centre.
   *
   * @param string $text
   *   The text.
   * @param int $length
   *   The text length.
   * @param int $window
   *   Desired window length.
   * @param float $center
   *   Centre expressed as ratio of the document length.
   *
   * @return array
   *   A two-element array [start, end].
   */
  private function resolveProbeBounds(string $text, int $length, int $window, float $center): array {
    $halfWindow = (int) floor($window / 2);
    $centerChar = (int) round($center * $length);
    $start = max(0, $centerChar - $halfWindow);
    $start = min($start, max(0, $length - $window));
    $start = $this->snapBackwardToBoundary($text, $start);

    $end = min($length, $start + $window);
    if ($end <= $start) {
      $end = min($length, $start + $window);
    }

    return [$start, $end];
  }

  /**
   * Chunks the text using the computed character budget and separators.
   *
   * This method implements the core splitting logic using a separator
   * hierarchy. It iterates through the text, creating chunks of approximately
   * the character budget size while snapping to natural boundaries.
   *
   * **Separator Priority** (highest to lowest):
   * 1. "\n\n" - Paragraph breaks (preserves document structure)
   * 2. "\n"   - Line breaks (preserves line structure)
   * 3. ". "   - Sentence endings (preserves semantic units)
   * 4. "\t"   - Tab boundaries (preserves formatted content)
   * 5. " "    - Word boundaries (last resort, preserves words)
   *
   * **Chunking Algorithm**:
   * 1. Set chunk end at position: start + charBudget
   * 2. Snap backward to nearest separator (within 400-char window)
   * 3. If snapped chunk < 75% of budget, use original position
   * 4. Extract chunk from start to end
   * 5. Calculate next start with overlap consideration
   * 6. Snap forward to next separator (ensures clean boundaries)
   * 7. Repeat until document end
   *
   * **Overlap Handling**:
   * - Target overlap: end - overlapChars
   * - Min stride enforced: 40% of char budget (prevents tiny steps)
   * - Actual next start: max(start + minStride, targetOverlap)
   *
   * **Last Chunk Handling**:
   * - If last chunk < 66% of char budget → merge with previous chunk
   * - Prevents tiny trailing chunks with minimal new content
   *
   * @param string $text
   *   The normalized text.
   * @param int $charBudget
   *   Allowed characters per chunk.
   * @param int $overlapChars
   *   Desired overlap in characters.
   * @param int $minStride
   *   Minimum stride enforced between consecutive chunks.
   *
   * @return array
   *   Raw (untrimmed) chunks.
   */
  private function chunkWithSeparators(string $text, int $charBudget, int $overlapChars, int $minStride): array {
    $chunks = [];
    $length = mb_strlen($text);
    $start = 0;
    $minChunkLength = max(1, (int) floor(self::MIN_POST_SNAP_FRACTION * $charBudget));
    $minChunkLength = min($minChunkLength, $charBudget);

    while ($start < $length) {
      $endGuess = min($length, $start + $charBudget);
      $end = $this->snapBackwardToBoundary($text, $endGuess);

      if (($end - $start) < $minChunkLength) {
        $end = $endGuess;
      }

      if ($end <= $start) {
        $end = min($length, max($start + 1, $endGuess));
      }

      $chunk = mb_substr($text, $start, $end - $start);

      if ($end >= $length) {
        // This is the last chunk - check if it's too small.
        // Use character threshold based on at least 66% of the char budget.
        $minMeaningfulChars = (int) floor($charBudget * 0.66);

        if (mb_strlen($chunk) < $minMeaningfulChars && !empty($chunks)) {
          // Merge with previous chunk instead of creating a tiny trailing
          // chunk.
          $lastIdx = count($chunks) - 1;
          $chunks[$lastIdx] .= $chunk;
        }
        elseif ($chunk !== '') {
          $chunks[] = $chunk;
        }
        break;
      }

      if ($chunk !== '') {
        $chunks[] = $chunk;
      }

      $targetStart = $overlapChars > 0 ? max(0, $end - $overlapChars) : $end;
      $nextStart = max($start + $minStride, $targetStart);
      $nextStart = $this->snapForwardFrom($text, $nextStart);
      if ($nextStart <= $start) {
        $nextStart = min($length, $start + $minStride);
      }

      $start = $nextStart;
    }

    return $chunks;
  }

  /**
   * Validates chunking parameters for correctness.
   *
   * @param int $maxSize
   *   The maximum chunk size in tokens.
   * @param int $minOverlap
   *   The minimum overlap in tokens.
   *
   * @throws \InvalidArgumentException
   *   When parameters are invalid.
   */
  private function validateChunkingParameters(int $maxSize, int $minOverlap): void {
    if ($maxSize <= 0) {
      throw new \InvalidArgumentException('Maximum chunk size must be a positive integer.');
    }

    if ($minOverlap < 0) {
      throw new \InvalidArgumentException('Minimum overlap cannot be negative.');
    }

    if ($minOverlap >= $maxSize) {
      throw new \InvalidArgumentException('Minimum overlap must be less than maximum chunk size.');
    }
  }

  /**
   * Normalizes text for consistent processing.
   *
   * This method performs comprehensive text normalization including:
   * - Line break standardization.
   * - Whitespace cleanup.
   * - Paragraph structure normalization.
   *
   * @param string $text
   *   The text to normalize.
   *
   * @return string
   *   The normalized text ready for processing.
   */
  private function normalizeTextForProcessing(string $text): string {
    // Normalize all line break variations to LF.
    $text = str_replace(["\r\n", "\r"], "\n", $text);

    // Remove leading and trailing whitespace from each line.
    $text = preg_replace('/^[ \t]+|[ \t]+$/m', '', $text) ?? $text;

    // Collapse any sequence of 2+ line breaks into exactly two newlines.
    // This preserves paragraph structure while removing excessive spacing.
    $text = preg_replace('/(\n\s*){2,}/', "\n\n", $text) ?? $text;

    // Remove leading and trailing newlines from the entire text.
    return trim($text, "\n");
  }

  /**
   * Picks the chunk that is most likely to exceed the token budget.
   *
   * This method implements risk-based validation to minimize API calls. Instead
   * of validating every chunk, we identify the single "riskiest" chunk using
   * heuristics. If this worst-case chunk passes validation, we assume all
   * others will too (since they were created with the same character budget).
   *
   * **Risk Scoring Heuristics**:
   * - Base score: Character length (longer = riskier)
   * - Non-ASCII bonus: +600 points per non-ASCII ratio (denser tokens)
   * - Emoji bonus: +900 points per emoji ratio (very dense tokens)
   *
   * **Why This Works**:
   * - Token density correlates with character type (ASCII < non-ASCII < emoji)
   * - Longest chunk with most special chars = worst-case scenario
   * - If worst-case passes, others (same budget, better content) will too
   * - Reduces N validations → 1 validation (N-1 fewer API calls)
   *
   * @param array $chunks
   *   The chunk candidates.
   *
   * @return string|null
   *   The riskiest chunk or NULL if no validation is required.
   */
  private function pickChunkForValidation(array $chunks): ?string {
    if (empty($chunks)) {
      return NULL;
    }

    $bestChunk = NULL;
    $bestScore = -INF;

    foreach ($chunks as $chunk) {
      $score = $this->calculateRiskScore($chunk);
      if ($score > $bestScore) {
        $bestScore = $score;
        $bestChunk = $chunk;
      }
    }

    return $bestChunk;
  }

  /**
   * Computes a heuristic risk score for a chunk.
   *
   * @param string $chunk
   *   The chunk to evaluate.
   *
   * @return float
   *   The risk score.
   */
  private function calculateRiskScore(string $chunk): float {
    $length = max(1, mb_strlen($chunk));
    $score = (float) $length;

    $nonAsciiMatches = [];
    $nonAscii = preg_match_all(self::NON_ASCII_PATTERN, $chunk, $nonAsciiMatches);
    if ($nonAscii > 0) {
      $score += ($nonAscii / $length) * 600;
    }

    $emojiMatches = [];
    $emoji = preg_match_all(self::EMOJI_PATTERN, $chunk, $emojiMatches);
    if ($emoji > 0) {
      $score += ($emoji / $length) * 900;
    }

    return $score;
  }

  /**
   * Sanitizes the chunks by trimming and removing empty entries.
   *
   * @param array $chunks
   *   Raw chunk list.
   *
   * @return array
   *   Cleaned chunk list.
   */
  private function sanitizeChunks(array $chunks): array {
    $chunks = array_map(static fn (string $chunk): string => trim($chunk), $chunks);
    $chunks = array_filter($chunks, static fn (string $chunk): bool => $chunk !== '');

    return array_values($chunks);
  }

  /**
   * Snaps position backwards to the nearest preferred separator.
   *
   * This method searches backward from a position to find the closest natural
   * text boundary. It uses a prioritized separator list to prefer paragraph
   * breaks over sentence breaks over word breaks.
   *
   * **Search Strategy**:
   * - Search window: 400 characters backward from position
   * - Priority: Earlier separator index = higher priority
   * - Tie-breaking: Closer distance wins, then higher priority wins
   *
   * **Why 400 Characters**:
   * - Average sentence: 75-150 characters
   * - Average paragraph: 200-400 characters
   * - 400-char window captures at least 2-3 sentences or 1 paragraph
   * - Balances boundary quality vs. search cost
   *
   * **Example**:
   * Position 936, text has "\n\n" at 920 and ". " at 930:
   * - Both within window, "\n\n" is closer (16 chars vs. 6 chars)
   * - But ". " wins (6 < 16 distance), returns position 932
   *
   * @param string $text
   *   The text.
   * @param int $position
   *   The starting position.
   *
   * @return int
   *   The snapped position.
   */
  private function snapBackwardToBoundary(string $text, int $position): int {
    $length = mb_strlen($text);
    $position = max(0, min($position, $length));

    if ($position === 0 || $length === 0) {
      return 0;
    }

    $windowStart = max(0, $position - self::BOUNDARY_SEARCH_WINDOW);
    $segmentLength = $position - $windowStart;

    if ($segmentLength <= 0) {
      return $position;
    }

    $segment = mb_substr($text, $windowStart, $segmentLength);

    $bestCandidate = $position;
    $bestDistance = PHP_INT_MAX;
    $bestPriority = PHP_INT_MAX;

    foreach (self::DEFAULT_SEPARATORS as $priority => $separator) {
      $separatorLength = mb_strlen($separator);
      if ($separatorLength === 0) {
        continue;
      }

      $offset = 0;
      while (($relative = mb_strpos($segment, $separator, $offset)) !== FALSE) {
        $offset = $relative + 1;
        $candidate = $windowStart + $relative + $separatorLength;

        if ($candidate > $position) {
          continue;
        }

        $distance = $position - $candidate;
        if ($distance < $bestDistance || ($distance === $bestDistance && $priority < $bestPriority)) {
          $bestDistance = $distance;
          $bestPriority = $priority;
          $bestCandidate = $candidate;
        }
      }
    }

    if ($bestDistance !== PHP_INT_MAX) {
      return max(0, $bestCandidate);
    }

    return $position;
  }

  /**
   * Snaps position forward to the next preferred separator.
   *
   * @param string $text
   *   The text.
   * @param int $position
   *   The starting position.
   *
   * @return int
   *   The snapped position.
   */
  private function snapForwardFrom(string $text, int $position): int {
    $length = mb_strlen($text);
    $position = max(0, min($position, $length));

    if ($position >= $length) {
      return $length;
    }

    $windowEnd = min($length, $position + self::BOUNDARY_SEARCH_WINDOW);
    $segmentLength = $windowEnd - $position;

    if ($segmentLength <= 0) {
      return $position;
    }

    $segment = mb_substr($text, $position, $segmentLength);
    $bestCandidate = $position;
    $bestDistance = PHP_INT_MAX;
    $bestPriority = PHP_INT_MAX;

    foreach (self::DEFAULT_SEPARATORS as $priority => $separator) {
      $separatorLength = mb_strlen($separator);
      if ($separatorLength === 0) {
        continue;
      }

      $offset = 0;
      while (($relative = mb_strpos($segment, $separator, $offset)) !== FALSE) {
        $offset = $relative + 1;

        $candidate = $position + $relative + $separatorLength;
        if ($candidate < $position) {
          continue;
        }

        $distance = $candidate - $position;
        if ($distance < $bestDistance || ($distance === $bestDistance && $priority < $bestPriority)) {
          $bestDistance = $distance;
          $bestPriority = $priority;
          $bestCandidate = $candidate;
        }
      }
    }

    if ($bestDistance !== PHP_INT_MAX) {
      return min($length, $bestCandidate);
    }

    return $position;
  }

}
