<?php

declare(strict_types=1);

namespace Drupal\ai_dropsolid\Service;

use Drupal\ai\Utility\TextChunker;
use Drupal\ai\Utility\TextChunkerInterface;
use Drupal\ai\Utility\TokenizerInterface;
use Drupal\ai_dropsolid\Utility\RecursiveCharacterTextSplitter;

/**
 * Provides a recursive text chunker service for Dropsolid AI integrations.
 *
 * This service converts token-based size constraints to character-based
 * limits using a fixed heuristic, normalizes the input text, and delegates the
 * actual splitting to RecursiveCharacterTextSplitter with a fixed separator
 * hierarchy.
 *
 * Key features:
 * - Token-to-character conversion with a fixed multiplier and safety buffer
 * - Recursive splitting with separator hierarchy
 * - Overlap between chunks, expressed in characters
 * - Text normalization for consistent processing
 */
final class RecursiveTextChunker extends TextChunker implements TextChunkerInterface, RecursiveTextChunkerInterface {

  /**
   * Approximate multiplier to convert tokens to characters.
   *
   * This heuristic assumes ~4 characters per token for English text.
   * Adjust based on your specific tokenization model and language requirements.
   */
  private const TOKEN_TO_CHARACTER_MULTIPLIER = 4;

  /**
   * Buffer factor applied to character calculations.
   *
   * This safety margin helps ensure character-based chunking doesn't exceed
   * token limits, accounting for tokenization variations.
   */
  private const BUFFER_FACTOR = 0.9;

  /**
   * Default separators for the recursive text splitter.
   *
   * Ordered from coarsest (paragraph break) to finest (empty string).
   */
  private const DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", " ", ""];

  /**
   * Constructs a new RecursiveTextChunker.
   *
   * @param \Drupal\ai\Utility\TokenizerInterface $tokenizer
   *   The tokenizer service for token counting operations.
   */
  public function __construct(
    protected TokenizerInterface $tokenizer,
  ) {
    // The promoted property above stores the tokenizer on this instance;
    // parent::__construct() is not invoked here.
    // NOTE(review): confirm the parent constructor performs no additional
    // initialization beyond assigning the tokenizer.
  }

  /**
   * {@inheritdoc}
   *
   * The token-based $maxSize and $minOverlap are converted to approximate
   * character counts before splitting. Text that normalizes to an empty
   * string yields an empty array.
   *
   * @throws \InvalidArgumentException
   *   When $maxSize is not positive, $minOverlap is negative, or $minOverlap
   *   is not smaller than $maxSize.
   */
  public function chunkText(string $text, int $maxSize, int $minOverlap): array {
    $this->validateChunkingParameters($maxSize, $minOverlap);

    // Normalize up front so input without any remaining content can
    // short-circuit instead of being handed to the splitter.
    $normalizedText = $this->normalizeTextForProcessing($text);
    if ($normalizedText === '') {
      return [];
    }

    // Convert token-based constraints to character-based with safety buffer,
    // clamping to the minimum viable values for the splitter.
    $characterChunkSize = max(1, $this->convertTokensToCharacters($maxSize));
    $characterOverlap = max(0, $this->convertTokensToCharacters($minOverlap));

    $textSplitter = new RecursiveCharacterTextSplitter(
      chunkSize: $characterChunkSize,
      chunkOverlap: $characterOverlap,
      separators: self::DEFAULT_SEPARATORS,
    );

    return $textSplitter->split($normalizedText);
  }

  /**
   * {@inheritdoc}
   */
  public function countTokens(string $text): int {
    return $this->tokenizer->countTokens($text);
  }

  /**
   * {@inheritdoc}
   */
  public function setModel(string $model): void {
    $this->tokenizer->setModel($model);
  }

  /**
   * Validates chunking parameters for correctness.
   *
   * @param int $maxSize
   *   The maximum chunk size in tokens.
   * @param int $minOverlap
   *   The minimum overlap in tokens.
   *
   * @throws \InvalidArgumentException
   *   When parameters are invalid.
   */
  private function validateChunkingParameters(int $maxSize, int $minOverlap): void {
    if ($maxSize <= 0) {
      throw new \InvalidArgumentException('Maximum chunk size must be a positive integer.');
    }

    if ($minOverlap < 0) {
      throw new \InvalidArgumentException('Minimum overlap cannot be negative.');
    }

    if ($minOverlap >= $maxSize) {
      throw new \InvalidArgumentException('Minimum overlap must be less than maximum chunk size.');
    }
  }

  /**
   * Converts token count to approximate character count.
   *
   * @param int $tokenCount
   *   The number of tokens to convert.
   *
   * @return int
   *   The approximate character count with buffer applied, rounded to the
   *   nearest integer.
   */
  private function convertTokensToCharacters(int $tokenCount): int {
    return (int) round($tokenCount * self::TOKEN_TO_CHARACTER_MULTIPLIER * self::BUFFER_FACTOR);
  }

  /**
   * Normalizes text for consistent processing.
   *
   * This method performs text normalization including:
   * - Line break standardization (CRLF and CR become LF).
   * - Removal of leading/trailing spaces and tabs on every line.
   * - Collapsing runs of blank lines into a single paragraph break.
   * - Removal of leading/trailing newlines from the whole text.
   *
   * If a regular expression step fails, that step is skipped and the text is
   * carried forward unchanged rather than raising a type error.
   *
   * @param string $text
   *   The text to normalize.
   *
   * @return string
   *   The normalized text ready for processing.
   */
  private function normalizeTextForProcessing(string $text): string {
    // Normalize all line break variations to LF.
    $text = str_replace(["\r\n", "\r"], "\n", $text);

    // Remove leading and trailing spaces/tabs from each line.
    $text = $this->replacePattern('/^[ \t]+|[ \t]+$/m', '', $text);

    // Collapse any sequence of 2+ line breaks into exactly two newlines.
    // This preserves paragraph structure while removing excessive spacing.
    $text = $this->replacePattern('/(\n\s*){2,}/', "\n\n", $text);

    // Remove leading and trailing newlines from the entire text.
    return trim($text, "\n");
  }

  /**
   * Runs preg_replace() and falls back to the subject on PCRE failure.
   *
   * preg_replace() returns NULL when the regex engine reports an error; with
   * strict types enabled that NULL would trigger a TypeError further down.
   *
   * @param string $pattern
   *   The regular expression pattern.
   * @param string $replacement
   *   The replacement string.
   * @param string $subject
   *   The text to operate on.
   *
   * @return string
   *   The replaced text, or $subject unchanged if the replacement failed.
   */
  private function replacePattern(string $pattern, string $replacement, string $subject): string {
    return preg_replace($pattern, $replacement, $subject) ?? $subject;
  }

}
