<?php

declare(strict_types=1);

namespace Drupal\ai_dropsolid\Utility;

/**
 * Provides a recursive text splitter with configurable overlap.
 *
 * This dependency-free text splitter follows a hierarchical approach to text
 * segmentation, recursively splitting by separator precedence until all
 * segments meet the configured size constraints. It then intelligently
 * re-merges segments with optional overlap while respecting text boundaries.
 *
 * Key features:
 * - Normalizes excessive blank lines to prevent orphaned headings
 * - Recursively splits using ordered separator hierarchy
 * - Greedily re-merges segments into final chunks
 * - Maintains character overlap between chunks without breaking words/headings
 *
 * @example
 * @code
 *   $splitter = new RecursiveCharacterTextSplitter(
 *     chunkSize: 1000,
 *     chunkOverlap: 200,
 *     separators: ["\n\n", "\n", ". ", " ", ""]
 *   );
 *   $chunks = $splitter->split($text);
 * @endcode
 */
final class RecursiveCharacterTextSplitter {

  /**
   * The default separator hierarchy for text splitting.
   */
  private const DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", " ", ""];

  /**
   * Minimum share of the overlap budget a boundary-aligned overlap must keep.
   *
   * An overlap is only moved forward to a separator boundary when at least
   * this fraction of the allowed overlap length remains afterwards.
   */
  private const MIN_OVERLAP_THRESHOLD = 0.6;

  /**
   * Maximum length of a chunk in characters.
   */
  private readonly int $chunkSize;

  /**
   * Desired number of overlapping characters between chunks.
   */
  private readonly int $chunkOverlap;

  /**
   * Ordered list of separators from coarse to fine-grained.
   *
   * @var string[]
   */
  private readonly array $separators;

  /**
   * Constructs a new RecursiveCharacterTextSplitter.
   *
   * @param int $chunkSize
   *   Maximum characters per chunk. Must be positive.
   * @param int $chunkOverlap
   *   Characters of overlap between chunks. Must be less than chunk size.
   * @param array $separators
   *   Separator hierarchy from coarse to fine. Every entry must be a string.
   *   Including an empty string ("") enables character-level fallback
   *   splitting; without it, text that contains none of the separators may
   *   produce a chunk longer than the chunk size.
   *
   * @throws \InvalidArgumentException
   *   When chunk size is non-positive, overlap is negative or not smaller than
   *   the chunk size, or a separator is not a string.
   */
  public function __construct(
    int $chunkSize = 1000,
    int $chunkOverlap = 200,
    array $separators = self::DEFAULT_SEPARATORS,
  ) {
    if ($chunkSize <= 0) {
      throw new \InvalidArgumentException('Chunk size must be a positive integer.');
    }

    if ($chunkOverlap < 0) {
      throw new \InvalidArgumentException('Chunk overlap cannot be negative.');
    }

    if ($chunkOverlap >= $chunkSize) {
      throw new \InvalidArgumentException('Chunk overlap must be less than chunk size.');
    }

    foreach ($separators as $separator) {
      if (!is_string($separator)) {
        throw new \InvalidArgumentException('Separators must be strings.');
      }
    }

    $this->chunkSize = $chunkSize;
    $this->chunkOverlap = $chunkOverlap;
    // Re-index so the hierarchy can be walked by position even when the
    // caller passed an associative or sparse array.
    $this->separators = array_values($separators);
  }

  /**
   * Splits the provided text into appropriately sized chunks.
   *
   * The text is normalized first (see normalizeText()). Every returned chunk
   * is a trimmed, verbatim slice of that normalized text: separators are kept
   * in place rather than being replaced by a fixed glue string.
   *
   * @param string $text
   *   The input text to be split into chunks.
   *
   * @return string[]
   *   The chunks in document order. Empty when the text is blank.
   */
  public function split(string $text): array {
    if (trim($text) === '') {
      return [];
    }

    $normalizedText = $this->normalizeText($text);
    $leafSegments = $this->recursiveSplit($normalizedText, 0);

    return $this->mergeSegmentsWithOverlap($leafSegments);
  }

  /**
   * Normalizes line breaks and collapses excessive blank lines.
   *
   * @param string $text
   *   The text to normalize.
   *
   * @return string
   *   The normalized text.
   */
  private function normalizeText(string $text): string {
    // Normalize all line break variations to LF.
    $text = str_replace(["\r\n", "\r"], "\n", $text);

    // Remove leading and trailing spaces/tabs from each line. preg_replace()
    // returns NULL on a PCRE failure; keep the unmodified text in that case
    // instead of passing NULL on (a TypeError under strict types).
    $text = preg_replace('/^[ \t]+|[ \t]+$/m', '', $text) ?? $text;

    // Collapse three or more consecutive newlines to exactly two.
    $text = preg_replace('/(\n\s*){3,}/', "\n\n", $text) ?? $text;

    return trim($text);
  }

  /**
   * Recursively splits text using the configured separator hierarchy.
   *
   * Each separator stays attached to the end of the segment it terminated, so
   * concatenating the returned segments reproduces $text exactly.
   *
   * @param string $text
   *   The text segment to split.
   * @param int $separatorIndex
   *   Current position in the separator hierarchy.
   *
   * @return string[]
   *   Leaf segments. Their trimmed length is within the chunk size unless the
   *   separator hierarchy is exhausted before the text is small enough.
   */
  private function recursiveSplit(string $text, int $separatorIndex): array {
    // Chunks are trimmed when emitted, so surrounding whitespace (including a
    // trailing separator) does not count towards the size limit.
    if (mb_strlen(trim($text)) <= $this->chunkSize) {
      return [$text];
    }

    // Skip separators that do not occur in this text.
    $separatorCount = count($this->separators);
    while (
      $separatorIndex < $separatorCount
      && $this->separators[$separatorIndex] !== ''
      && !str_contains($text, $this->separators[$separatorIndex])
    ) {
      $separatorIndex++;
    }

    // Hierarchy exhausted: return the oversized text as a single segment.
    if ($separatorIndex >= $separatorCount) {
      return [$text];
    }

    $separator = $this->separators[$separatorIndex];

    // Character-level fallback: slice into fixed windows. The window leaves
    // room for the configured overlap so it can still be prepended later.
    if ($separator === '') {
      return mb_str_split($text, max(1, $this->chunkSize - $this->chunkOverlap));
    }

    $parts = explode($separator, $text);
    $lastIndex = count($parts) - 1;
    $leafSegments = [];

    foreach ($parts as $index => $part) {
      // Re-attach the separator that explode() removed (none after the last).
      $piece = $index < $lastIndex ? $part . $separator : $part;
      if ($piece === '') {
        continue;
      }

      foreach ($this->recursiveSplit($piece, $separatorIndex + 1) as $leaf) {
        $leafSegments[] = $leaf;
      }
    }

    return $leafSegments;
  }

  /**
   * Merges leaf segments into final chunks with intelligent overlap.
   *
   * Segments are concatenated greedily while the trimmed result fits into the
   * chunk size. When a segment no longer fits, the buffer is emitted and the
   * next buffer starts with an overlap taken from the end of the emitted
   * chunk. The overlap is shortened, or skipped, when it would push the new
   * chunk over the size limit.
   *
   * @param string[] $segments
   *   Leaf segments as returned by recursiveSplit().
   *
   * @return string[]
   *   Final, trimmed, non-empty chunks.
   */
  private function mergeSegmentsWithOverlap(array $segments): array {
    $chunks = [];
    $currentBuffer = '';

    foreach ($segments as $segment) {
      // If adding this segment fits within chunk size, add it to the buffer.
      if (mb_strlen(trim($currentBuffer . $segment)) <= $this->chunkSize) {
        $currentBuffer .= $segment;
        continue;
      }

      // Flush the current buffer as a chunk.
      $flushed = trim($currentBuffer);
      if ($flushed !== '') {
        $chunks[] = $flushed;
      }

      $prefix = '';
      if ($flushed !== '' && $this->chunkOverlap > 0) {
        // Whitespace that trim() stripped from the end of the buffer. It is
        // re-inserted so overlap and segment are joined as in the source.
        $joiner = mb_substr($currentBuffer, mb_strlen(rtrim($currentBuffer)));

        // Never let the overlap push the new chunk beyond the chunk size.
        $budget = min(
          $this->chunkOverlap,
          $this->chunkSize - mb_strlen($joiner) - mb_strlen(rtrim($segment)),
        );

        if ($budget > 0) {
          $overlap = $this->calculateSafeOverlap($flushed, $budget);
          if ($overlap !== '') {
            $prefix = $overlap . $joiner;
          }
        }
      }

      // Start the new buffer with the overlap (if any) and current segment.
      $currentBuffer = $prefix . $segment;
    }

    // Add the remaining buffer as the final chunk.
    $remaining = trim($currentBuffer);
    if ($remaining !== '') {
      $chunks[] = $remaining;
    }

    return $chunks;
  }

  /**
   * Calculates an overlap that prefers to start at a text boundary.
   *
   * Takes the last $maxLength characters of $text and, where possible, moves
   * the start forward to just after a separator (coarsest first, then a plain
   * space) as long as at least MIN_OVERLAP_THRESHOLD of $maxLength remains.
   * When no such boundary exists the raw tail is returned.
   *
   * @param string $text
   *   The text from which to extract overlap.
   * @param int $maxLength
   *   Maximum number of characters the overlap may have.
   *
   * @return string
   *   The overlap text; empty when $maxLength is not positive.
   */
  private function calculateSafeOverlap(string $text, int $maxLength): string {
    if ($maxLength <= 0) {
      return '';
    }

    $overlapTail = mb_substr($text, -$maxLength);
    $tailLength = mb_strlen($overlapTail);
    $minimumLength = (int) round($maxLength * self::MIN_OVERLAP_THRESHOLD);

    // A boundary is usable only if at least $minimumLength characters follow
    // it, i.e. when it lies entirely inside this prefix of the tail.
    $searchArea = mb_substr($overlapTail, 0, max(0, $tailLength - $minimumLength));
    if ($searchArea === '') {
      return $overlapTail;
    }

    // Configured separators first, then a plain space as word boundary.
    $boundaries = $this->separators;
    $boundaries[] = ' ';

    foreach ($boundaries as $boundary) {
      if ($boundary === '') {
        continue;
      }

      // Right-most usable occurrence keeps the overlap aligned and short.
      $position = mb_strrpos($searchArea, $boundary);
      if ($position !== FALSE) {
        return mb_substr($overlapTail, $position + mb_strlen($boundary));
      }
    }

    return $overlapTail;
  }

}
