<?php

declare(strict_types=1);

namespace Drupal\tmgmt_laratranslate\Service;

use Drupal\tmgmt_laratranslate\Enum\KeepSeparator;
use Psr\Log\LoggerInterface;

/**
 * Recursive character text splitter service.
 *
 * Recursively splits text by trying different separators to find one that
 * works. This is a PHP port of LangChain's RecursiveCharacterTextSplitter.
 *
 * @see https://github.com/langchain-ai/langchain/blob/master/libs/text-splitters/langchain_text_splitters/character.py
 */
class RecursiveCharacterTextSplitter {

  /**
   * The separators to use for splitting.
   *
   * @var array<int, string>
   */
  private array $separators;

  /**
   * Whether to keep the separator in the output.
   *
   * @var \Drupal\tmgmt_laratranslate\Enum\KeepSeparator
   */
  private KeepSeparator $keepSeparator;

  /**
   * Whether separators are regex patterns.
   *
   * @var bool
   */
  private bool $isSeparatorRegex;

  /**
   * Maximum chunk size in characters.
   *
   * @var int
   */
  private int $chunkSize;

  /**
   * Number of characters to overlap between chunks.
   *
   * @var int
   */
  private int $chunkOverlap;

  /**
   * Function to calculate length of text.
   *
   * @var callable
   */
  private $lengthFunction;

  /**
   * Constructs a RecursiveCharacterTextSplitter service.
   *
   * The splitter starts with its default configuration: the separators
   * "\n\n", "\n", " " and "" tried in that order, KeepSeparator::Yes, literal
   * (non-regex) separators, a chunk size of 4000, a chunk overlap of 200 and
   * mb_strlen() in UTF-8 as the length function. Use configure() to override
   * any of these values.
   *
   * @param \Psr\Log\LoggerInterface $logger
   *   The logger channel (autowired).
   */
  public function __construct(
    private readonly LoggerInterface $logger,
  ) {
    // Default configuration.
    $this->separators = ["\n\n", "\n", " ", ""];
    $this->keepSeparator = KeepSeparator::Yes;
    $this->isSeparatorRegex = FALSE;
    $this->chunkSize = 4000;
    $this->chunkOverlap = 200;
    // Static closure: it never touches $this, and binding it would create a
    // reference cycle between the service and its own length function (and
    // leave clones pointing back at the instance they were cloned from).
    $this->lengthFunction = static fn (string $text): int => mb_strlen($text, 'UTF-8');
  }

  /**
   * Configure the text splitter.
   *
   * All supplied options are validated before any of them is applied, so a
   * rejected configuration leaves the splitter unchanged. Options that are
   * missing (or NULL) keep their current value.
   *
   * @param array<string, mixed> $config
   *   Configuration options:
   *   - separators: Array of separator strings
   *     (default: ["\n\n", "\n", " ", ""]). The array is re-indexed so that
   *     it is always a plain list.
   *   - keep_separator: KeepSeparator - Enum case (default: KeepSeparator::Yes).
   *     The value is converted with KeepSeparator::fromValue().
   *   - is_separator_regex: bool - Whether separators are regex patterns
   *     (default: FALSE)
   *   - chunk_size: int - Maximum chunk size in characters (default: 4000).
   *     Must be at least 1.
   *   - chunk_overlap: int - Overlap between chunks (default: 200). Must not
   *     be negative. An overlap larger than the chunk size is still accepted.
   *   - length_function: callable - Custom length function (default: mb_strlen)
   *
   * @return $this
   *
   * @throws \InvalidArgumentException
   *   If separators is not an array of strings, chunk_size is lower than 1,
   *   chunk_overlap is negative, or length_function is not callable.
   */
  public function configure(array $config): static {
    $separators = NULL;
    $keep_separator = NULL;
    $is_separator_regex = NULL;
    $chunk_size = NULL;
    $chunk_overlap = NULL;
    $length_function = NULL;

    if (isset($config['separators'])) {
      if (!is_array($config['separators'])) {
        throw new \InvalidArgumentException('The "separators" option must be an array of strings.');
      }
      foreach ($config['separators'] as $candidate) {
        if (!is_string($candidate)) {
          throw new \InvalidArgumentException('The "separators" option must be an array of strings.');
        }
      }
      // The recursive splitter uses list positions as slice offsets, so the
      // separators must be stored as a 0-based list.
      $separators = array_values($config['separators']);
    }

    if (isset($config['keep_separator'])) {
      $keep_separator = KeepSeparator::fromValue($config['keep_separator']);
    }

    if (isset($config['is_separator_regex'])) {
      $is_separator_regex = (bool) $config['is_separator_regex'];
    }

    if (isset($config['chunk_size'])) {
      $chunk_size = (int) $config['chunk_size'];
      if ($chunk_size < 1) {
        throw new \InvalidArgumentException(
          sprintf('The "chunk_size" option must be at least 1, got %d.', $chunk_size)
        );
      }
    }

    if (isset($config['chunk_overlap'])) {
      $chunk_overlap = (int) $config['chunk_overlap'];
      // A negative overlap can never be reached while trimming a chunk, which
      // would make the merge step run past the end of its buffer.
      if ($chunk_overlap < 0) {
        throw new \InvalidArgumentException(
          sprintf('The "chunk_overlap" option must not be negative, got %d.', $chunk_overlap)
        );
      }
    }

    if (isset($config['length_function'])) {
      if (!is_callable($config['length_function'])) {
        throw new \InvalidArgumentException('The "length_function" option must be callable.');
      }
      $length_function = $config['length_function'];
    }

    // Everything validated: apply the new values.
    if ($separators !== NULL) {
      $this->separators = $separators;
    }
    if ($keep_separator !== NULL) {
      $this->keepSeparator = $keep_separator;
    }
    if ($is_separator_regex !== NULL) {
      $this->isSeparatorRegex = $is_separator_regex;
    }
    if ($chunk_size !== NULL) {
      $this->chunkSize = $chunk_size;
    }
    if ($chunk_overlap !== NULL) {
      $this->chunkOverlap = $chunk_overlap;
    }
    if ($length_function !== NULL) {
      $this->lengthFunction = $length_function;
    }

    return $this;
  }

  /**
   * Splits a text into chunks using the configured separators.
   *
   * This is the public entry point; the work is delegated to the recursive
   * splitter, starting with the full list of configured separators.
   *
   * @param string $text
   *   The text to split.
   *
   * @return array<int, string>
   *   The resulting text chunks.
   */
  public function splitText(string $text): array {
    $configured_separators = $this->separators;

    return $this->splitTextRecursive($text, $configured_separators);
  }

  /**
   * Builds a splitter preconfigured for a given language.
   *
   * The current instance is left untouched: a clone receives the language's
   * separator list and has regex separators enabled. Any 'separators' or
   * 'is_separator_regex' entries passed in $config are overridden by these
   * language settings; all other options are applied as given.
   *
   * @param string $language
   *   The language identifier (e.g., 'php', 'python', 'javascript').
   * @param array<string, mixed> $config
   *   Additional configuration options.
   *
   * @return static
   *   A new instance configured for the language.
   */
  public function forLanguage(string $language, array $config = []): static {
    // Resolved first so an unsupported language fails before cloning.
    $language_settings = [
      'separators' => $this->getSeparatorsForLanguage($language),
      'is_separator_regex' => TRUE,
    ];

    // With the union operator the left operand wins on duplicate keys, so the
    // language settings take precedence over the caller's options.
    return (clone $this)->configure($language_settings + $config);
  }

  /**
   * Reassembles chunks into a single text.
   *
   * The chunks are joined with a single space and the result is trimmed.
   * Text shared by neighbouring chunks is not de-duplicated here.
   *
   * @param array<int, string> $chunks
   *   Array of text chunks.
   *
   * @return string
   *   The reassembled text, or an empty string when nothing remains after
   *   trimming.
   */
  public function reassembleChunks(array $chunks): string {
    $joined = $this->joinDocs($chunks, ' ');
    if ($joined === NULL) {
      return '';
    }

    return $joined;
  }

  /**
   * Measures a text with the configured length function.
   *
   * @param string $text
   *   The text to measure.
   *
   * @return int
   *   The length reported by the length function.
   */
  public function getTextLength(string $text): int {
    $measure = $this->lengthFunction;

    return $measure($text);
  }

  /**
   * Recursively split text by trying different separators.
   *
   * The first separator that occurs in the text is used to split it; the
   * separators after it are kept for pieces that are still too long. Pieces
   * shorter than the chunk size are merged into chunks, longer ones are split
   * again with the remaining separators, or emitted as-is when none remain.
   *
   * @param string $text
   *   The text to split.
   * @param array<int, string> $separators
   *   Array of separators to try, in order of preference.
   *
   * @return array<int, string>
   *   Array of text chunks.
   */
  private function splitTextRecursive(string $text, array $separators): array {
    // Re-index so that a position in the list is a valid slice offset even
    // when the configured array is sparse or string-keyed.
    $separators = array_values($separators);

    $finalChunks = [];

    // Fall back to the last separator when none of them occurs in the text.
    $separator = $separators !== [] ? end($separators) : '';
    $new_separators = [];

    foreach ($separators as $i => $s) {
      // The empty separator always "matches": it splits into characters and
      // leaves nothing further to try.
      if ($s === '') {
        $separator = $s;
        break;
      }

      $separator_match = preg_match('/' . $this->buildSeparatorPattern($s) . '/u', $text);
      if ($separator_match !== FALSE && $separator_match > 0) {
        $separator = $s;
        $new_separators = array_slice($separators, $i + 1);
        break;
      }
    }

    $separator_pattern = $this->buildSeparatorPattern($separator);
    $splits = $this->splitTextWithRegex($text, $separator_pattern, $this->keepSeparator);

    // Now merge things, recursively splitting longer texts.
    $good_splits = [];
    // When separators are kept they are already part of the pieces, so the
    // pieces are glued back together without anything in between. Note that
    // in regex mode the joiner is the pattern source itself.
    $separator_to_use = $this->keepSeparator === KeepSeparator::No ? $separator : "";

    foreach ($splits as $s) {
      if (($this->lengthFunction)($s) < $this->chunkSize) {
        $good_splits[] = $s;
      }
      else {
        // Flush the short pieces collected so far to keep the output in order.
        if ($good_splits !== []) {
          $mergedText = $this->mergeSplits($good_splits, $separator_to_use);
          $finalChunks = array_merge($finalChunks, $mergedText);
          $good_splits = [];
        }

        if ($new_separators === []) {
          // Nothing finer to split with: emit the oversized piece unchanged.
          $finalChunks[] = $s;
        }
        else {
          $otherInfo = $this->splitTextRecursive($s, $new_separators);
          $finalChunks = array_merge($finalChunks, $otherInfo);
        }
      }
    }

    if ($good_splits !== []) {
      $mergedText = $this->mergeSplits($good_splits, $separator_to_use);
      $finalChunks = array_merge($finalChunks, $mergedText);
    }

    return $finalChunks;
  }

  /**
   * Turns a separator into a pattern body that is safe between "/" delimiters.
   *
   * Literal separators are quoted with preg_quote(). Regex separators are
   * kept as written, except that every unescaped "/" is escaped so it cannot
   * terminate the pattern early.
   *
   * @param string $separator
   *   The configured separator.
   *
   * @return string
   *   The pattern body, without delimiters or modifiers.
   */
  private function buildSeparatorPattern(string $separator): string {
    if (!$this->isSeparatorRegex) {
      return preg_quote($separator, '/');
    }

    // Byte-wise scan is safe for UTF-8: "/" and "\" never occur inside a
    // multi-byte sequence.
    $pattern = '';
    $length = strlen($separator);
    for ($i = 0; $i < $length; $i++) {
      $char = $separator[$i];
      if ($char === '\\' && $i + 1 < $length) {
        // Copy an existing escape sequence verbatim so "\/" is not escaped
        // a second time.
        $pattern .= $char . $separator[++$i];
        continue;
      }
      $pattern .= $char === '/' ? '\\/' : $char;
    }

    return $pattern;
  }

  /**
   * Split text using regex pattern.
   *
   * An empty separator splits the text into single characters. Otherwise the
   * text is split on the pattern and, depending on $keep_separator, each
   * separator is dropped (No), glued to the end of the preceding piece (End),
   * glued to the start of the following piece (Start), or returned as a piece
   * of its own (any other case, i.e. KeepSeparator::Yes). Empty pieces are
   * never returned.
   *
   * If the pattern cannot be applied (for example an invalid pattern or
   * malformed UTF-8), a warning is logged and the text is returned as a single
   * piece rather than being dropped.
   *
   * NOTE(review): the pairing of text and separators assumes the separator
   * pattern contains no capturing groups of its own — confirm for custom
   * regex separators.
   *
   * @param string $text
   *   The text to split.
   * @param string $separator
   *   The separator pattern (already escaped if needed).
   * @param \Drupal\tmgmt_laratranslate\Enum\KeepSeparator $keep_separator
   *   How to handle the separator.
   *
   * @return array<int, string>
   *   Array of text splits.
   */
  private function splitTextWithRegex(string $text, string $separator, KeepSeparator $keep_separator): array {
    if ($separator === '') {
      // Split into individual characters.
      $characters = preg_split('//u', $text, -1, PREG_SPLIT_NO_EMPTY);

      return $characters !== FALSE ? $characters : $this->keepUnsplit($text, $separator);
    }

    if ($keep_separator === KeepSeparator::No) {
      // Don't keep separator.
      $pieces = preg_split('/' . $separator . '/u', $text, -1, PREG_SPLIT_NO_EMPTY);

      return $pieces !== FALSE ? $pieces : $this->keepUnsplit($text, $separator);
    }

    // Keep separators by using a capturing group. Empty pieces are NOT dropped
    // at this stage: the result must strictly alternate text, separator,
    // text, ... so that even indexes are text and odd indexes are separators.
    // Dropping empties first would shift that parity whenever the text starts
    // with a separator or contains two separators in a row.
    $parts = preg_split('/(' . $separator . ')/u', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($parts === FALSE) {
      return $this->keepUnsplit($text, $separator);
    }

    $count = count($parts);

    if ($keep_separator === KeepSeparator::End) {
      // Attach separator to end of preceding text.
      $splits = [];
      for ($i = 0; $i + 1 < $count; $i += 2) {
        $splits[] = $parts[$i] . $parts[$i + 1];
      }
      // The trailing text has no separator after it.
      if ($count % 2 === 1) {
        $splits[] = $parts[$count - 1];
      }
    }
    elseif ($keep_separator === KeepSeparator::Start) {
      // The leading text has no separator before it.
      $splits = [$parts[0]];
      // Attach separator to start of following text.
      for ($i = 1; $i < $count; $i += 2) {
        $splits[] = $parts[$i] . ($parts[$i + 1] ?? '');
      }
    }
    else {
      // Keep separator as-is (KeepSeparator::Yes).
      $splits = $parts;
    }

    // Drop empty pieces only now; compare against '' explicitly so that a
    // piece such as "0" is preserved.
    return array_values(array_filter(
      $splits,
      static fn (string $piece): bool => $piece !== '',
    ));
  }

  /**
   * Logs a failed regex split and returns the text as one piece.
   *
   * @param string $text
   *   The text that could not be split.
   * @param string $separator
   *   The separator pattern that failed.
   *
   * @return array<int, string>
   *   The text as a single piece, or an empty array for empty text.
   */
  private function keepUnsplit(string $text, string $separator): array {
    $this->logger->warning(
      'Could not split text with separator pattern "@separator": @error',
      [
        '@separator' => $separator,
        '@error' => preg_last_error_msg(),
      ]
    );

    return $text === '' ? [] : [$text];
  }

  /**
   * Merge splits into chunks respecting size limits.
   *
   * Splits are appended to a running buffer. When the next split would push
   * the buffer past the chunk size, the buffer is emitted as a chunk and then
   * trimmed from the front until it is no longer than the chunk overlap and
   * the next split fits; what remains becomes the start of the next chunk.
   *
   * @param array<int, string> $splits
   *   Array of text splits to merge.
   * @param string $separator
   *   Separator to use when joining splits.
   *
   * @return array<int, string>
   *   Array of merged chunks. Chunks that are empty after trimming are
   *   omitted.
   */
  private function mergeSplits(array $splits, string $separator): array {
    $separator_len = ($this->lengthFunction)($separator);

    $docs = [];
    $current_doc = [];
    // Running length of $current_doc once joined with $separator.
    $total = 0;

    foreach ($splits as $d) {
      $len = ($this->lengthFunction)($d);

      // Check if adding this split would exceed chunk size.
      $potential_total = $total + $len + ($current_doc === [] ? 0 : $separator_len);

      if ($potential_total > $this->chunkSize) {
        if ($total > $this->chunkSize) {
          $this->logger->warning(
            'Created a chunk of size @size, which is longer than the specified @chunk_size',
            [
              '@size' => $total,
              '@chunk_size' => $this->chunkSize,
            ]
          );
        }

        if ($current_doc !== []) {
          $doc = $this->joinDocs($current_doc, $separator);
          if ($doc !== NULL) {
            $docs[] = $doc;
          }

          // Keep overlap from previous chunk: drop leading splits while the
          // buffer is longer than the overlap, or while the next split still
          // would not fit. The emptiness guard stops the loop once nothing is
          // left to drop, even if the overlap is negative or the length
          // function is inconsistent, instead of reading a missing element.
          while ($current_doc !== [] && (
              $total > $this->chunkOverlap || (
                $total + $len + $separator_len > $this->chunkSize
                && $total > 0
              )
            )) {
            $total -= ($this->lengthFunction)($current_doc[0]) + (count($current_doc) > 1 ? $separator_len : 0);
            array_shift($current_doc);
          }
        }
      }

      $current_doc[] = $d;
      // A separator is only counted between splits, not before the first.
      $total += $len + (count($current_doc) > 1 ? $separator_len : 0);
    }

    // Add the last chunk.
    if ($current_doc !== []) {
      $doc = $this->joinDocs($current_doc, $separator);
      if ($doc !== NULL) {
        $docs[] = $doc;
      }
    }

    return $docs;
  }

  /**
   * Joins document parts into one trimmed string.
   *
   * @param array<int, string> $docs
   *   Array of document parts.
   * @param string $separator
   *   Separator placed between the parts.
   *
   * @return string|null
   *   The joined text with surrounding whitespace removed, or NULL when
   *   nothing is left after trimming.
   */
  private function joinDocs(array $docs, string $separator): ?string {
    $joined = trim(implode($separator, $docs));

    if ($joined === '') {
      return NULL;
    }

    return $joined;
  }

  /**
   * Returns the separator patterns used to split a given language.
   *
   * The language identifier is matched case-insensitively. HTML has its own
   * complete list; every other supported language combines its specific
   * separators with the generic paragraph, line, word and character
   * separators.
   *
   * @param string $language
   *   The language identifier.
   *
   * @return array<int, string>
   *   Array of separator patterns, in order of preference.
   *
   * @throws \InvalidArgumentException
   *   If language is not supported.
   */
  private function getSeparatorsForLanguage(string $language): array {
    $identifier = strtolower($language);

    // HTML is the only language that does not fall back to the generic
    // paragraph/line/word separators, so it is handled on its own.
    if ($identifier === 'html') {
      return [
        // Split along HTML tags.
        "<body",
        "<div",
        "<p",
        "<br",
        "<li",
        "<h1",
        "<h2",
        "<h3",
        "<h4",
        "<h5",
        "<h6",
        "<span",
        "<table",
        "<tr",
        "<td",
        "<th",
        "<ul",
        "<ol",
        "<header",
        "<footer",
        "<nav",
        "<head",
        "<style",
        "<script",
        "<meta",
        "<title",
        "",
      ];
    }

    $language_specific = match ($identifier) {
      'php' => [
        // Function and class definitions.
        "\nfunction ",
        "\nclass ",
        // Control flow statements.
        "\nif ",
        "\nforeach ",
        "\nwhile ",
        "\ndo ",
        "\nswitch ",
        "\ncase ",
      ],
      'python' => [
        // Class and function definitions.
        "\nclass ",
        "\ndef ",
        "\n\tdef ",
      ],
      'javascript', 'js' => [
        // Function, variable and class definitions.
        "\nfunction ",
        "\nconst ",
        "\nlet ",
        "\nvar ",
        "\nclass ",
        // Control flow statements.
        "\nif ",
        "\nfor ",
        "\nwhile ",
        "\nswitch ",
        "\ncase ",
        "\ndefault ",
      ],
      'typescript', 'ts' => [
        // Type-level declarations.
        "\nenum ",
        "\ninterface ",
        "\nnamespace ",
        "\ntype ",
        // Class, function and variable definitions.
        "\nclass ",
        "\nfunction ",
        "\nconst ",
        "\nlet ",
        "\nvar ",
        // Control flow statements.
        "\nif ",
        "\nfor ",
        "\nwhile ",
        "\nswitch ",
        "\ncase ",
        "\ndefault ",
      ],
      'java' => [
        // Class definitions.
        "\nclass ",
        // Method definitions.
        "\npublic ",
        "\nprotected ",
        "\nprivate ",
        "\nstatic ",
        // Control flow statements.
        "\nif ",
        "\nfor ",
        "\nwhile ",
        "\nswitch ",
        "\ncase ",
      ],
      'go' => [
        // Function, variable and type definitions.
        "\nfunc ",
        "\nvar ",
        "\nconst ",
        "\ntype ",
        // Control flow statements.
        "\nif ",
        "\nfor ",
        "\nswitch ",
        "\ncase ",
      ],
      'rust' => [
        // Function and variable definitions.
        "\nfn ",
        "\nconst ",
        "\nlet ",
        // Control flow statements.
        "\nif ",
        "\nwhile ",
        "\nfor ",
        "\nloop ",
        "\nmatch ",
      ],
      'markdown', 'md' => [
        // Markdown headings.
        "\n#{1,6} ",
        // End of code block.
        "```\n",
        // Horizontal lines.
        "\n\\*\\*\\*+\n",
        "\n---+\n",
        "\n___+\n",
      ],
      default => throw new \InvalidArgumentException(
        sprintf('Language "%s" is not supported', $language)
      ),
    };

    // Generic fallbacks shared by all of the languages above: paragraphs,
    // lines, words and finally single characters.
    return [...$language_specific, "\n\n", "\n", " ", ""];
  }

}
