<?php

declare(strict_types=1);

namespace Drupal\ai_dropsolid\Tokenizer;

use Drupal\ai\Utility\TokenizerInterface;
use Drupal\Core\Config\ConfigFactoryInterface;
use Drupal\Core\Entity\EntityTypeManagerInterface;
use Drupal\Core\File\FileSystemInterface;
use Drupal\file\FileInterface;
use Drupal\key\KeyRepositoryInterface;
use GuzzleHttp\ClientInterface;
use GuzzleHttp\Exception\GuzzleException;
use Psr\Log\LoggerInterface;

/**
 * Lightweight tokenizer tailored for Dropsolid's XLM-Roberta models.
 */
final class DropsolidXlmRobertaTokenizer implements TokenizerInterface {

  /**
   * Tokenizer mode: delegate tokenization and counting to LiteLLM over HTTP.
   *
   * Also used as the default when the configured mode is missing or unknown.
   */
  private const MODE_LITE_LLM = 'lite_llm';

  /**
   * Tokenizer mode: run a local SentencePiece command-line encoder.
   */
  private const MODE_CLI_SENTENCEPIECE = 'cli_sentencepiece';

  /**
   * Tokenizer mode: use no external tokenizer.
   *
   * In this mode text is split and counted per character.
   */
  private const MODE_NONE = 'none';

  /**
   * Default model identifier used when an unknown model is supplied.
   */
  private const DEFAULT_MODEL = 'xlm-roberta-base';

  /**
   * Mapping of chat model select options to human-readable labels.
   *
   * Keys follow the "<provider>__<model>" pattern; the part after the "__"
   * separator is the identifier that supportsModel() compares against.
   */
  private const SUPPORTED_OPTIONS = [
    'dropsolid_xlmr__xlm-roberta-base' => 'Dropsolid.ai - XLM-Roberta Base',
  ];

  /**
   * Constructs the tokenizer.
   *
   * @param \Drupal\Core\Config\ConfigFactoryInterface $configFactory
   *   Config factory; used to read 'ai_dropsolid.settings' and
   *   'ai_provider_litellm.settings'.
   * @param \Drupal\Core\File\FileSystemInterface $fileSystem
   *   File system service; used to resolve the model file URI to a real path.
   * @param \Drupal\Core\Entity\EntityTypeManagerInterface $entityTypeManager
   *   Entity type manager; used to load the managed model file entity.
   * @param \Drupal\key\KeyRepositoryInterface $keyRepository
   *   Key repository; used to look up the LiteLLM API key.
   * @param \GuzzleHttp\ClientInterface $httpClient
   *   HTTP client; used for requests to the LiteLLM token counter endpoint.
   * @param \Psr\Log\LoggerInterface $logger
   *   Logger that receives tokenizer warnings, notices and errors.
   */
  public function __construct(
    private readonly ConfigFactoryInterface $configFactory,
    private readonly FileSystemInterface $fileSystem,
    private readonly EntityTypeManagerInterface $entityTypeManager,
    private readonly KeyRepositoryInterface $keyRepository,
    private readonly ClientInterface $httpClient,
    private readonly LoggerInterface $logger,
  ) {}

  /**
   * Active model identifier.
   *
   * Starts as DEFAULT_MODEL and is only changed through setModel().
   */
  private string $model = self::DEFAULT_MODEL;

  /**
   * {@inheritdoc}
   */
  public function setModel(string $model): void {
    // Unsupported identifiers are silently replaced by the default model
    // instead of raising an error.
    if (!$this->supportsModel($model)) {
      $model = self::DEFAULT_MODEL;
    }

    $this->model = $model;
  }

  /**
   * {@inheritdoc}
   */
  public function getSupportedModels(): array {
    // Option key => human-readable label, exactly as declared on the class.
    $options = self::SUPPORTED_OPTIONS;

    return $options;
  }

  /**
   * {@inheritdoc}
   */
  public function getTokens(string $chunk): array {
    // The chunk is passed through untouched (no trimming); the backend is
    // selected inside tokenize().
    $tokens = $this->tokenize($chunk);

    return $tokens;
  }

  /**
   * {@inheritdoc}
   */
  public function countTokens(string $chunk): int {
    // Surrounding whitespace is ignored for counting; blank input costs
    // nothing and must not trigger a CLI or HTTP call.
    $chunk = trim($chunk);
    if ($chunk === '') {
      return 0;
    }

    $mode = $this->getConfiguredMode();

    if ($mode === self::MODE_CLI_SENTENCEPIECE) {
      $tokens = $this->tokenizeWithSentencePiece($chunk);
      if ($tokens !== NULL) {
        return count($tokens);
      }
      // Mirror tokenize(): when the CLI is unavailable, try LiteLLM before
      // resorting to a character count, so that counts and token lists come
      // from the same backend in the same configuration.
      $mode = self::MODE_LITE_LLM;
    }

    if ($mode === self::MODE_LITE_LLM) {
      $count = $this->countTokensWithLiteLlm($chunk);
      if ($count !== NULL) {
        return $count;
      }
    }

    // Last resort: one token per UTF-8 character, matching the unit used by
    // the character-level fallback tokenizer.
    return mb_strlen($chunk, 'UTF-8');
  }

  /**
   * Tokenizes text and groups the tokens into fixed-size chunks.
   *
   * Every chunk holds at most $maxSize tokens; only the last chunk may be
   * shorter. Empty token lists produce an empty result.
   *
   * @param string $text
   *   The text to encode.
   * @param int $maxSize
   *   Maximum number of tokens per chunk. Must be greater than zero.
   *
   * @return string[][]
   *   The encoded chunks, in source order.
   *
   * @throws \InvalidArgumentException
   *   When $maxSize is zero or negative.
   */
  public function getEncodedChunks(string $text, int $maxSize): array {
    if ($maxSize <= 0) {
      throw new \InvalidArgumentException('Maximum chunk size must be greater than zero.');
    }

    $tokens = $this->tokenize($text);

    // array_chunk() already yields a re-indexed list of lists.
    return $tokens === [] ? [] : array_chunk($tokens, $maxSize);
  }

  /**
   * Joins the tokens of a chunk back into a single string.
   *
   * Tokens are concatenated verbatim with no separator and no translation of
   * tokenizer-specific markers.
   *
   * @param string[] $encodedChunk
   *   The encoded chunk to decode.
   *
   * @return string
   *   The concatenated token text.
   */
  public function decodeChunk(array $encodedChunk): string {
    // NOTE(review): tokens from a subword tokenizer may carry marker
    // characters; presumably callers accept those in the output - confirm.
    return join('', $encodedChunk);
  }

  /**
   * Determines whether the supplied model is supported.
   *
   * The comparison is strict and is made against the identifiers without
   * their provider prefix.
   *
   * @param string $model
   *   The model identifier to check.
   *
   * @return bool
   *   TRUE when the identifier is one of the supported models.
   */
  public function supportsModel(string $model): bool {
    $knownIdentifiers = $this->getSupportedModelIdentifiers();

    return in_array($model, $knownIdentifiers, TRUE);
  }

  /**
   * Returns the active model identifier.
   *
   * @return string
   *   The identifier stored by the last setModel() call, or the default model
   *   when setModel() has not been called.
   */
  public function getActiveModel(): string {
    $activeModel = $this->model;

    return $activeModel;
  }

  /**
   * Retrieves the supported model identifiers.
   *
   * Each supported option key is expected to look like "<provider>__<model>".
   * Only the first "__" is treated as the separator, so a model identifier
   * that itself contains "__" is returned intact. Keys without a separator
   * are returned unchanged.
   *
   * @return string[]
   *   The model identifiers without provider prefixes.
   */
  private function getSupportedModelIdentifiers(): array {
    return array_map(
      // Limit the split to two parts so everything after the provider prefix
      // is kept as the model identifier.
      static fn (string $option): string => explode('__', $option, 2)[1] ?? $option,
      array_keys(self::SUPPORTED_OPTIONS),
    );
  }

  /**
   * Splits text into tokens using the configured backend.
   *
   * Backends are tried in this order, each falling through to the next when
   * it yields no result:
   * - SentencePiece CLI (only when that mode is configured),
   * - LiteLLM over HTTP (when configured, or after a CLI failure),
   * - character-level splitting.
   *
   * @param string $text
   *   The text to tokenize.
   *
   * @return string[]
   *   The tokens produced by the first backend that succeeded; individual
   *   characters when the character-level fallback was used.
   */
  private function tokenize(string $text): array {
    if ($text === '') {
      return [];
    }

    $configuredMode = $this->getConfiguredMode();
    $tryLiteLlm = $configuredMode === self::MODE_LITE_LLM;

    if ($configuredMode === self::MODE_CLI_SENTENCEPIECE) {
      $pieces = $this->tokenizeWithSentencePiece($text);
      if (is_array($pieces)) {
        return $pieces;
      }
      $this->logger->notice('SentencePiece tokenization unavailable; falling back to configured HTTP tokenizer.');
      // A misconfigured CLI degrades to LiteLLM rather than straight to
      // characters.
      $tryLiteLlm = TRUE;
    }

    if ($tryLiteLlm) {
      $remoteTokens = $this->tokenizeWithLiteLlm($text);
      if (is_array($remoteTokens)) {
        return $remoteTokens;
      }
      $this->logger->notice('LiteLLM tokenizer unavailable; falling back to character tokenization.');
    }

    return $this->fallbackTokenize($text);
  }

  /**
   * Provides fallback character-level tokenization.
   *
   * @param string $text
   *   The text to split.
   *
   * @return string[]
   *   One entry per UTF-8 character, in order.
   *
   * @throws \RuntimeException
   *   When the text cannot be split, e.g. because the regex engine rejects
   *   the input.
   */
  private function fallbackTokenize(string $text): array {
    // An empty pattern with the "u" modifier splits between characters
    // instead of between bytes.
    $characters = preg_split('//u', $text, -1, PREG_SPLIT_NO_EMPTY);
    if (is_array($characters)) {
      return $characters;
    }

    throw new \RuntimeException('Failed to tokenize text for XLM-Roberta.');
  }

  /**
   * Tokenizes the text using the configured SentencePiece executable.
   *
   * The executable name, optional directory and model are read from the
   * 'tokenizer.cli' key of 'ai_dropsolid.settings'. The text is supplied on
   * the process' standard input and the whitespace-separated pieces are read
   * from its standard output.
   *
   * Standard input and standard error are backed by temporary files rather
   * than pipes. With pipes on all three streams, writing a large input before
   * reading any output (or draining stdout before stderr) can block both
   * processes once a pipe buffer fills up.
   *
   * @param string $text
   *   The text to tokenize.
   *
   * @return string[]|null
   *   The pieces emitted by the executable, or NULL when the tokenizer is not
   *   configured, cannot be started, exits with a non-zero status or produces
   *   no output.
   */
  private function tokenizeWithSentencePiece(string $text): ?array {
    $cliConfig = $this->configFactory->get('ai_dropsolid.settings')->get('tokenizer.cli') ?? [];
    $executableName = trim((string) ($cliConfig['executable_name'] ?? 'spm_encode'));
    if ($executableName === '') {
      $this->logger->warning('SentencePiece executable name is missing in Dropsolid tokenizer configuration.');
      return NULL;
    }

    // Without a configured directory the bare name is used as the command.
    $executablePath = trim((string) ($cliConfig['executable_path'] ?? ''));
    $fullExecutablePath = $executablePath !== '' ? $executablePath . DIRECTORY_SEPARATOR . $executableName : $executableName;

    $modelPath = $this->resolveSentencePieceModelPath($cliConfig);
    if ($modelPath === NULL) {
      return NULL;
    }

    $command = sprintf(
      '%s --model=%s --output_format=piece',
      escapeshellarg($fullExecutablePath),
      escapeshellarg($modelPath),
    );

    $stdinFile = tmpfile();
    $stderrFile = tmpfile();
    if ($stdinFile === FALSE || $stderrFile === FALSE) {
      foreach ([$stdinFile, $stderrFile] as $handle) {
        if (is_resource($handle)) {
          fclose($handle);
        }
      }
      $this->logger->error('Unable to create temporary files for SentencePiece tokenization.');
      return NULL;
    }

    try {
      // Stage the whole input up front and rewind so the child process reads
      // it from the beginning.
      if (fwrite($stdinFile, $text) !== strlen($text) || !rewind($stdinFile)) {
        $this->logger->error('Unable to stage input for SentencePiece tokenization.');
        return NULL;
      }

      $descriptorSpec = [
        0 => $stdinFile,
        1 => ['pipe', 'w'],
        2 => $stderrFile,
      ];

      $process = proc_open($command, $descriptorSpec, $pipes);
      if (!is_resource($process)) {
        $this->logger->error('Failed to start SentencePiece process for Dropsolid tokenizer.');
        return NULL;
      }

      // stdout is the only pipe, so reading it to EOF cannot deadlock.
      $stdout = stream_get_contents($pipes[1]) ?: '';
      fclose($pipes[1]);

      $status = proc_close($process);

      // The child shares the file offset; rewind before reading its errors.
      rewind($stderrFile);
      $stderr = stream_get_contents($stderrFile) ?: '';
    }
    finally {
      fclose($stdinFile);
      fclose($stderrFile);
    }

    if ($status !== 0) {
      $this->logger->error('SentencePiece tokenization failed: @error', ['@error' => trim($stderr) ?: 'Unknown error']);
      return NULL;
    }

    $stdout = trim($stdout);
    if ($stdout === '') {
      $this->logger->warning('SentencePiece tokenization returned empty output. Falling back to character tokenization.');
      return NULL;
    }

    // Pieces are separated by spaces within a line and by newlines between
    // input lines; both collapse into one flat list.
    $tokens = preg_split('/\s+/', $stdout, -1, PREG_SPLIT_NO_EMPTY);
    if ($tokens === FALSE) {
      $this->logger->error('Unable to split SentencePiece output into tokens.');
      return NULL;
    }

    return array_values($tokens);
  }

  /**
   * Resolves the file system path to the configured SentencePiece model.
   *
   * An explicit 'model_path' wins when it points at an existing regular file.
   * Otherwise the managed file referenced by 'model_file' is resolved to its
   * real path. A warning is logged when neither yields a usable file.
   *
   * @param array $cliConfig
   *   The 'tokenizer.cli' configuration values.
   *
   * @return string|null
   *   Path to the model file, or NULL when no usable file could be found.
   */
  private function resolveSentencePieceModelPath(array $cliConfig): ?string {
    $modelPath = trim((string) ($cliConfig['model_path'] ?? ''));
    // is_file() rather than file_exists(): a directory is not a usable model
    // and must not prevent the managed-file lookup below.
    if ($modelPath !== '' && is_file($modelPath)) {
      return $modelPath;
    }

    $modelFileId = $cliConfig['model_file'] ?? NULL;
    // NOTE(review): presumably this value can be stored as a list of file IDs
    // when it comes from a file upload form element - confirm against the
    // settings form. Casting an array straight to int would always give 1.
    if (is_array($modelFileId)) {
      $modelFileId = reset($modelFileId);
    }

    if ($modelFileId) {
      $file = $this->loadFileEntity((int) $modelFileId);
      if ($file instanceof FileInterface) {
        $realPath = $this->fileSystem->realpath($file->getFileUri());
        if ($realPath && is_file($realPath)) {
          return $realPath;
        }
        $this->logger->warning('Configured SentencePiece model file does not exist on disk: @uri', ['@uri' => $file->getFileUri()]);
      }
    }

    $this->logger->warning('SentencePiece model path is not configured or file is missing.');
    return NULL;
  }

  /**
   * Loads a file entity by ID.
   *
   * @param int $fid
   *   The file entity ID.
   *
   * @return \Drupal\file\FileInterface|null
   *   The file entity, or NULL when nothing implementing FileInterface was
   *   loaded for that ID.
   */
  private function loadFileEntity(int $fid): ?FileInterface {
    $entity = $this->entityTypeManager->getStorage('file')->load($fid);
    if ($entity instanceof FileInterface) {
      return $entity;
    }

    return NULL;
  }

  /**
   * Returns the configured tokenizer mode.
   *
   * @return string
   *   One of the MODE_* constants. A missing or unrecognised value in
   *   'ai_dropsolid.settings' resolves to MODE_LITE_LLM.
   */
  private function getConfiguredMode(): string {
    $knownModes = [self::MODE_LITE_LLM, self::MODE_CLI_SENTENCEPIECE, self::MODE_NONE];
    $configured = (string) ($this->configFactory->get('ai_dropsolid.settings')->get('tokenizer.mode') ?? self::MODE_LITE_LLM);

    return in_array($configured, $knownModes, TRUE) ? $configured : self::MODE_LITE_LLM;
  }

  /**
   * Tokenizes the text by delegating to LiteLLM's tokenizer endpoint.
   *
   * @param string $text
   *   The text to tokenize.
   *
   * @return array|null
   *   The re-indexed 'original_tokens' list from the response, or NULL when
   *   LiteLLM is not configured, the request fails, or the response carries
   *   no non-empty token list.
   */
  private function tokenizeWithLiteLlm(string $text): ?array {
    $context = $this->resolveLiteLlmContext();
    if ($context === NULL) {
      return NULL;
    }

    [$endpoint, $apiKey] = $context;

    $requestOptions = [
      'headers' => [
        'Authorization' => 'Bearer ' . $apiKey,
      ],
      'json' => [
        'model' => 'eu-e5large-embeddings-selfhosted',
        'prompt' => $text,
      ],
      'timeout' => 10,
    ];

    try {
      $response = $this->httpClient->request('POST', $endpoint, $requestOptions);
      $payload = json_decode((string) $response->getBody(), TRUE);

      if (is_array($payload)) {
        $tokens = $payload['original_tokens'] ?? NULL;
        if (is_array($tokens) && $tokens !== []) {
          return array_values($tokens);
        }
      }
      // A response that only reports a count cannot be turned into tokens;
      // it is treated like any other unusable response.
    }
    catch (GuzzleException $e) {
      $this->logger->error('LiteLLM tokenizer HTTP request failed: @message', ['@message' => $e->getMessage()]);
    }
    catch (\Exception $e) {
      $this->logger->error('LiteLLM tokenizer request failed: @message', ['@message' => $e->getMessage()]);
    }

    return NULL;
  }

  /**
   * Retrieves token count from LiteLLM.
   *
   * @param string $text
   *   The text whose tokens should be counted.
   *
   * @return int|null
   *   The 'total_tokens' value of the response cast to an integer, or NULL
   *   when LiteLLM is not configured, the request fails, or the response does
   *   not contain that value.
   */
  private function countTokensWithLiteLlm(string $text): ?int {
    $context = $this->resolveLiteLlmContext();
    if ($context === NULL) {
      return NULL;
    }

    [$endpoint, $apiKey] = $context;

    $requestOptions = [
      'headers' => [
        'Authorization' => 'Bearer ' . $apiKey,
      ],
      'json' => [
        'model' => 'eu-e5large-embeddings-selfhosted',
        'prompt' => $text,
      ],
      'timeout' => 10,
    ];

    try {
      $response = $this->httpClient->request('POST', $endpoint, $requestOptions);
      $payload = json_decode((string) $response->getBody(), TRUE);

      if (!is_array($payload) || !isset($payload['total_tokens'])) {
        return NULL;
      }

      return (int) $payload['total_tokens'];
    }
    catch (GuzzleException $e) {
      $this->logger->error('LiteLLM token count request failed: @message', ['@message' => $e->getMessage()]);
    }
    catch (\Exception $e) {
      $this->logger->error('LiteLLM token count unexpected error: @message', ['@message' => $e->getMessage()]);
    }

    return NULL;
  }

  /**
   * Resolves the LiteLLM endpoint and API key.
   *
   * Host and key reference are read from 'ai_provider_litellm.settings'.
   *
   * @return string[]|null
   *   A two-item list of token counter endpoint URL and API key, or NULL when
   *   the host or key reference is missing or the resolved key is empty.
   */
  private function resolveLiteLlmContext(): ?array {
    $settings = $this->configFactory->get('ai_provider_litellm.settings');
    $host = trim((string) ($settings->get('host') ?? ''));
    $apiKeyId = $settings->get('api_key');

    if ($host === '' || !$apiKeyId) {
      return NULL;
    }

    $key = $this->keyRepository->getKey($apiKeyId);
    if ($key) {
      $apiKey = (string) $key->getKeyValue();
    }
    else {
      // No key is stored under that ID: the configured value itself is used
      // as the API key.
      $apiKey = (string) $apiKeyId;
    }

    if ($apiKey === '') {
      return NULL;
    }

    $endpoint = rtrim($host, '/') . '/utils/token_counter';

    return [$endpoint, $apiKey];
  }

}
