<?php

declare(strict_types=1);

namespace Drupal\ai_dropsolid\Form;

use Drupal\Core\Config\Config;
use Drupal\Core\Entity\EntityTypeManagerInterface;
use Drupal\Core\Extension\ModuleHandlerInterface;
use Drupal\Core\File\FileSystemInterface;
use Drupal\Core\Form\ConfigFormBase;
use Drupal\Core\Form\FormStateInterface;
use Drupal\Core\Site\Settings;
use Drupal\Core\StreamWrapper\StreamWrapperManagerInterface;
use Drupal\Core\Url;
use Drupal\key\KeyRepositoryInterface;
use GuzzleHttp\ClientInterface;
use GuzzleHttp\Exception\GuzzleException;
use GuzzleHttp\RequestOptions;
use Symfony\Component\DependencyInjection\ContainerInterface;

/**
 * Configures tokenizer behaviour for Dropsolid AI.
 */
final class TokenizerSettingsForm extends ConfigFormBase {

  /**
   * Tokenizer mode: token counting through the LiteLLM HTTP endpoint.
   *
   * Used as the fallback mode when nothing is submitted or stored.
   */
  public const MODE_LITE_LLM = 'lite_llm';

  /**
   * Tokenizer mode: token counting through a SentencePiece CLI executable.
   */
  public const MODE_CLI_SENTENCEPIECE = 'cli_sentencepiece';

  /**
   * Tokenizer mode: no tokenizer; this mode is not validated by the form.
   */
  public const MODE_NONE = 'none';

  /**
   * Test string for tokenizer verification.
   *
   * Contains diverse characters: emojis, Unicode, punctuation, and mixed case
   * to ensure tokenizer handles various input types correctly. The same string
   * is sent to both the LiteLLM and the CLI verification routines.
   */
  private const TOKENIZER_TEST_STRING = '🚀 Dropsolid AI—tokenization test: "Hello, 世界!" #2025 ✨';

  /**
   * Instantiates the form and wires in the services it depends on.
   *
   * The parent factory creates the object; the additional services are then
   * assigned to the typed properties declared on this class.
   *
   * @param \Symfony\Component\DependencyInjection\ContainerInterface $container
   *   The service container.
   *
   * @return static
   *   The form object with all service properties populated.
   */
  public static function create(ContainerInterface $container): static {
    /** @var static $form */
    $form = parent::create($container);

    // Property name => service ID, assigned in declaration order.
    $services = [
      'fileSystem' => 'file_system',
      'keyRepository' => 'key.repository',
      'httpClient' => 'http_client',
      'moduleHandler' => 'module_handler',
      'entityTypeManager' => 'entity_type.manager',
      'streamWrapperManager' => 'stream_wrapper_manager',
      'settings' => 'settings',
    ];
    foreach ($services as $property => $serviceId) {
      $form->{$property} = $container->get($serviceId);
    }

    return $form;
  }

  /**
   * File system service.
   *
   * Prepares the tokenizer upload directory and resolves the URI of an
   * uploaded model file to a real path.
   */
  protected FileSystemInterface $fileSystem;

  /**
   * Key repository used to resolve the configured LiteLLM API key ID.
   */
  protected KeyRepositoryInterface $keyRepository;

  /**
   * HTTP client used to call the LiteLLM token counter during validation.
   */
  protected ClientInterface $httpClient;

  /**
   * Module handler used to check whether ai_provider_litellm is enabled.
   */
  protected ModuleHandlerInterface $moduleHandler;

  /**
   * Entity type manager used to load uploaded model file entities.
   */
  protected EntityTypeManagerInterface $entityTypeManager;

  /**
   * Stream wrapper manager used to check that the private scheme is valid.
   */
  protected StreamWrapperManagerInterface $streamWrapperManager;

  /**
   * Site settings, read for the 'file_private_path' value.
   */
  protected Settings $settings;

  /**
   * Lists the configuration objects this form is allowed to edit.
   *
   * @return string[]
   *   The single editable configuration name, 'ai_dropsolid.settings'.
   */
  protected function getEditableConfigNames(): array {
    return ['ai_dropsolid.settings'];
  }

  /**
   * Returns the unique identifier of this form.
   *
   * @return string
   *   The form ID.
   */
  public function getFormId(): string {
    return 'ai_dropsolid_tokenizer_settings';
  }

  /**
   * Builds the tokenizer settings form.
   *
   * Shows an error and disables the mode selector when the private file system
   * is unavailable; otherwise prepares the tokenizer upload directory. The
   * LiteLLM and CLI sub-sections are toggled client-side through #states,
   * depending on the selected tokenizer mode.
   *
   * @param array $form
   *   The form structure to extend.
   * @param \Drupal\Core\Form\FormStateInterface $form_state
   *   The current form state. A submitted mode takes precedence over the
   *   stored one.
   *
   * @return array
   *   The form structure as returned by the parent builder.
   */
  public function buildForm(array $form, FormStateInterface $form_state): array {
    $config = $this->config('ai_dropsolid.settings');
    $privateStreamAvailable = $this->isPrivateFileSystemConfigured();

    if ($privateStreamAvailable) {
      // Ensure the upload target exists and is writable before it is needed.
      $upload_location = 'private://ai_dropsolid/tokenizers';
      $this->fileSystem->prepareDirectory($upload_location, FileSystemInterface::CREATE_DIRECTORY | FileSystemInterface::MODIFY_PERMISSIONS);
    }
    else {
      $this->messenger()->addError($this->t('The private file system path is not configured. This is required for storing tokenizer model files securely. Please configure it in your <code>settings.php</code> file by adding: <code>$settings[\'file_private_path\'] = \'/path/to/private/files\';</code><br>See <a href=":url">File system configuration</a> for more information.', [
        ':url' => Url::fromRoute('system.file_system_settings')->toString(),
      ]));
    }

    // Precedence: submitted value, then stored configuration, then LiteLLM.
    $submitted_mode = $form_state->getValue(['tokenizer_settings', 'mode']);
    $selected_mode = $submitted_mode ?? $config->get('tokenizer.mode') ?? self::MODE_LITE_LLM;

    // Both sub-sections are shown only while their mode is selected.
    $mode_selector = ':input[name="tokenizer_settings[mode]"]';
    $visible_when = static fn (string $mode): array => [
      'visible' => [
        $mode_selector => ['value' => $mode],
      ],
    ];

    $llm_section = $this->buildLlmConfiguration() + ['#tree' => TRUE];
    $llm_section['#states'] = $visible_when(self::MODE_LITE_LLM);

    $cli_section = $this->buildCliConfiguration($config, $privateStreamAvailable) + ['#tree' => TRUE];
    $cli_section['#states'] = $visible_when(self::MODE_CLI_SENTENCEPIECE);

    $form['tokenizer_settings'] = [
      '#type' => 'fieldset',
      '#title' => $this->t('Tokenizer settings'),
      '#tree' => TRUE,
      'mode' => [
        '#type' => 'select',
        '#title' => $this->t('Tokenizer mode'),
        '#options' => [
          self::MODE_LITE_LLM => $this->t('LiteLLM (HTTP) tokenizer – default'),
          self::MODE_CLI_SENTENCEPIECE => $this->t('CLI SentencePiece tokenizer'),
          self::MODE_NONE => $this->t('No tokenizer'),
        ],
        '#default_value' => $selected_mode,
        '#description' => $this->t('Choose how token counting should be handled when Dropsolid embedding engines are in use. LiteLLM always proxies tokenization over HTTP.'),
        '#disabled' => !$privateStreamAvailable,
      ],
      'mode_help' => [
        '#type' => 'item',
        '#markup' => $this->t('<ul><li><strong>LiteLLM</strong>: Slower, but requires no additional server setup.</li><li><strong>CLI SentencePiece</strong>: Faster, but needs a compiled SentencePiece executable available on the server.</li></ul>'),
      ],
      'llm_configuration' => $llm_section,
      'cli_configuration' => $cli_section,
    ];

    return parent::buildForm($form, $form_state);
  }

  /**
   * Tells whether the private file system can be used.
   *
   * Requires both a valid "private" stream wrapper scheme and a non-empty
   * 'file_private_path' setting that points at an existing directory. Any
   * exception raised during the check is logged as a warning and treated as
   * "not configured".
   *
   * @return bool
   *   TRUE when the private scheme is valid and its configured path is an
   *   existing directory, FALSE otherwise.
   */
  private function isPrivateFileSystemConfigured(): bool {
    try {
      $schemeRegistered = $this->streamWrapperManager->isValidScheme('private');
      if ($schemeRegistered) {
        // Read the path from settings rather than resolving the stream
        // wrapper, which avoids realpath() issues.
        $configuredPath = $this->settings->get('file_private_path');
        if (empty($configuredPath)) {
          return FALSE;
        }
        return is_dir($configuredPath);
      }

      return FALSE;
    }
    catch (\Exception $exception) {
      $this->logger('ai_dropsolid')->warning('Failed to verify private file system: @message', ['@message' => $exception->getMessage()]);
      return FALSE;
    }
  }

  /**
   * Builds the configuration fieldset for LLM-backed tokenizers.
   *
   * The fieldset is informational only. It contains one of:
   * - an error notice when the ai_provider_litellm module is not enabled;
   * - a description plus a warning when no LiteLLM host is configured;
   * - a description plus the currently configured LiteLLM host.
   *
   * @return array
   *   Form array for the LiteLLM section.
   */
  private function buildLlmConfiguration(): array {
    $fieldset = [
      '#type' => 'fieldset',
      '#title' => $this->t('LiteLLM tokenizer'),
    ];

    // The settings route belongs to ai_provider_litellm. Generating a URL for
    // it while that module is not installed throws a RouteNotFoundException,
    // so the module check must come before any link to that route is built.
    if (!$this->moduleHandler->moduleExists('ai_provider_litellm')) {
      $fieldset['notice'] = [
        '#type' => 'container',
        '#attributes' => ['class' => ['messages', 'messages--error']],
        'message' => [
          '#markup' => $this->t('Enable the LiteLLM provider module on the <a href=":url">Extend</a> page, then configure its authentication settings.', [
            ':url' => Url::fromRoute('system.modules_list')->toString(),
          ]),
        ],
      ];
      return $fieldset;
    }

    $settingsUrl = Url::fromRoute('ai_provider_litellm.settings_form')->toString();

    $fieldset['description'] = [
      '#type' => 'markup',
      '#markup' => $this->t('LiteLLM tokenization reuses your <a href=":url">LiteLLM authentication</a> configuration.<br />If you are using the <strong>@engine</strong> embedding engine the tokenizer automatically switches to the <em>multilingual-E5-Large-Instruct</em> configuration. <strong>This is slower - because tokenization happens over the HTTP.</strong>', [
        ':url' => $settingsUrl,
        '@engine' => 'eu-e5large-embeddings-selfhosted',
      ]),
      '#prefix' => '<p>',
      '#suffix' => '</p>',
    ];

    $liteConfig = $this->configFactory->get('ai_provider_litellm.settings');
    $host = (string) ($liteConfig->get('host') ?? '');

    if ($host === '') {
      $fieldset['notice'] = [
        '#type' => 'container',
        '#attributes' => ['class' => ['messages', 'messages--warning']],
        'message' => [
          '#markup' => $this->t('LiteLLM host is not configured. Please provide it at <a href=":url">LiteLLM authentication</a>.', [
            ':url' => $settingsUrl,
          ]),
        ],
      ];
    }
    else {
      $fieldset['summary'] = [
        '#type' => 'item',
        '#markup' => $this->t('Current LiteLLM host: <code>@host</code>', ['@host' => $host]),
      ];
    }

    return $fieldset;
  }

  /**
   * Builds configuration options for the CLI SentencePiece mode.
   *
   * Every element is disabled while the private file system is unavailable,
   * because uploaded model files are stored under the private scheme.
   *
   * @param \Drupal\Core\Config\Config $config
   *   The configuration object providing the stored 'tokenizer.cli.*' values.
   * @param bool $privateStreamAvailable
   *   Whether the private file system is available.
   *
   * @return array
   *   Form array for CLI configuration.
   */
  private function buildCliConfiguration(Config $config, bool $privateStreamAvailable): array {
    $disabled = !$privateStreamAvailable;
    $modelFileId = $config->get('tokenizer.cli.model_file');

    $fieldset = [
      '#type' => 'fieldset',
      '#title' => $this->t('CLI SentencePiece configuration'),
    ];

    $fieldset['executable_name'] = [
      '#type' => 'textfield',
      '#title' => $this->t('Executable name'),
      '#default_value' => $config->get('tokenizer.cli.executable_name') ?? 'spm_encode',
      '#required' => TRUE,
      '#disabled' => $disabled,
      '#description' => $this->t('Name of the SentencePiece executable (e.g. @example).', ['@example' => 'spm_encode']),
    ];

    $fieldset['executable_path'] = [
      '#type' => 'textfield',
      '#title' => $this->t('Executable path'),
      '#default_value' => $config->get('tokenizer.cli.executable_path') ?? '',
      '#disabled' => $disabled,
      '#description' => $this->t('Absolute path to the executable on the server if it is not available in $PATH.'),
    ];

    $fieldset['model_file'] = [
      '#type' => 'managed_file',
      '#title' => $this->t('Upload tokenizer model → eu-e5large-embeddings-selfhosted'),
      '#multiple' => FALSE,
      '#autoupload' => TRUE,
      // A managed_file element expects a list of file IDs as its default.
      '#default_value' => $modelFileId ? [$modelFileId] : NULL,
      '#disabled' => $disabled,
      '#description' => $this->t('Upload the SentencePiece model file (e.g. sentencepiece.bpe.model). Files are stored in the private file system. For eu-e5large-embeddings-selfhosted - use the multilingual-E5-Large-Instruct model from HuggingFace.'),
      '#upload_location' => 'private://ai_dropsolid/tokenizers',
      '#upload_validators' => [
        // The FileExtension constraint reads its list from the named
        // 'extensions' option; a positional list is the legacy
        // file_validate_extensions() argument format.
        'FileExtension' => ['extensions' => 'model json bpe'],
      ],
    ];

    $fieldset['model_path'] = [
      '#type' => 'textfield',
      '#title' => $this->t('Model file path → eu-e5large-embeddings-selfhosted'),
      '#default_value' => $config->get('tokenizer.cli.model_path') ?? '',
      '#disabled' => $disabled,
      '#description' => $this->t('Provide a direct path to an existing SentencePiece model file as an alternative to uploading.'),
    ];

    return $fieldset;
  }

  /**
   * Validates the submitted tokenizer settings by testing the chosen mode.
   *
   * - CLI SentencePiece: requires an uploaded model file or a model path, then
   *   runs the CLI tokenizer test.
   * - LiteLLM: requires the ai_provider_litellm module, then runs the remote
   *   tokenizer test and stores its result in the form state.
   * - Any other mode: nothing is validated.
   *
   * @param array $form
   *   The form structure.
   * @param \Drupal\Core\Form\FormStateInterface $form_state
   *   The current form state.
   */
  public function validateForm(array &$form, FormStateInterface $form_state): void {
    parent::validateForm($form, $form_state);

    $submitted = $form_state->getValue('tokenizer_settings');
    if (!is_array($submitted)) {
      return;
    }

    switch ($submitted['mode'] ?? self::MODE_LITE_LLM) {
      case self::MODE_CLI_SENTENCEPIECE:
        $modelElement = 'tokenizer_settings][cli_configuration][model_file';
        $cliValues = $submitted['cli_configuration'] ?? [];

        // At least one model source is needed before the tokenizer can run.
        if (empty($cliValues['model_file'][0]) && empty($cliValues['model_path'])) {
          $form_state->setErrorByName($modelElement, $this->t('Upload a SentencePiece model file or provide a path to one.'));
          break;
        }

        $this->reportVerification($this->testCliTokenizer($cliValues), $form_state, $modelElement);
        break;

      case self::MODE_LITE_LLM:
        $modeElement = 'tokenizer_settings][mode';

        if (!$this->moduleHandler->moduleExists('ai_provider_litellm')) {
          $form_state->setErrorByName($modeElement, $this->t('Enable the LiteLLM provider module before using the LiteLLM tokenizer.'));
          break;
        }

        $result = $this->testLiteLlmTokenizer();
        $this->reportVerification($result, $form_state, $modeElement);
        $form_state->set('ai_dropsolid_litellm_verification', $result);
        break;

      default:
        // MODE_NONE and unknown modes need no validation.
        break;
    }
  }

  /**
   * Surfaces a tokenizer verification result to the user.
   *
   * @param array $verification
   *   Status array with 'status' (success|warning|error) and 'message'.
   * @param \Drupal\Core\Form\FormStateInterface $form_state
   *   The form state that receives the error for an 'error' status.
   * @param string $errorElement
   *   Name of the form element an 'error' status is attached to.
   */
  private function reportVerification(array $verification, FormStateInterface $form_state, string $errorElement): void {
    $status = $verification['status'];

    if ($status === 'error') {
      $form_state->setErrorByName($errorElement, $verification['message']);
      return;
    }
    if ($status === 'warning') {
      $this->messenger()->addWarning($verification['message']);
      return;
    }
    if ($status === 'success') {
      $this->messenger()->addStatus($verification['message']);
    }
  }

  /**
   * Saves the submitted tokenizer settings to 'ai_dropsolid.settings'.
   *
   * Stores the mode, removes any 'tokenizer.llm' values, and persists the CLI
   * settings. An uploaded model file is marked permanent and its ID stored;
   * when no file is submitted the stored file ID is cleared.
   *
   * @param array $form
   *   The form structure.
   * @param \Drupal\Core\Form\FormStateInterface $form_state
   *   The current form state.
   */
  public function submitForm(array &$form, FormStateInterface $form_state): void {
    $values = $form_state->getValue('tokenizer_settings');
    $config = $this->config('ai_dropsolid.settings');

    $config->set('tokenizer.mode', $values['mode']);

    $config->clear('tokenizer.llm');

    $cli = $values['cli_configuration'] ?? [];
    if (!empty($cli)) {
      // Validation tests the trimmed values, so store them trimmed as well;
      // otherwise a value with stray whitespace would pass validation and
      // then be saved in a form that was never tested.
      foreach (['executable_name', 'executable_path', 'model_path'] as $key) {
        $config->set('tokenizer.cli.' . $key, trim((string) ($cli[$key] ?? '')));
      }

      if (!empty($cli['model_file'][0])) {
        /** @var \Drupal\file\FileInterface|null $file */
        $file = $this->entityTypeManager->getStorage('file')->load($cli['model_file'][0]);
        if ($file) {
          // Mark the upload permanent before its ID is stored in config.
          $file->setPermanent();
          $file->save();
          $config->set('tokenizer.cli.model_file', (int) $file->id());
        }
      }
      else {
        $config->clear('tokenizer.cli.model_file');
      }
    }
    else {
      $config->clear('tokenizer.cli');
    }

    $config->save();

    parent::submitForm($form, $form_state);
  }

  /**
   * Tests LiteLLM tokenizer availability.
   *
   * Posts the fixed test string to the "/utils/token_counter" endpoint of the
   * configured LiteLLM host, authenticating with the configured API key, and
   * inspects the tokenizer type reported in the JSON response.
   *
   * @return array
   *   Status array with 'status' (success|warning|error) and 'message':
   *   - error: host or key missing, request failed, or non-JSON response;
   *   - warning: the endpoint answered with an unexpected tokenizer type;
   *   - success: the endpoint reported 'huggingface_tokenizer'.
   */
  private function testLiteLlmTokenizer(): array {
    $liteConfig = $this->configFactory->get('ai_provider_litellm.settings');
    $endpoint = trim((string) ($liteConfig->get('host') ?? ''));
    $apiKeyId = $liteConfig->get('api_key');

    // Built lazily: only the configuration error paths need the link.
    $settingsUrl = static fn (): string => Url::fromRoute('ai_provider_litellm.settings_form')->toString();

    if ($endpoint === '' || !$apiKeyId) {
      return [
        'status' => 'error',
        'message' => $this->t('LiteLLM host or API key is missing. Configure them at <a href=":url">LiteLLM authentication</a>.', [
          ':url' => $settingsUrl(),
        ]),
      ];
    }

    // When the configured value is not a known key ID it is used as the key
    // itself. The cast turns a key that resolves to no value into '' so that
    // it is rejected below instead of being sent as an empty bearer token.
    $key = $this->keyRepository->getKey($apiKeyId);
    $apiKey = (string) ($key ? $key->getKeyValue() : $apiKeyId);
    if ($apiKey === '') {
      return [
        'status' => 'error',
        'message' => $this->t('Unable to resolve a LiteLLM API key value. Verify it at <a href=":url">LiteLLM authentication</a>.', [
          ':url' => $settingsUrl(),
        ]),
      ];
    }

    $uri = rtrim($endpoint, '/') . '/utils/token_counter';

    try {
      $response = $this->httpClient->request('POST', $uri, [
        RequestOptions::HEADERS => [
          'Authorization' => 'Bearer ' . $apiKey,
        ],
        RequestOptions::JSON => [
          'model' => 'eu-e5large-embeddings-selfhosted',
          'prompt' => self::TOKENIZER_TEST_STRING,
        ],
        RequestOptions::TIMEOUT => 10,
      ]);

      $body = (string) $response->getBody();
      $decoded = json_decode($body, TRUE);
      if (!is_array($decoded)) {
        $this->logger('ai_dropsolid')->warning('LiteLLM tokenizer returned an unexpected response: @body', ['@body' => $body]);
        return [
          'status' => 'error',
          'message' => $this->t('LiteLLM tokenizer returned an unexpected response. Check the endpoint configuration.'),
        ];
      }

      // Only the HuggingFace tokenizer counts as verified; any other type is
      // reported as a warning.
      $tokenizerType = $decoded['tokenizer_type'] ?? NULL;
      if ($tokenizerType === 'huggingface_tokenizer') {
        return [
          'status' => 'success',
          'message' => $this->t('LiteLLM tokenizer verified (huggingface_tokenizer, @count tokens).', ['@count' => $decoded['total_tokens'] ?? $this->t('unknown')]),
        ];
      }

      return [
        'status' => 'warning',
        'message' => $this->t('LiteLLM tokenizer responded with type @type. Expected huggingface_tokenizer.', ['@type' => $tokenizerType ?? $this->t('unknown')]),
      ];
    }
    catch (GuzzleException $e) {
      $this->logger('ai_dropsolid')->error('LiteLLM tokenizer verification failed: @message', ['@message' => $e->getMessage()]);
      return [
        'status' => 'error',
        'message' => $this->t('Unable to reach LiteLLM tokenizer endpoint. Check credentials and endpoint configuration.'),
      ];
    }
  }

  /**
   * Tests CLI SentencePiece tokenizer availability and functionality.
   *
   * Checks that the executable can be found, resolves the model file (an
   * uploaded file takes precedence over a manually entered path), then pipes
   * the fixed test string through the executable and counts the whitespace
   * separated pieces it prints.
   *
   * @param array $cli
   *   Configuration array containing executable_name, executable_path,
   *   model_file, and model_path.
   *
   * @return array
   *   Status array with 'status' (success|warning|error) and 'message'.
   */
  private function testCliTokenizer(array $cli): array {
    // The casts keep trim() from throwing under strict_types when a value is
    // not a string.
    $executableName = trim((string) ($cli['executable_name'] ?? ''));
    $executablePath = trim((string) ($cli['executable_path'] ?? ''));
    $modelPath = trim((string) ($cli['model_path'] ?? ''));

    if ($executableName === '') {
      return [
        'status' => 'error',
        'message' => $this->t('SentencePiece executable name is required.'),
      ];
    }

    // A function listed in disable_functions is undefined on PHP 8, so calling
    // it would be a fatal error rather than a validation message.
    if (!function_exists('exec')) {
      return [
        'status' => 'error',
        'message' => $this->t('The PHP exec() function is not available on this server, so the CLI tokenizer cannot be used.'),
      ];
    }

    $fullExecutablePath = $executablePath !== '' ? $executablePath . '/' . $executableName : $executableName;

    // Only the exit code matters here; exec() needs the output argument in
    // order to expose it.
    $lookupOutput = [];
    $returnCode = 0;
    exec(sprintf('command -v %s', escapeshellarg($fullExecutablePath)), $lookupOutput, $returnCode);

    if ($returnCode !== 0) {
      return [
        'status' => 'error',
        'message' => $this->t('SentencePiece executable "@executable" not found. Ensure it is installed and available in the specified path.', [
          '@executable' => $fullExecutablePath,
        ]),
      ];
    }

    if (!empty($cli['model_file'][0])) {
      /** @var \Drupal\file\FileInterface|null $file */
      $file = $this->entityTypeManager->getStorage('file')->load($cli['model_file'][0]);
      if (!$file) {
        return [
          'status' => 'error',
          'message' => $this->t('Unable to load the uploaded model file.'),
        ];
      }
      // The executable needs a real filesystem path, not a stream URI.
      $modelPath = $this->fileSystem->realpath($file->getFileUri());
      if (!$modelPath || !file_exists($modelPath)) {
        return [
          'status' => 'error',
          'message' => $this->t('Uploaded model file does not exist on the filesystem.'),
        ];
      }
    }
    elseif ($modelPath !== '') {
      if (!file_exists($modelPath)) {
        return [
          'status' => 'error',
          'message' => $this->t('Model file path "@path" does not exist.', ['@path' => $modelPath]),
        ];
      }
    }
    else {
      return [
        'status' => 'error',
        'message' => $this->t('No model file or model path provided.'),
      ];
    }

    $testCommand = sprintf(
      'echo %s | %s --model=%s --output_format=piece --extra_options=bos:eos 2>&1',
      escapeshellarg(self::TOKENIZER_TEST_STRING),
      escapeshellarg($fullExecutablePath),
      escapeshellarg($modelPath)
    );

    // exec() appends to an existing array, so start from an empty one.
    $testOutput = [];
    $testReturnCode = 0;
    exec($testCommand, $testOutput, $testReturnCode);

    if ($testReturnCode !== 0) {
      $errorMessage = implode("\n", $testOutput);
      $this->logger('ai_dropsolid')->error('CLI tokenizer test failed: @message', ['@message' => $errorMessage]);
      return [
        'status' => 'error',
        'message' => $this->t('CLI tokenizer test failed. Verify the executable and model file are compatible.'),
      ];
    }

    if (empty($testOutput)) {
      return [
        'status' => 'warning',
        'message' => $this->t('CLI tokenizer executed but returned no output. Verify the configuration.'),
      ];
    }

    // Pieces are separated by whitespace. Lines are joined with a newline so
    // the last piece of one line is never fused with the first piece of the
    // next; the split below discards the extra whitespace again.
    $rawOutput = implode("\n", $testOutput);
    $tokens = preg_split('/\s+/', trim($rawOutput), -1, PREG_SPLIT_NO_EMPTY) ?: [];
    $tokenCount = count($tokens);

    return [
      'status' => 'success',
      'message' => $this->t('CLI tokenizer verified: @count tokens produced.', ['@count' => $tokenCount]),
    ];
  }

}
