<?php

declare(strict_types=1);

namespace Drupal\rrf\Plugin\search_api\processor;

use Drupal\Core\Entity\EntityStorageInterface;
use Drupal\Core\Entity\EntityTypeManager;
use Drupal\Core\Entity\EntityTypeManagerInterface;
use Drupal\Core\Entity\EntityTypeRepositoryInterface;
use Drupal\Core\Form\FormStateInterface;
use Drupal\Core\Plugin\PluginFormInterface;
use Drupal\rrf\Service\RrfService;
use Drupal\search_api\Entity\Index;
use Drupal\search_api\IndexInterface;
use Drupal\search_api\Plugin\PluginFormTrait;
use Drupal\search_api\Processor\ProcessorPluginBase;
use Drupal\search_api\Query\QueryInterface;
use Drupal\search_api\Query\ResultSetInterface;
use Drupal\search_api\ServerInterface;
use Psr\Log\LoggerInterface;
use Symfony\Component\DependencyInjection\ContainerInterface;

/**
 * RAG search processor using Reciprocal Rank Fusion (RRF).
 *
 * This processor implements hybrid search for Retrieval Augmented Generation
 * (RAG) by combining two complementary search approaches:
 *
 * 1. Semantic Search (Vector): Uses embeddings to find conceptually similar
 *    content, even when exact keywords don't match.
 * 2. Keyword Search (Traditional): Uses exact term matching and relevance
 *    scoring based on term frequency and field boosting.
 *
 * RRF Algorithm:
 * For each result from both searches, calculates: score = 1 / (k + rank)
 * where k is a constant (default 60) and rank is the position in results.
 * Final scores are summed across both result sets.
 *
 * Note that fusion only re-scores and re-orders the items the keyword query
 * returned: an item found solely by the semantic query is not added to the
 * result set.
 *
 * Use cases:
 * - RAG systems needing both semantic understanding and exact matches
 * - Search where users may use natural language or specific terms
 * - Content discovery requiring conceptual similarity
 *
 * The annotation description is deliberately kept on a single line so that
 * docblock continuation markers cannot end up inside the translated string.
 *
 * @SearchApiProcessor(
 *   id = "database_rrf_boost",
 *   label = @Translation("RRF Hybrid Search"),
 *   description = @Translation("Combines semantic vector search with keyword search using Reciprocal Rank Fusion (RRF). Unlike simple boosting that prepends AI results, RRF mathematically merges rankings from both sources for balanced hybrid RAG retrieval. Best for: natural language queries, conceptual search, and RAG systems requiring both semantic understanding and exact term matching."),
 *   stages = {
 *     "preprocess_query" = -10,
 *     "postprocess_query" = 0,
 *   }
 * )
 */
class DatabaseRrfProcessor extends ProcessorPluginBase implements PluginFormInterface {
  use PluginFormTrait;

  /**
   * Backend plugin ID of the keyword (database) server this processor supports.
   */
  protected const KEYWORD_BACKEND_ID = 'search_api_db';

  /**
   * Backend plugin ID identifying servers usable as the semantic index.
   */
  protected const SEMANTIC_BACKEND_ID = 'search_api_ai_search';

  /**
   * Default number of results requested from the semantic index.
   */
  protected const DEFAULT_NUMBER_TO_RETURN = 10;

  /**
   * Default RRF "k" constant.
   */
  protected const DEFAULT_K_CONSTANT = 60;

  /**
   * Storage handler for Search API index entities.
   */
  protected EntityStorageInterface $searchApiIndex;

  /**
   * Constructs the processor and resolves the index entity storage.
   *
   * @param array $configuration
   *   The plugin configuration.
   * @param string $plugin_id
   *   The plugin ID.
   * @param array $plugin_definition
   *   The plugin definition.
   * @param \Psr\Log\LoggerInterface $logger
   *   Logger channel used for all "RRF:" messages.
   * @param \Drupal\Core\Entity\EntityTypeRepositoryInterface $entityTypeRepository
   *   Used to resolve the entity type ID of the Search API index class.
   * @param \Drupal\Core\Entity\EntityTypeManagerInterface $entityTypeManager
   *   Used to obtain the index storage. Typed against the interface so that
   *   decorated or mocked services are accepted.
   * @param \Drupal\rrf\Service\RrfService $rrfService
   *   Service performing the actual rank fusion.
   */
  public function __construct(
    array $configuration,
    $plugin_id,
    array $plugin_definition,
    protected LoggerInterface $logger,
    protected EntityTypeRepositoryInterface $entityTypeRepository,
    protected EntityTypeManagerInterface $entityTypeManager,
    protected RrfService $rrfService,
  ) {
    parent::__construct($configuration, $plugin_id, $plugin_definition);
    $indexEntityTypeId = $this->entityTypeRepository->getEntityTypeFromClass(Index::class);
    $this->searchApiIndex = $this->entityTypeManager->getStorage($indexEntityTypeId);
  }

  /**
   * {@inheritdoc}
   */
  public static function create(
    ContainerInterface $container,
    array $configuration,
    $plugin_id,
    $plugin_definition,
  ) : static {
    return new static(
      $configuration,
      $plugin_id,
      $plugin_definition,
      $container->get('logger.channel.rrf'),
      $container->get('entity_type.repository'),
      $container->get('entity_type.manager'),
      $container->get('rrf.fusion'),
    );
  }

  /**
   * {@inheritdoc}
   */
  public static function supportsIndex(IndexInterface $index): bool {
    // Check the server is loadable before asking for it, so that an index
    // pointing at a missing or deleted server is reported as unsupported
    // instead of raising an exception.
    if (!$index->hasValidServer()) {
      return FALSE;
    }
    return $index->getServerInstance()->getBackendId() === self::KEYWORD_BACKEND_ID;
  }

  /**
   * {@inheritdoc}
   */
  public function defaultConfiguration(): array {
    return [
      'search_api_ai_index' => '',
      'number_to_return' => self::DEFAULT_NUMBER_TO_RETURN,
      'rrf_k_constant' => self::DEFAULT_K_CONSTANT,
    ];
  }

  /**
   * {@inheritdoc}
   */
  public function buildConfigurationForm(array $form, FormStateInterface $form_state): array {
    // Offer every other index whose server uses the semantic backend.
    $options = [];
    /** @var \Drupal\search_api\IndexInterface[] $indexes */
    $indexes = $this->searchApiIndex->loadMultiple();
    foreach ($indexes as $index) {
      // Skip the index this processor is attached to, and any index whose
      // server is unset or can no longer be loaded.
      if ($index->id() === $this->index->id() || !$index->hasValidServer()) {
        continue;
      }
      $server = $index->getServerInstance();
      if (!$server instanceof ServerInterface || $server->getBackendId() !== self::SEMANTIC_BACKEND_ID) {
        continue;
      }
      $options[$index->id()] = $index->label() . ' (' . $server->label() . ')';
    }

    $form['search_api_ai_index'] = [
      '#type' => 'select',
      '#title' => $this->t('Semantic Search Index'),
      '#description' => $this->t('Select the vector database index for semantic search. RRF will combine semantic understanding with keyword matching for better RAG results.'),
      '#options' => $options,
      '#empty_option' => $this->t('- Select -'),
    ];

    // Only preselect the stored index if it is still a valid choice.
    $searchApiAiIndex = $this->configuration['search_api_ai_index'] ?? '';
    if ($searchApiAiIndex && array_key_exists($searchApiAiIndex, $options)) {
      $form['search_api_ai_index']['#default_value'] = $searchApiAiIndex;
    }

    $form['number_to_return'] = [
      '#type' => 'number',
      '#step' => 1,
      // A limit below 1 makes getVectorResults() bail out, so reject it here.
      '#min' => 1,
      '#required' => TRUE,
      '#title' => $this->t('Number of semantic results'),
      '#description' => $this->t('How many results to fetch from semantic search. RRF will merge these with keyword search for hybrid RAG retrieval. Recommended: 10-20.'),
      '#default_value' => $this->configuration['number_to_return'] ?? self::DEFAULT_NUMBER_TO_RETURN,
    ];

    $form['rrf_k_constant'] = [
      '#type' => 'number',
      '#step' => 1,
      // A negative k could make the denominator (k + rank) zero or negative.
      '#min' => 0,
      '#required' => TRUE,
      '#title' => $this->t('Ranking balance (k constant)'),
      '#description' => $this->t('Controls how semantic and keyword rankings are fused. Uses formula: score = 1/(k + rank). <strong>Default (60):</strong> Balanced fusion, recommended for most cases. <strong>Lower (20-40):</strong> Aggressive - top results dominate, use when confident in ranking quality. <strong>Higher (80-100):</strong> Democratic - spreads weight more evenly, use for diverse result sets.'),
      '#default_value' => $this->configuration['rrf_k_constant'] ?? self::DEFAULT_K_CONSTANT,
    ];

    return $form;
  }

  /**
   * {@inheritdoc}
   */
  public function validateConfigurationForm(array &$form, FormStateInterface $form_state): void {
    if (empty($form_state->getValue('search_api_ai_index'))) {
      $form_state->setErrorByName('search_api_ai_index', $this->t('Choose a vector database index.'));
    }
  }

  /**
   * Fetches semantic search results before main query execution.
   *
   * Runs a separate query against the configured vector database index and
   * stores its results in the "rrf_ai_results" query option for later fusion.
   * Nothing is stored when the query has no keys or the semantic query yields
   * no results.
   *
   * @param \Drupal\search_api\Query\QueryInterface $query
   *   The search query being preprocessed.
   */
  public function preprocessSearchQuery(QueryInterface $query): void {
    $querystringKeys = $query->getKeys();
    if (!$querystringKeys) {
      $this->logger->debug('RRF: No search keys provided');
      return;
    }

    $this->logger->info('RRF: Preprocessing query with keys: @keys', [
      '@keys' => $this->keysToString($querystringKeys),
    ]);

    $aiResults = $this->getVectorResults($querystringKeys);
    if (!$aiResults) {
      $this->logger->warning('RRF: No semantic results found');
      return;
    }

    $this->logger->info('RRF: Found @count semantic results', ['@count' => count($aiResults)]);
    $query->setOption('rrf_ai_results', $aiResults);
  }

  /**
   * Flattens search keys into a single string for log output.
   *
   * Array keys may contain nested groups and "#"-prefixed metadata entries
   * (such as the conjunction); a plain implode() would emit "Array to string
   * conversion" warnings for the former and log the latter as if they were
   * search terms.
   *
   * @param string|array $keys
   *   The search keys, either as a raw string or as a (nested) array.
   *
   * @return string
   *   The scalar terms, separated by single spaces.
   */
  protected function keysToString(string|array $keys): string {
    if (!is_array($keys)) {
      return $keys;
    }

    $terms = [];
    array_walk_recursive($keys, static function (mixed $value, int|string $name) use (&$terms): void {
      $isMetadata = is_string($name) && str_starts_with($name, '#');
      if (!$isMetadata && is_scalar($value)) {
        $terms[] = (string) $value;
      }
    });

    return implode(' ', $terms);
  }

  /**
   * Queries the vector database for semantic search results.
   *
   * Failures are logged and reported as an empty result so that the keyword
   * search can still proceed unmodified.
   *
   * @param string|array $keywords
   *   Search keywords to find semantically similar content.
   *
   * @return array
   *   Array of entity IDs with their semantic similarity scores.
   *   Format: ['entity:node/123:en' => 0.95, ...]
   */
  protected function getVectorResults(string|array $keywords): array {
    $limit = (int) ($this->configuration['number_to_return'] ?? self::DEFAULT_NUMBER_TO_RETURN);
    if ($limit <= 0) {
      $this->logger->warning('RRF: Invalid limit: @limit', ['@limit' => $limit]);
      return [];
    }

    $indexId = $this->configuration['search_api_ai_index'] ?? '';
    if (empty($indexId)) {
      $this->logger->warning('RRF: No semantic index configured');
      return [];
    }

    try {
      /** @var \Drupal\search_api\IndexInterface|null $aiIndex */
      $aiIndex = $this->searchApiIndex->load($indexId);
      if (!$aiIndex) {
        $this->logger->error('RRF: Could not load semantic index: @id', ['@id' => $indexId]);
        return [];
      }

      $this->logger->debug('RRF: Querying semantic index @id with limit @limit', [
        '@id' => $indexId,
        '@limit' => $limit,
      ]);

      $query = $aiIndex->query(['limit' => $limit]);
      // NOTE(review): access checks are bypassed for the semantic query. Its
      // results are only used to re-score items the main query returned (see
      // postprocessSearchResults()) - confirm this stays true if that changes.
      $query->setOption('search_api_bypass_access', TRUE);
      $query->keys($keywords);

      $aiEntityIds = [];
      foreach ($query->execute()->getResultItems() as $result) {
        $aiEntityIds[$result->getId()] = $result->getScore();
      }

      $this->logger->debug('RRF: Retrieved @count results from semantic search', [
        '@count' => count($aiEntityIds),
      ]);

      return $aiEntityIds;
    }
    catch (\Exception $e) {
      // Best effort: a failing semantic backend must not break keyword search.
      $this->logger->error('RRF: Exception in getVectorResults: @message', [
        '@message' => $e->getMessage(),
      ]);
      return [];
    }
  }

  /**
   * Applies RRF fusion to combine semantic and keyword search results.
   *
   * Takes the keyword search results (from main query) and semantic results
   * (from preprocessing) and merges them using the RRF algorithm. Only items
   * already present in the keyword results are re-scored and re-sorted.
   *
   * RRF Formula: final_score = Σ(1 / (k + rank_in_source))
   *
   * Example with k=60:
   * - Item ranked #1 in semantic: 1/(60+1) = 0.0164
   * - Same item ranked #3 in keyword: 1/(60+3) = 0.0159
   * - Combined RRF score: 0.0323
   *
   * @param \Drupal\search_api\Query\ResultSetInterface $results
   *   The search results to rerank using RRF.
   */
  public function postprocessSearchResults(ResultSetInterface $results): void {
    $aiResults = $results->getQuery()->getOption('rrf_ai_results');

    $this->logger->info('RRF: Postprocessing - AI results present: @present', [
      '@present' => $aiResults ? 'yes (' . count($aiResults) . ')' : 'no',
    ]);

    if (!$aiResults) {
      return;
    }

    // Fetch the items once; they are used both as fusion input and as the
    // objects whose scores get updated.
    $items = $results->getResultItems();
    $keywordResults = [];
    foreach ($items as $item) {
      $keywordResults[$item->getId()] = $item->getScore();
    }

    $this->logger->debug('RRF: Keyword results: @count', ['@count' => count($keywordResults)]);

    $k = (int) ($this->configuration['rrf_k_constant'] ?? self::DEFAULT_K_CONSTANT);
    if ($k < 0) {
      // A negative k could make the denominator (k + rank) zero or negative.
      $this->logger->warning('RRF: Invalid k constant: @k, using default', ['@k' => $k]);
      $k = self::DEFAULT_K_CONSTANT;
    }

    $combined = $this->rrfService->fuse([
      'semantic' => $aiResults,
      'keyword' => $keywordResults,
    ], $k);

    $this->logger->debug('RRF: Combined results: @count', ['@count' => count($combined)]);

    // Apply RRF scores to existing items; items without a fused score keep
    // their original score.
    foreach ($items as $item) {
      $itemId = $item->getId();
      if (isset($combined[$itemId])) {
        $item->setScore($combined[$itemId]);
      }
    }

    // Sort by RRF score descending.
    uasort($items, static fn($a, $b) => $b->getScore() <=> $a->getScore());
    $results->setResultItems($items);

    $this->logger->info('RRF: Fusion complete');
  }

}
