<?php

declare(strict_types=1);

namespace Drupal\rrf\Service;

/**
 * Reciprocal Rank Fusion (RRF) service for hybrid search.
 *
 * Implements the RRF algorithm to combine multiple ranked result sets
 * into a single unified ranking. RRF is particularly effective for
 * hybrid RAG systems that need to merge semantic (vector) and keyword
 * search results without score normalization.
 */
class RrfService {

  /**
   * Fuses multiple ranked result sets using Reciprocal Rank Fusion.
   *
   * RRF Algorithm:
   * For each item in each result set, calculate: score = 1 / (k + rank),
   * where rank is the item's 1-based position within that result set.
   * Sum scores across all result sets for each unique item.
   * Sort by final combined score descending.
   *
   * Why RRF works:
   * - Rank-based: Ignores raw scores, only uses position in results
   * - No normalization: Different scoring scales don't matter
   * - Democratic: Items appearing in multiple sources get boosted
   * - Tunable: Single k parameter controls ranking sensitivity
   *
   * Example with k=60:
   * Item A: Rank 1 in semantic (1/61=0.0164) + Rank 3 in keyword (1/63=0.0159)
   *         = 0.0323 combined score
   * Item B: Rank 2 in semantic only (1/62=0.0161)
   *         = 0.0161 combined score
   * Result: Item A ranks higher (appears in both sources)
   *
   * @param array $resultsSets
   *   Array of result sets to fuse. Keys are source names (e.g., 'semantic',
   *   'keyword'), values are arrays with entity IDs as keys and scores as
   *   values. The original scores are ignored; only the iteration order of
   *   each inner array matters, so each set must already be sorted best-first.
   *   Example:
   *   [
   *     'semantic' => ['entity:node/1:en' => 0.95, 'entity:node/2:en' => 0.85],
   *     'keyword' => ['entity:node/3:en' => 100, 'entity:node/1:en' => 90]
   *   ].
   * @param int $k
   *   RRF constant controlling ranking balance. Must be zero or greater.
   *   Higher values (60-100) create more democratic fusion where lower-ranked
   *   items have more influence. Lower values (20-40) make top-ranked items
   *   dominate. Default: 60 (recommended for most hybrid search scenarios).
   *
   * @return array
   *   Fused results with entity IDs as keys and RRF scores (always floats) as
   *   values, sorted by score descending. Higher scores indicate better
   *   relevance across multiple ranking sources. Items with equal scores keep
   *   the order in which they were first encountered (PHP 8 sorting is
   *   stable). An empty input yields an empty array.
   *
   * @throws \InvalidArgumentException
   *   Thrown when $k is negative. A negative constant would make the
   *   denominator (k + rank) zero or negative, which either triggers a
   *   division by zero or produces negative scores that invert the ranking.
   */
  public function fuse(array $resultsSets, int $k = 60): array {
    // Guard the denominator: with k >= 0 and rank >= 1 it is always >= 1.
    if ($k < 0) {
      throw new \InvalidArgumentException(sprintf('The RRF constant k must be zero or greater, %d given.', $k));
    }

    $rrfScores = [];

    // Process each result set (e.g., semantic, keyword).
    foreach ($resultsSets as $results) {
      $rank = 1;
      // Iterate through results in order (rank matters, score doesn't).
      foreach (array_keys($results) as $entityId) {
        // Apply RRF formula: 1 / (k + rank). Float literals keep every score
        // a float, even when the division is exact (e.g. k = 0, rank = 1).
        $rrfScores[$entityId] = ($rrfScores[$entityId] ?? 0.0) + 1.0 / ($k + $rank);
        $rank++;
      }
    }

    // Sort by combined RRF score descending, preserving the entity ID keys.
    arsort($rrfScores);
    return $rrfScores;
  }

}
