<?php

declare(strict_types=1);

namespace Drupal\ai_dropsolid\Plugin\EmbeddingStrategy;

use Drupal\ai_search\Attribute\EmbeddingStrategy;
use Drupal\ai_search\EmbeddingStrategyInterface;
use Drupal\ai_search\Plugin\EmbeddingStrategy\EmbeddingBase;
use Drupal\Core\StringTranslation\TranslatableMarkup;
use Symfony\Component\DependencyInjection\ContainerInterface;

/**
 * Provides a token-aware embedding strategy using separator-based chunking.
 *
 * This strategy replaces the default TextChunker with TokenAwareTextChunker,
 * which implements efficient token-budget management. Instead of validating
 * every chunk (N tokenization API calls), it uses density probes (3 calls)
 * and optional risk-based validation (1 call) for a total of 3-4 API calls
 * per document regardless of document size.
 *
 * **Key Features**:
 * - Token-aware chunking that respects embedding model token limits
 * - Separator hierarchy (paragraphs → sentences → words) for semantic chunks
 * - Minimal tokenization overhead (3-4 calls vs. N calls for N chunks)
 * - Configurable overlap for improved retrieval performance
 * - Support for both LiteLLM and SentencePiece tokenizers
 *
 * **Usage**:
 * Configure this strategy in your embedding settings to enable advanced
 * chunking with the TokenAwareTextChunker service. The strategy automatically
 * uses the tokenizer configured in the AI module settings.
 *
 * **Performance**:
 * - Traditional approach: 1 tokenization call per chunk = N calls
 * - This approach: 3 density probes + 1 optional validation = 3-4 calls total
 * - For 10 chunks: 60% fewer API calls (4 vs 10)
 * - For 100 chunks: 96% fewer API calls (4 vs 100)
 *
 * @see \Drupal\ai_dropsolid\Service\TokenAwareTextChunker
 * @see \Drupal\ai_search\Plugin\EmbeddingStrategy\EmbeddingBase
 */
#[EmbeddingStrategy(
  id: 'ds_token_aware_chunks',
  label: new TranslatableMarkup('Token-Aware Embedding Strategy (Separator-Based Chunking)'),
  description: new TranslatableMarkup('<strong>Dropsolid:</strong> Efficient token-aware embedding with separator-based chunking. Uses density probes (3-4 API calls total) instead of per-chunk validation (N calls). Supports LiteLLM and SentencePiece tokenizers.'),
)]
final class TokenAwareEmbeddingStrategy extends EmbeddingBase implements EmbeddingStrategyInterface {

  /**
   * {@inheritdoc}
   */
  public static function create(
    ContainerInterface $container,
    array $configuration,
    $plugin_id,
    $plugin_definition,
  ): static {
    // Resolve the replacement chunker from the container up front, before the
    // parent factory is invoked.
    /** @var \Drupal\ai\Utility\TextChunker $chunker */
    $chunker = $container->get('ai_dropsolid.token_aware_text_chunker');

    // Let the parent factory build the plugin; only the chunker is customised
    // here.
    /** @var static $strategy */
    $strategy = parent::create(
      $container,
      $configuration,
      $plugin_id,
      $plugin_definition,
    );

    // Overwrite the chunker set up by the parent with the token-aware service.
    $strategy->textChunker = $chunker;

    return $strategy;
  }

}
