<?php

declare(strict_types=1);

namespace Drupal\ai_migration\Service;

use Drupal\ai_migration\HtmlSanitizerConfigBuilder;
use Drupal\Core\Logger\LoggerChannelFactoryInterface;
use Drupal\Core\Logger\LoggerChannelInterface;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Component\HtmlSanitizer\HtmlSanitizer;
use Symfony\Component\HtmlSanitizer\HtmlSanitizerConfig;
use Symfony\Component\HtmlSanitizer\HtmlSanitizerInterface;
use Drupal\ai_migration\HtmlSanitizerConfigBuilderInterface;

/**
 * Service to process HTML content based on migration configuration.
 */
class HtmlContentProcessor implements HtmlContentProcessorInterface {

  /**
   * The HTML sanitizer.
   *
   * Either injected through setSanitizer() or lazily built from
   * $htmlSanitizerConfig by getSanitizer().
   *
   * @var \Symfony\Component\HtmlSanitizer\HtmlSanitizerInterface|null
   */
  protected ?HtmlSanitizerInterface $htmlSanitizer = NULL;

  /**
   * The HTML sanitizer config builder service.
   *
   * @var \Drupal\ai_migration\HtmlSanitizerConfigBuilderInterface|null
   */
  protected ?HtmlSanitizerConfigBuilderInterface $configBuilder = NULL;

  /**
   * The HTML sanitizer configuration.
   *
   * This typed property has no default, so it stays uninitialized until
   * setConfig() receives at least one sanitizer option.
   *
   * @var \Symfony\Component\HtmlSanitizer\HtmlSanitizerConfig
   */
  protected HtmlSanitizerConfig $htmlSanitizerConfig;

  /**
   * Whether the sanitizer config has been set.
   *
   * TRUE once a sanitizer was injected or a sanitizer config was built.
   *
   * @var bool
   */
  protected bool $isSanitizerConfigSet = FALSE;

  /**
   * The CSS selectors of the HTML containers to process.
   *
   * @var array|null
   */
  protected ?array $container = NULL;

  /**
   * The logger channel.
   *
   * @var \Drupal\Core\Logger\LoggerChannelInterface
   */
  protected LoggerChannelInterface $logger;

  /**
   * The regex patterns to strip from the HTML content.
   *
   * @var array
   */
  protected array $stripRegex = [];

  /**
   * Creates a new HtmlContentProcessor instance.
   *
   * @param \Drupal\Core\Logger\LoggerChannelFactoryInterface $logger
   *   The logger channel factory; the 'ai_migration' channel is used.
   */
  public function __construct(LoggerChannelFactoryInterface $logger) {
    $this->logger = $logger->get('ai_migration');
  }

  /**
   * {@inheritdoc}
   */
  public function setSanitizer(HtmlSanitizerInterface $htmlSanitizer): void {
    $this->htmlSanitizer = $htmlSanitizer;

    // A sanitizer object passed in would have been created with the config
    // already set. Flag the sanitizer config as set so processHtml() runs the
    // sanitization step with this instance.
    $this->isSanitizerConfigSet = TRUE;
  }

  /**
   * {@inheritdoc}
   */
  public function getSanitizer(): HtmlSanitizerInterface {
    if (!$this->htmlSanitizer instanceof HtmlSanitizerInterface) {
      // Lazily build the sanitizer from the current config. This fails with a
      // descriptive exception instead of PHP's uninitialized-property error
      // when neither a sanitizer nor a sanitizer config was ever provided.
      $this->htmlSanitizer = new HtmlSanitizer($this->requireSanitizerConfig());
    }

    return $this->htmlSanitizer;
  }

  /**
   * {@inheritdoc}
   */
  public function setConfigBuilder(HtmlSanitizerConfigBuilderInterface $configBuilder): void {
    $this->configBuilder = $configBuilder;
  }

  /**
   * {@inheritdoc}
   */
  public function getConfigBuilder(): HtmlSanitizerConfigBuilderInterface {
    // Fall back to the default builder when none was injected.
    if (!$this->configBuilder instanceof HtmlSanitizerConfigBuilderInterface) {
      $this->configBuilder = new HtmlSanitizerConfigBuilder();
    }

    return $this->configBuilder;
  }

  /**
   * {@inheritdoc}
   */
  public function isConfigSet(): bool {
    // Any of the three processing steps being configured counts.
    return $this->isSanitizerConfigSet
      || !empty($this->container)
      || !empty($this->stripRegex);
  }

  /**
   * {@inheritdoc}
   */
  public function getConfig(): HtmlSanitizerConfig {
    return $this->requireSanitizerConfig();
  }

  /**
   * {@inheritdoc}
   */
  public function setConfig(array $config = []): void {
    // The container selectors and strip patterns are consumed by this
    // processor rather than by the sanitizer, so they are pulled out before
    // the remaining options are handed to the config builder.
    if (isset($config['container'])) {
      $this->container = is_string($config['container']) ? [$config['container']] : $config['container'];
      unset($config['container']);
    }

    if (isset($config['strip_regex'])) {
      $this->stripRegex = is_array($config['strip_regex']) ? $config['strip_regex'] : [$config['strip_regex']];
      unset($config['strip_regex']);
    }

    if (!empty($config)) {
      $this->htmlSanitizerConfig = $this->getConfigBuilder()->build($config);
      $this->isSanitizerConfigSet = TRUE;

      // Discard any existing sanitizer so getSanitizer() rebuilds it from the
      // config that was just created. The reset happens only here: a config
      // that carries nothing but 'container' / 'strip_regex' must not throw
      // away a sanitizer for which no replacement config exists.
      $this->htmlSanitizer = NULL;
    }
  }

  /**
   * {@inheritdoc}
   */
  public function processHtml(string $html, array $config = []): string {
    // Apply configuration if provided. setConfig() takes care of invalidating
    // the sanitizer when new sanitizer options were part of the config.
    if (!empty($config)) {
      $this->setConfig($config);
    }

    // Return early if no configuration has been set.
    if (!$this->isConfigSet()) {
      return $html;
    }

    // Extract content from container if specified.
    $content = $html;
    if (!empty($this->container)) {
      try {
        $content = $this->extractFromContainer($html, $this->container);
      }
      catch (\InvalidArgumentException $e) {
        // Log the warning but allow the content processing to continue with
        // the full, unextracted HTML.
        $this->logger->warning('HTML container extraction failed: @message', ['@message' => $e->getMessage()]);
      }
    }

    $content = $this->applyStripRegex($content);

    // Apply sanitization if configured.
    if ($this->isSanitizerConfigSet) {
      $content = $this->sanitizeContent($content);
    }

    return $content;
  }

  /**
   * Removes every match of the configured strip patterns from the content.
   *
   * @param string $content
   *   The HTML content to strip.
   *
   * @return string
   *   The content with all matches removed. A pattern that fails to execute
   *   is logged and skipped, leaving the content as it was for that pattern.
   */
  protected function applyStripRegex(string $content): string {
    foreach ($this->stripRegex as $regex) {
      $stripped = preg_replace($regex, '', $content);

      // preg_replace() returns NULL on failure (e.g. an invalid pattern or an
      // exceeded backtrack limit). Passing that on would raise a TypeError
      // under strict types, so keep the previous content instead.
      if ($stripped === NULL) {
        $this->logger->warning('HTML strip regex failed: @message', ['@message' => preg_last_error_msg()]);
        continue;
      }

      $content = $stripped;
    }

    return $content;
  }

  /**
   * Sanitizes HTML content using the configured sanitizer.
   *
   * @param string $content
   *   The HTML content to sanitize.
   *
   * @return string
   *   The sanitized HTML content. Without containers this is the trimmed
   *   result of a single sanitize() call; with containers it is the
   *   concatenation of one sanitizeFor() result per container selector.
   */
  protected function sanitizeContent(string $content): string {
    $sanitizer = $this->getSanitizer();

    if (empty($this->container)) {
      return trim($sanitizer->sanitize($content));
    }

    // When multiple containers are specified, each is sanitized individually
    // to allow <head> elements to be processed differently (e.g. allowing
    // <meta>) while rest of containers to be processed as <body>. Note that
    // the full $content is handed to the sanitizer once per selector.
    $separator = count($this->container) > 1 ? PHP_EOL : '';
    $sanitized_content = '';
    foreach ($this->container as $selector) {
      $sanitized_content .= $separator . $sanitizer->sanitizeFor($selector, $content);
    }

    return $sanitized_content;
  }

  /**
   * Extracts HTML content from specific containers within the input HTML.
   *
   * @param string $html
   *   The HTML content to extract from.
   * @param array $selector
   *   The CSS selectors identifying the containers, in output order.
   *
   * @return string
   *   The outer HTML of the first match of each selector, concatenated. When
   *   more than one selector is given, every piece is preceded by a newline.
   *
   * @throws \InvalidArgumentException
   *   When a specified container is not found.
   */
  protected function extractFromContainer(string $html, array $selector): string {
    $extracted_html = '';
    $crawler = new Crawler($html);

    // Only add a newline if there are multiple containers. The separator is
    // derived from the selectors actually passed in.
    $separator = count($selector) > 1 ? PHP_EOL : '';

    foreach ($selector as $selector_value) {
      $dom_node = $crawler->filter($selector_value);

      if ($dom_node->count() === 0) {
        $this->logger->warning('HTML container extraction failed: @selector not found in HTML.', [
          '@selector' => $selector_value,
        ]);
      }

      // Append extracted HTML on top of each other if more than one container
      // is specified. Crawler::outerHtml() throws \InvalidArgumentException
      // on an empty node list, which is how a missing container is reported
      // to the caller.
      $extracted_html .= $separator . $dom_node->outerHtml();
    }

    return $extracted_html;
  }

  /**
   * Returns the sanitizer config, failing clearly when none has been built.
   *
   * @return \Symfony\Component\HtmlSanitizer\HtmlSanitizerConfig
   *   The sanitizer configuration created by setConfig().
   *
   * @throws \LogicException
   *   When no sanitizer configuration has been set yet.
   */
  private function requireSanitizerConfig(): HtmlSanitizerConfig {
    // isset() is FALSE for a typed property that was never assigned, and does
    // not trigger the uninitialized-property error a plain read would.
    if (!isset($this->htmlSanitizerConfig)) {
      throw new \LogicException('The HTML sanitizer configuration has not been set. Call setConfig() with sanitizer options or setSanitizer() first.');
    }

    return $this->htmlSanitizerConfig;
  }

}
