<?php

declare(strict_types=1);

namespace Drupal\typesense_integration\Plugin\search_api\processor;

use Drupal\search_api\Plugin\search_api\processor\HtmlFilter;

/**
 * Safer HTML filter that prevents word concatenation across block boundaries.
 *
 * Inserts spaces when removing adjacent block-level tags so that
 * "foo.</p><p>Bar" becomes "foo. Bar", preserving search accuracy.
 *
 * @SearchApiProcessor(
 *   id = "html_filter_safe",
 *   label = @Translation("HTML filter (safe spacing)"),
 *   description = @Translation("Strips HTML while preserving word boundaries across block elements."),
 *   stages = {
 *     "preprocess_index" = -20
 *   },
 *   locked = false
 * )
 */
class HtmlFilterSafe extends HtmlFilter {

  /**
   * Block-level HTML elements that can cause word concatenation when removed.
   *
   * Names are listed in lowercase; they are matched case-insensitively.
   */
  protected const BLOCK_ELEMENTS = [
    'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'section', 'article', 'header', 'footer', 'nav', 'main',
    'aside', 'blockquote', 'pre', 'ul', 'ol', 'li',
    'table', 'tr', 'td', 'th', 'tbody', 'thead', 'tfoot',
    'dl', 'dt', 'dd', 'figure', 'figcaption', 'hr',
    'form', 'fieldset', 'legend', 'address',
  ];

  /**
   * {@inheritdoc}
   */
  protected function processFieldValue(&$value, $type) {
    // Only string values are filtered; anything else is left untouched.
    if (!is_string($value)) {
      return;
    }

    // Pad block-level tags before anything is stripped. The generic tag
    // padding further down only runs for text types, so this pass is what
    // keeps words apart for every other type. A local variable is used so the
    // by-reference $value is only written once, with the final result.
    $text = $this->addSpacesAroundBlockElements($value);

    // Remove invisible content.
    $text = $this->removeInvisibleHtmlElements($text);
    $is_text_type = $this->getDataTypeHelper()->isTextType($type);
    if ($is_text_type) {
      // Let removed tags still delimit words.
      $text = str_replace(['<', '>'], [' <', '> '], $text);
      $text = $this->handleAttributes($text);
    }
    if ($this->configuration['tags'] && $is_text_type) {
      // Keep only the configured tags so parseHtml() can still see them.
      $text = strip_tags($text, '<' . implode('><', array_keys($this->configuration['tags'])) . '>');
      $value = $this->parseHtml($text);
    }
    else {
      $text = strip_tags($text);
      $value = $this->normalizeText(trim($text));
    }
  }

  /**
   * Adds spaces around block-level elements to prevent word concatenation.
   *
   * A space is inserted before opening block tags, after closing block tags
   * and on both sides of self-closing block tags, but only where no whitespace
   * is already present. The matched markup itself is kept verbatim, so tag
   * attributes survive for later processing and text that merely resembles a
   * tag (for example "a < p and q > b") is never rewritten into a real tag.
   *
   * @param string $html
   *   The HTML content to process.
   *
   * @return string
   *   The HTML with spaces added around block elements. If a regular
   *   expression fails, the text produced by the preceding steps is returned.
   */
  protected function addSpacesAroundBlockElements(string $html): string {
    if ($html === '') {
      return $html;
    }

    // Alternation of all block element names. The names are plain
    // alphanumeric constants, so no escaping is required.
    $names = implode('|', self::BLOCK_ELEMENTS);

    // Pattern => replacement, applied in this order. "$0" re-emits the whole
    // match unchanged; only the surrounding whitespace is added.
    $rules = [
      // Opening tags, with or without attributes: space before.
      '/(?<!\s)<\s*(?:' . $names . ')(?:\s[^>]*)?>/i' => ' $0',
      // Closing tags: space after.
      '/<\s*\/\s*(?:' . $names . ')\s*>(?!\s)/i' => '$0 ',
      // Self-closing tags with no whitespace on either side: space on both
      // sides.
      '/(?<!\s)<\s*(?:' . $names . ')(?:\s[^>]*)?\s*\/\s*>(?!\s)/i' => ' $0 ',
    ];

    foreach ($rules as $pattern => $replacement) {
      // preg_replace() returns NULL on a PCRE error (e.g. backtrack limit);
      // keep the current text rather than propagating NULL.
      $html = preg_replace($pattern, $replacement, $html) ?? $html;
    }

    return $html;
  }

}
