<?php

namespace Drupal\doc_to_html\Services;

use Drupal\Core\Config\ConfigFactoryInterface;
use Drupal\Core\File\FileSystemInterface;
use Psr\Log\LoggerInterface;

/**
 * Handles extraction and cleanup of HTML markup generated by LibreOffice.
 */
class MarkupService implements MarkupServiceInterface
{

  /**
   * Name of the configuration object holding this module's basic settings.
   */
  private const SETTINGS = 'doc_to_html.basic_settings';

  /**
   * Candidate source encodings tried when the markup is not valid UTF-8.
   *
   * Both are single-byte encodings; ISO-8859-1 is listed last as the
   * catch-all candidate.
   */
  private const LEGACY_ENCODINGS = ['Windows-1252', 'ISO-8859-1'];

  public function __construct(
    protected ConfigFactoryInterface $configFactory,
    protected FileServiceInterface   $fileService,
    protected FileSystemInterface    $fileSystem,
    protected LoggerInterface        $logger,
  )
  {
  }

  /**
   * {@inheritdoc}
   */
  public function parseConvertedHtml(string $sourceUri, bool $applyRegex = FALSE, ?string $overrideBodyRegex = NULL, ?int $overrideBodyMatchIndex = NULL): ?string
  {
    $htmlUri = $this->fileService->convertUriToHtml($sourceUri);
    $realPath = $this->fileService->realPath($htmlUri);

    if (!$realPath || !is_file($realPath)) {
      $this->logger->error('Converted HTML file not found: @file', ['@file' => $htmlUri]);
      return NULL;
    }

    $html = file_get_contents($realPath);
    if ($html === FALSE) {
      $this->logger->error('Unable to read converted HTML file: @file', ['@file' => $htmlUri]);
      return NULL;
    }

    // Normalise encoding first, then extract the body using the override
    // arguments or the stored settings.
    $html = $this->ensureUtf8($html);
    $html = $this->extractBody($html, $overrideBodyRegex, $overrideBodyMatchIndex);

    if ($applyRegex) {
      // Strip whatever the configured regex matches in the extracted markup.
      $html = $this->applyRegexFilter($html);
    }

    return $html;
  }

  /**
   * Converts the markup to UTF-8 when the 'utf_8_encode' setting is enabled.
   *
   * @param string $html
   *   Raw markup as read from the converted file.
   *
   * @return string
   *   The markup unchanged when the setting is off, when it is already valid
   *   UTF-8, or when conversion fails; otherwise the converted markup.
   */
  private function ensureUtf8(string $html): string
  {
    $config = $this->configFactory->get(self::SETTINGS);

    if (!(bool) $config->get('utf_8_encode')) {
      return $html;
    }

    if (mb_check_encoding($html, 'UTF-8')) {
      return $html;
    }

    // The input is known not to be UTF-8 at this point, so the 'auto'
    // detection list (driven by the mbstring.language ini setting) is not a
    // reliable source encoding. Detect against an explicit list instead.
    $sourceEncoding = mb_detect_encoding($html, self::LEGACY_ENCODINGS, TRUE) ?: 'ISO-8859-1';
    $converted = mb_convert_encoding($html, 'UTF-8', $sourceEncoding);

    // mb_convert_encoding() may return FALSE; without this guard the string
    // return type would coerce that into an empty document.
    if (!is_string($converted)) {
      $this->logger->warning('Unable to convert markup from @encoding to UTF-8.', ['@encoding' => $sourceEncoding]);
      return $html;
    }

    return $converted;
  }

  /**
   * Extracts the part of the markup matched by the body regex.
   *
   * @param string $html
   *   Markup to search.
   * @param string|null $overrideBodyRegex
   *   Pattern used instead of the 'regex_to_parse_body' setting when it is a
   *   non-empty value.
   * @param int|null $overrideBodyMatchIndex
   *   Match group to return instead of the 'regex_body_match_index' setting.
   *   Negative values are treated as 0.
   *
   * @return string
   *   The trimmed match group (group 0 when the requested group does not
   *   exist), or the input unchanged when no regex is available, the regex
   *   does not match, or the regex fails.
   */
  private function extractBody(string $html, ?string $overrideBodyRegex = NULL, ?int $overrideBodyMatchIndex = NULL): string
  {
    $config = $this->configFactory->get(self::SETTINGS);

    $regex = $overrideBodyRegex ?: $config->get('regex_to_parse_body');
    if (!$regex || !is_string($regex)) {
      return $html;
    }

    $matchIndex = $overrideBodyMatchIndex;
    if ($matchIndex === NULL) {
      $stored = $config->get('regex_body_match_index');
      $matchIndex = is_numeric($stored) ? (int) $stored : 0;
    }
    $matchIndex = max(0, $matchIndex);

    $matched = preg_match($regex, $html, $matches);
    if ($matched === FALSE) {
      // The pattern is user-supplied: report the failure instead of hiding
      // it, and fall back to the unmodified markup.
      $this->logger->warning('Body extraction regex failed (@error): @regex', [
        '@error' => preg_last_error_msg(),
        '@regex' => $regex,
      ]);
      return $html;
    }
    if ($matched === 0) {
      return $html;
    }

    // Fall back to the full match when the requested group does not exist.
    return trim((string) ($matches[$matchIndex] ?? $matches[0]));
  }

  /**
   * Removes every match of the configured regex from the markup.
   *
   * NOTE(review): this reads the same 'regex_to_parse_body' setting that
   * extractBody() uses, and ignores any override regex passed to
   * parseConvertedHtml(). If a dedicated cleanup pattern was intended, the
   * configuration key needs to be changed here - confirm against the
   * settings form/schema.
   *
   * @param string $html
   *   Markup to filter.
   *
   * @return string
   *   The filtered markup, or the input unchanged when no regex is configured
   *   or the regex fails.
   */
  private function applyRegexFilter(string $html): string
  {
    $config = $this->configFactory->get(self::SETTINGS);
    $regex = $config->get('regex_to_parse_body');

    if (!$regex || !is_string($regex)) {
      return $html;
    }

    $filtered = preg_replace($regex, '', $html);
    if ($filtered === NULL) {
      // preg_replace() returns NULL on error; keep the markup and report it.
      $this->logger->warning('Cleanup regex failed (@error): @regex', [
        '@error' => preg_last_error_msg(),
        '@regex' => $regex,
      ]);
      return $html;
    }

    return $filtered;
  }

}
