<?php

namespace Drupal\ai_llms_txt_generator\Service;

use GuzzleHttp\ClientInterface;
use Drupal\Core\Logger\LoggerChannelFactoryInterface;

/**
 * Service to parse sitemap.xml files.
 *
 * Downloads a sitemap (or a sitemap index and the sitemaps it links to),
 * flattens the result into a list of URL entries, and can render that list as
 * a plain-text summary grouped by priority.
 */
class SitemapParserService {

  /**
   * Maximum nesting depth followed below the sitemap passed by the caller.
   *
   * Guards against arbitrarily long chains of sitemap indexes.
   */
  protected const MAX_DEPTH = 5;

  /**
   * Priority used when an entry has no usable priority value.
   */
  protected const DEFAULT_PRIORITY = 0.5;

  /**
   * Change frequency used when an entry has no changefreq value.
   */
  protected const DEFAULT_CHANGEFREQ = 'monthly';

  /**
   * Maximum number of URLs listed per priority group by formatForAi().
   */
  protected const GROUP_LIMITS = ['high' => 20, 'medium' => 15, 'low' => 10];

  /**
   * The HTTP client.
   *
   * @var \GuzzleHttp\ClientInterface
   */
  protected $httpClient;

  /**
   * The logger.
   *
   * @var \Drupal\Core\Logger\LoggerChannelInterface
   */
  protected $logger;

  /**
   * Constructs a SitemapParserService object.
   *
   * @param \GuzzleHttp\ClientInterface $http_client
   *   The HTTP client service.
   * @param \Drupal\Core\Logger\LoggerChannelFactoryInterface $logger_factory
   *   The logger factory service.
   */
  public function __construct(ClientInterface $http_client, LoggerChannelFactoryInterface $logger_factory) {
    $this->httpClient = $http_client;
    $this->logger = $logger_factory->get('ai_llms_txt_generator');
  }

  /**
   * Parse sitemap and extract URLs with metadata.
   *
   * Sitemap indexes are followed recursively. Each distinct sitemap URL is
   * fetched at most once per call, so self-referencing or mutually
   * referencing indexes cannot cause endless recursion. Failures (HTTP errors,
   * invalid XML) are logged and yield no entries for the affected sitemap
   * instead of throwing.
   *
   * @param string $sitemap_url
   *   The URL of the sitemap.
   *
   * @return array
   *   List of entries, each an array with the keys 'loc' (string),
   *   'priority' (float) and 'changefreq' (string).
   */
  public function parseSitemap(string $sitemap_url): array {
    $visited = [];
    return $this->collectUrls($sitemap_url, $visited, 0);
  }

  /**
   * Fetches one sitemap and recursively collects its URL entries.
   *
   * @param string $sitemap_url
   *   The URL of the sitemap to fetch.
   * @param array $visited
   *   Set of sitemap URLs already processed during this parse, keyed by URL.
   *   Passed by reference so all recursion branches share it.
   * @param int $depth
   *   Nesting depth of this sitemap; 0 for the one requested by the caller.
   *
   * @return array
   *   List of URL entries found in this sitemap and the sitemaps below it.
   */
  protected function collectUrls(string $sitemap_url, array &$visited, int $depth): array {
    // Already handled during this parse: a cycle or a duplicate reference.
    if (isset($visited[$sitemap_url])) {
      return [];
    }
    if ($depth > static::MAX_DEPTH) {
      $this->logger->warning('Skipping sitemap @url: maximum nesting depth exceeded', ['@url' => $sitemap_url]);
      return [];
    }
    $visited[$sitemap_url] = TRUE;

    try {
      $response = $this->httpClient->request('GET', $sitemap_url);
      $xml = $this->loadXml((string) $response->getBody());
      if ($xml === NULL) {
        $this->logger->error('Failed to parse sitemap XML');
        return [];
      }

      $urls = [];

      // Handle standard sitemap. Iterating a missing child is a no-op.
      foreach ($xml->url as $url) {
        // Pretty-printed sitemaps may wrap the value in whitespace.
        $loc = trim((string) $url->loc);
        if ($loc === '') {
          continue;
        }

        $priority = trim((string) $url->priority);
        $changefreq = trim((string) $url->changefreq);

        $urls[] = [
          'loc' => $loc,
          'priority' => is_numeric($priority) ? (float) $priority : static::DEFAULT_PRIORITY,
          'changefreq' => $changefreq !== '' ? $changefreq : static::DEFAULT_CHANGEFREQ,
        ];
      }

      // Handle sitemap index.
      foreach ($xml->sitemap as $sitemap) {
        $sitemap_loc = trim((string) $sitemap->loc);
        if ($sitemap_loc === '') {
          continue;
        }
        foreach ($this->collectUrls($sitemap_loc, $visited, $depth + 1) as $entry) {
          $urls[] = $entry;
        }
      }

      return $urls;
    }
    catch (\Exception $e) {
      $this->logger->error('Error parsing sitemap: @message', ['@message' => $e->getMessage()]);
      return [];
    }
  }

  /**
   * Parses an XML string without emitting PHP warnings for malformed input.
   *
   * @param string $xml_content
   *   The raw XML document.
   *
   * @return \SimpleXMLElement|null
   *   The parsed document, or NULL when the XML could not be parsed.
   */
  protected function loadXml(string $xml_content): ?\SimpleXMLElement {
    // Collect libxml errors internally instead of raising warnings, and
    // restore the caller's setting afterwards.
    $previous = libxml_use_internal_errors(TRUE);
    try {
      // LIBXML_NONET: never let the parser fetch anything over the network.
      $xml = simplexml_load_string($xml_content, \SimpleXMLElement::class, LIBXML_NONET);
    }
    finally {
      libxml_clear_errors();
      libxml_use_internal_errors($previous);
    }

    return $xml === FALSE ? NULL : $xml;
  }

  /**
   * Get formatted sitemap data for AI processing.
   *
   * Entries are grouped as high (priority >= 0.8), medium (>= 0.5 and < 0.8)
   * and low (< 0.5). Each group reports its full size but lists only its
   * first 20, 15 and 10 URLs respectively. Entries without a 'priority' key
   * are treated as priority 0.5.
   *
   * @param array $urls
   *   Array of URLs from sitemap, as returned by parseSitemap().
   *
   * @return string
   *   Formatted string representation of sitemap data.
   */
  public function formatForAi(array $urls): string {
    $groups = ['high' => [], 'medium' => [], 'low' => []];

    // Single pass: drop every entry into exactly one priority bucket.
    foreach ($urls as $url) {
      $url['priority'] = $url['priority'] ?? static::DEFAULT_PRIORITY;
      if ($url['priority'] >= 0.8) {
        $groups['high'][] = $url;
      }
      elseif ($url['priority'] >= 0.5) {
        $groups['medium'][] = $url;
      }
      else {
        $groups['low'][] = $url;
      }
    }

    $output = "Site Structure (from sitemap.xml):\n\n";
    $output .= "Total URLs: " . count($urls) . "\n\n";
    $output .= $this->formatGroup('High', $groups['high'], static::GROUP_LIMITS['high']);
    $output .= "\n" . $this->formatGroup('Medium', $groups['medium'], static::GROUP_LIMITS['medium']);
    $output .= "\n" . $this->formatGroup('Low', $groups['low'], static::GROUP_LIMITS['low']);

    return $output;
  }

  /**
   * Renders one priority group as a heading followed by a bullet list.
   *
   * @param string $label
   *   Human-readable group name used in the heading, e.g. 'High'.
   * @param array $entries
   *   All URL entries belonging to the group.
   * @param int $limit
   *   Maximum number of entries to list; the heading still shows the total.
   *
   * @return string
   *   The rendered section, ending with a newline.
   */
  protected function formatGroup(string $label, array $entries, int $limit): string {
    $section = $label . " Priority URLs (" . count($entries) . "):\n";
    foreach (array_slice($entries, 0, $limit) as $entry) {
      $section .= "- " . $entry['loc'] . " (priority: " . $entry['priority'] . ")\n";
    }
    return $section;
  }

}
