<?php

namespace Drupal\module_manager\Service;

use Drupal\Core\Http\ClientFactory;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;

/**
 * Service to crawl Drupal.org project pages and extract module/project data.
 */
class Crawler {

  /**
   * HTTP client used for every request made by this service.
   *
   * @var \GuzzleHttp\ClientInterface
   */
  protected $client;

  /**
   * Base URL for Drupal.org, without a trailing slash.
   *
   * @var string
   */
  protected $baseUrl = 'https://www.drupal.org';

  /**
   * Constructs a Crawler object.
   *
   * The client is created with a 20 second timeout and a custom User-Agent.
   *
   * @param \Drupal\Core\Http\ClientFactory $httpClientFactory
   *   The HTTP client factory.
   */
  public function __construct(ClientFactory $httpClientFactory) {
    $this->client = $httpClientFactory->fromOptions([
      'timeout' => 20,
      'headers' => [
        'User-Agent' => 'Drupal ModuleManager Crawler',
      ],
    ]);
  }

  /**
   * Fetches modules and pager links from the Drupal.org module listing.
   *
   * @param string $search
   *   The search string for modules.
   * @param string $category
   *   The category filter (sent as the "f[3]" facet value).
   * @param int|string $page
   *   The value of the "page" query parameter. Defaults to 0.
   *
   * @return array
   *   An array with the keys:
   *   - pager_links: list of ['url' => absolute URL, 'text' => link text].
   *   - modules: list of arrays with the keys 'module', 'title',
   *     'title_links', 'description' and 'description_links'.
   *
   * @throws \GuzzleHttp\Exception\GuzzleException
   *   If the HTTP request fails; the exception is not caught here.
   */
  public function crawl($search, $category, $page = 0): array {
    // Only the major version number is sent in the core compatibility facet.
    $majorVersion = explode('.', \Drupal::VERSION)[0];

    // http_build_query() encodes every key and value, so none of the caller
    // supplied values (including $page) can inject extra query parameters.
    $query = [
      'f' => [
        44 => 'im_vid_44:13028',
        46 => '',
        47 => 'sm_core_compatibility:' . $majorVersion,
        3 => (string) $category,
        48 => 'sm_field_project_type:full',
        49 => 'bs_project_release_has_full:true',
        50 => 'sm_field_security_advisory_cover:covered',
      ],
      'text' => (string) $search,
      'solrsort' => 'sort_label asc',
      'op' => 'Search',
      'page' => (string) $page,
    ];
    // The separator is passed explicitly so the "arg_separator.output" ini
    // setting cannot change the generated URL.
    $url = $this->baseUrl . '/project/project_module?' . http_build_query($query, '', '&');

    $response = $this->client->get($url);
    $html = (string) $response->getBody();

    $crawler = new DomCrawler($html);

    $result = [
      'pager_links' => [],
      'modules' => [],
    ];

    // 1) Pager links.
    $crawler->filter('.pager a')->each(function ($node) use (&$result) {
      $href = $node->attr('href');
      // Anchors without a usable href are skipped.
      if ($href !== NULL && $href !== '') {
        $result['pager_links'][] = [
          'url' => $this->absoluteUrl($href),
          'text' => $node->text(),
        ];
      }
    });

    // 2) Modules: one entry per H2 inside the content region.
    $crawler->filter('.region-content .content h2')->each(function ($h2) use (&$result) {
      $item = [
        'module' => '',
        'title' => trim($h2->text()),
        'title_links' => [],
        'description' => '',
        'description_links' => [],
      ];

      // Links in H2. When there are several, the last one decides 'module'.
      $h2->filter('a')->each(function ($a) use (&$item) {
        $href = $a->attr('href');
        if ($href === NULL || $href === '') {
          return;
        }
        $item['title_links'][] = $href;

        // Use only the path component so a query string or fragment on the
        // link does not end up in the machine name.
        $path = parse_url($href, PHP_URL_PATH);
        $segments = explode('/', trim(is_string($path) ? $path : $href, '/'));
        $item['module'] = end($segments);
      });

      // First <p> found among the elements following the H2.
      $next = $h2->nextAll()->filter('p')->first();
      if ($next->count()) {
        $item['description'] = trim($next->text());

        $next->filter('span a')->each(function ($a) use (&$item) {
          $href = $a->attr('href');
          if ($href !== NULL && $href !== '') {
            $item['description_links'][] = $href;
          }
        });
      }

      $result['modules'][] = $item;
    });

    return $result;
  }

  /**
   * Crawl a project page and extract info list and latest version.
   *
   * @param string $url
   *   The project page URL.
   *
   * @return array
   *   - info: array of texts inside .project-info (ul li)
   *   - versions: list of found versions
   *   - latest_version: first version in the list, or NULL if none was found
   *   - content_html: HTML content of the project body
   *   - title: project title
   *   - nid: node ID, or NULL if it could not be detected
   *
   * @throws \GuzzleHttp\Exception\GuzzleException
   *   If the HTTP request fails; the exception is not caught here.
   */
  public function crawlProject(string $url): array {
    $response = $this->client->get($url);
    $html = (string) $response->getBody();

    $crawler = new DomCrawler($html);

    $result = [
      'info' => [],
      'versions' => [],
      'latest_version' => NULL,
      'content_html' => '',
      'title' => '',
      'nid' => NULL,
    ];

    // 1) Page title (#page-title).
    $titleNode = $crawler->filter('#page-title');
    if ($titleNode->count()) {
      $result['title'] = trim($titleNode->text());
    }

    // 2) .region-content .project-info list items; empty ones are skipped.
    $crawler->filter('.region-content .project-info li')->each(function ($li) use (&$result) {
      $text = trim($li->text());
      if ($text !== '') {
        $result['info'][] = $text;
      }
    });

    // 3) Release versions, in document order.
    $crawler->filter('.views-field-field-release-version .field-content a')->each(function ($node) use (&$result) {
      $version = trim($node->text());
      if ($version !== '') {
        $result['versions'][] = $version;
      }
    });

    // The first is usually the most recent.
    if (!empty($result['versions'])) {
      $result['latest_version'] = $result['versions'][0];
    }

    // 4) Serialized child elements of the first
    // .region-content .content .field-name-body node.
    $contentNode = $crawler->filter('.region-content .content .field-name-body');
    if ($contentNode->count()) {
      $innerHtml = '';
      foreach ($contentNode->first()->children() as $child) {
        $innerHtml .= $child->ownerDocument->saveHTML($child);
      }
      $result['content_html'] = trim($innerHtml);
    }

    // 5) Find NID via the first "page-node-XXXX" occurrence in the raw HTML.
    if (preg_match('/page-node-(\d+)/', $html, $matches)) {
      $result['nid'] = (int) $matches[1];
    }

    return $result;
  }

  /**
   * Gets the ftp.drupal.org download link for a project release.
   *
   * Despite the name, the returned link uses http(s); only the host is
   * ftp.drupal.org. Only ".zip" archives are matched.
   *
   * @param string $project
   *   Project machine name.
   * @param string $version
   *   Project version.
   *
   * @return string|null
   *   The first matching .zip link on the release page, or NULL if the page
   *   could not be fetched or contains no such link.
   */
  public function getReleaseFtpLink(string $project, string $version): ?string {
    // Build the release URL. Both segments are encoded so unexpected
    // characters cannot alter the path or add a query string.
    $url = $this->baseUrl . '/project/' . rawurlencode($project) . '/releases/' . rawurlencode($version);

    try {
      $response = $this->client->get($url);
    }
    catch (\Exception $e) {
      // Best effort: a failed request is reported as "not found".
      return NULL;
    }

    $html = (string) $response->getBody();

    // Example of a matching link:
    // https://ftp.drupal.org/files/projects/taxonomy_overview-2.1.1.zip.
    if (preg_match('/https?:\/\/ftp\.drupal\.org\/[^\s"\']+\.zip/i', $html, $matches)) {
      return $matches[0];
    }

    return NULL;
  }

  /**
   * Ensures URLs are absolute.
   *
   * Relative URLs are resolved against the site root of the base URL, not
   * against the page they were found on.
   *
   * @param string $url
   *   The URL to check.
   *
   * @return string
   *   The absolute URL.
   */
  protected function absoluteUrl(string $url): string {
    // Already absolute. The "://" is required so that a relative path which
    // merely starts with "http" is not mistaken for an absolute URL.
    if (preg_match('#^https?://#i', $url)) {
      return $url;
    }

    // Protocol-relative URL ("//host/path"): only the scheme is missing.
    if (str_starts_with($url, '//')) {
      $scheme = parse_url($this->baseUrl, PHP_URL_SCHEME) ?: 'https';
      return $scheme . ':' . $url;
    }

    if ($url === '') {
      return $this->baseUrl;
    }

    // Guarantee exactly one slash between the base URL and the path.
    return $this->baseUrl . '/' . ltrim($url, '/');
  }

}
