<?php

declare(strict_types=1);

namespace Drupal\Tests\tmgmt_laratranslate\Unit\Service;

use Drupal\Tests\UnitTestCase;
use Drupal\tmgmt_laratranslate\Service\RecursiveCharacterTextSplitter;
use Drupal\tmgmt_laratranslate\Service\TextSplitterValidator;
use Psr\Log\LoggerInterface;

/**
 * Validation tests for RecursiveCharacterTextSplitter service.
 *
 * Each splitter test feeds generated HTML to the splitter configured through
 * forLanguage('html', ...), joins the resulting chunks back together and
 * checks the joined output. Apart from the first test (which compares the
 * reassembled output with the input) and the integrity test (which checks
 * every generated paragraph), the tests only verify that representative
 * markup occurs at least once in the joined output.
 *
 * @coversDefaultClass \Drupal\tmgmt_laratranslate\Service\RecursiveCharacterTextSplitter
 * @group tmgmt_laratranslate
 */
class RecursiveCharacterTextSplitterValidationTest extends UnitTestCase {

  /**
   * The text splitter service.
   *
   * @var \Drupal\tmgmt_laratranslate\Service\RecursiveCharacterTextSplitter
   */
  private RecursiveCharacterTextSplitter $splitter;

  /**
   * The validator service.
   *
   * @var \Drupal\tmgmt_laratranslate\Service\TextSplitterValidator
   */
  private TextSplitterValidator $validator;

  /**
   * {@inheritdoc}
   */
  protected function setUp(): void {
    parent::setUp();

    // Both services share one mocked logger; no log expectations are set.
    $logger = $this->createMock(LoggerInterface::class);
    $this->splitter = new RecursiveCharacterTextSplitter($logger);
    $this->validator = new TextSplitterValidator($logger);
  }

  /**
   * Splits HTML with the 'html' preset and asserts that chunking occurred.
   *
   * The splitter is configured with the given chunk size and no overlap, so
   * that joining the chunks does not repeat content.
   *
   * @param string $html
   *   The markup to split.
   * @param int $chunkSize
   *   The value passed to the splitter as 'chunk_size'.
   *
   * @return string[]
   *   The chunks returned by splitText(); always more than one.
   */
  private function splitHtmlIntoChunks(string $html, int $chunkSize): array {
    $chunks = $this->splitter
      ->forLanguage('html', [
        'chunk_size' => $chunkSize,
        'chunk_overlap' => 0,
      ])
      ->splitText($html);

    self::assertGreaterThan(1, count($chunks));

    return $chunks;
  }

  /**
   * Asserts that every given fragment occurs in the haystack.
   *
   * This is a presence check only: it does not verify how many times a
   * fragment occurs.
   *
   * @param string[] $fragments
   *   The substrings that must be present.
   * @param string $haystack
   *   The text to search.
   */
  private function assertAllFragmentsPresent(array $fragments, string $haystack): void {
    foreach ($fragments as $fragment) {
      self::assertStringContainsString($fragment, $haystack);
    }
  }

  /**
   * Tests complex HTML chunking and reassembly.
   *
   * Simulates real-world Drupal content with nested HTML, formatting,
   * links, and lists that exceeds the 10k character limit.
   *
   * NOTE: RecursiveCharacterTextSplitter splits by text patterns and does NOT
   * guarantee valid HTML structure in individual chunks. It's designed for
   * splitting text/code, not HTML-aware parsing. For translation workflows,
   * chunks are translated individually and reassembled.
   *
   * @covers ::splitText
   * @covers ::configure
   * @covers ::forLanguage
   * @covers ::reassembleChunks
   * @covers ::getTextLength
   */
  public function testComplexHtmlChunkingAndValidation(): void {
    // Build complex HTML content exceeding 10k chars.
    $html = '';
    for ($i = 0; $i < 100; $i++) {
      $html .= sprintf(
        '<div class="content-section">
          <h2>Section %d: Important Information</h2>
          <p>This is a <strong>complex paragraph</strong> with <em>various formatting</em> options.
          It contains <a href="/node/%d">internal links</a> and <a href="https://example.com/page-%d">external links</a>.</p>
          <ul class="list-items">
            <li>First item with <code>inline code</code> and <span class="highlight">highlighted text</span>.</li>
            <li>Second item with more content to increase the character count significantly.</li>
            <li>Third item with a <strong>nested <em>emphasis</em></strong> element.</li>
          </ul>
          <blockquote>
            <p>This is a quoted passage that provides additional context and information.
            It helps demonstrate how Drupal content often contains nested block-level elements.</p>
          </blockquote>
        </div>',
        $i,
        $i,
        $i
      );
    }

    // The input has to be larger than the 10k limit for the test to be
    // meaningful.
    $totalLength = $this->splitter->getTextLength($html);
    self::assertGreaterThan(10000, $totalLength);

    // The configured splitter is kept because it is also used to reassemble
    // the chunks below.
    $chunkSize = 9900;
    $htmlSplitter = $this->splitter->forLanguage('html', [
      'chunk_size' => $chunkSize,
      'chunk_overlap' => 0,
    ]);

    $chunks = $htmlSplitter->splitText($html);
    self::assertGreaterThan(1, count($chunks));

    // Every chunk has to respect the configured size limit.
    foreach ($chunks as $index => $chunk) {
      $chunkLength = $this->splitter->getTextLength($chunk);
      self::assertLessThanOrEqual($chunkSize, $chunkLength, "Chunk $index exceeds size limit: $chunkLength");
    }

    // Reassemble chunks (simple join for HTML).
    $reassembled = $htmlSplitter->reassembleChunks($chunks);

    // Key HTML elements are present in the reassembled content.
    $this->assertAllFragmentsPresent([
      '<h2>Section',
      '<strong>',
      '<ul class="list-items">',
      '<blockquote>',
      'href=',
    ], $reassembled);

    // Length and content are compared strictly against the input.
    $reassembledLength = $this->splitter->getTextLength($reassembled);
    self::assertSame($totalLength, $reassembledLength, 'Total length should be preserved');
    self::assertSame($html, $reassembled, 'Reassembled HTML should match original');
  }

  /**
   * Tests HTML chunking with nested elements.
   *
   * @covers ::splitText
   * @covers ::forLanguage
   */
  public function testNestedHtmlChunking(): void {
    $html = str_repeat(
      '<div class="outer"><p>Text with <strong>bold and <em>italic</em></strong> formatting.</p></div>',
      400
    );

    $reassembled = implode('', $this->splitHtmlIntoChunks($html, 5000));

    // Nested tags are present after joining the chunks.
    $this->assertAllFragmentsPresent(['<strong>', '<em>'], $reassembled);
  }

  /**
   * Tests HTML chunking with links.
   *
   * @covers ::splitText
   * @covers ::forLanguage
   */
  public function testHtmlWithLinksChunking(): void {
    $html = str_repeat(
      '<p>Text with <a href="/node/123">internal link</a> and <a href="https://example.com">external link</a>.</p>',
      400
    );

    $reassembled = implode('', $this->splitHtmlIntoChunks($html, 5000));

    // Both the internal and the external link target are present.
    $this->assertAllFragmentsPresent([
      'href="/node/123"',
      'href="https://example.com"',
    ], $reassembled);
  }

  /**
   * Tests HTML chunking with block elements and attributes.
   *
   * @covers ::splitText
   * @covers ::forLanguage
   */
  public function testHtmlBlockElementsWithAttributes(): void {
    $html = str_repeat(
      '<div class="callout"><p class="important">Important message here.</p></div>',
      300
    );

    $reassembled = implode('', $this->splitHtmlIntoChunks($html, 5000));

    // Class attributes are present after joining the chunks.
    $this->assertAllFragmentsPresent([
      'class="callout"',
      'class="important"',
    ], $reassembled);
  }

  /**
   * Tests HTML chunking with lists.
   *
   * @covers ::splitText
   * @covers ::forLanguage
   */
  public function testHtmlListsChunking(): void {
    $html = '<ul>' . str_repeat('<li>List item with content that could be quite long.</li>', 500) . '</ul>';

    $reassembled = implode('', $this->splitHtmlIntoChunks($html, 5000));

    // Opening and closing list tags are present.
    $this->assertAllFragmentsPresent(['<ul>', '<li>', '</li>', '</ul>'], $reassembled);
  }

  /**
   * Tests HTML chunking with tables.
   *
   * @covers ::splitText
   * @covers ::forLanguage
   */
  public function testHtmlTablesChunking(): void {
    $html = '<table><tbody>' . str_repeat('<tr><td>Cell content with data.</td><td>More data.</td></tr>', 300) . '</tbody></table>';

    $reassembled = implode('', $this->splitHtmlIntoChunks($html, 5000));

    // Table structure tags are present.
    $this->assertAllFragmentsPresent(['<table>', '<tbody>', '<tr>', '<td>'], $reassembled);
  }

  /**
   * Tests HTML chunking with headings and paragraphs.
   *
   * @covers ::splitText
   * @covers ::forLanguage
   */
  public function testHtmlHeadingsAndParagraphs(): void {
    $sections = [];
    for ($i = 1; $i <= 50; $i++) {
      $sections[] = sprintf(
        '<h2>Heading %d</h2>
        <p>This is the first paragraph under heading %d with some content.</p>
        <p>This is the second paragraph with <strong>bold text</strong> and <em>italic text</em>.</p>
        <h3>Subheading %d.1</h3>
        <p>Content under the subheading with more text to increase character count.</p>',
        $i,
        $i,
        $i
      );
    }

    $html = implode('', $sections);

    $reassembled = implode('', $this->splitHtmlIntoChunks($html, 5000));

    // Both heading levels are present.
    $this->assertAllFragmentsPresent(['<h2>Heading', '<h3>Subheading'], $reassembled);
  }

  /**
   * Tests that validator handles empty HTML.
   *
   * @covers \Drupal\tmgmt_laratranslate\Service\TextSplitterValidator::validateHtml
   */
  public function testValidatorHandlesEmptyHtml(): void {
    self::assertTrue($this->validator->validateHtml(''));
  }

  /**
   * Tests that validator accepts valid HTML.
   *
   * @covers \Drupal\tmgmt_laratranslate\Service\TextSplitterValidator::validateHtml
   */
  public function testValidatorAcceptsValidHtml(): void {
    $html = '<div><p>Hello <strong>world</strong>!</p></div>';
    self::assertTrue($this->validator->validateHtml($html));
  }

  /**
   * Tests HTML chunking with special characters and entities.
   *
   * @covers ::splitText
   * @covers ::forLanguage
   */
  public function testHtmlWithSpecialCharactersAndEntities(): void {
    $html = str_repeat(
      '<p>Text with &nbsp; entities &amp; special chars: < > " \' and unicode: こんにちは世界</p>',
      200
    );

    $reassembled = implode('', $this->splitHtmlIntoChunks($html, 5000));

    // Entities and the multibyte text are present.
    $this->assertAllFragmentsPresent(['&nbsp;', '&amp;', 'こんにちは世界'], $reassembled);
  }

  /**
   * Tests HTML chunking with mixed block and inline elements.
   *
   * @covers ::splitText
   * @covers ::forLanguage
   */
  public function testHtmlMixedElements(): void {
    $html = str_repeat(
      '<article>
        <header><h1>Article Title</h1></header>
        <section>
          <p>First paragraph with <span class="highlight">highlighted</span> text.</p>
          <blockquote>A quoted passage with <cite>citation</cite>.</blockquote>
          <p>Second paragraph with <code>inline code</code> and <a href="/link">a link</a>.</p>
        </section>
        <footer><p>Footer content</p></footer>
      </article>',
      30
    );

    $reassembled = implode('', $this->splitHtmlIntoChunks($html, 5000));

    // Block-level and inline elements are present.
    $this->assertAllFragmentsPresent([
      '<article>',
      '<header>',
      '<section>',
      '<blockquote>',
      '<cite>',
      '<code>',
      '<footer>',
    ], $reassembled);
  }

  /**
   * Tests that chunks maintain approximately correct sizes.
   *
   * The splitter is configured with a chunk size of 3000, but chunks of up
   * to 3500 are accepted here; no chunk may be empty.
   *
   * @covers ::splitText
   * @covers ::configure
   * @covers ::forLanguage
   * @covers ::getTextLength
   */
  public function testChunkSizesAreWithinLimits(): void {
    $html = str_repeat('<p>This is a paragraph with some text content.</p>', 1000);

    $chunks = $this->splitHtmlIntoChunks($html, 3000);

    // Verify each chunk is within reasonable bounds.
    foreach ($chunks as $index => $chunk) {
      $length = $this->splitter->getTextLength($chunk);
      // Allow some margin for HTML tags.
      self::assertLessThanOrEqual(3500, $length, "Chunk $index exceeds expected size");
      self::assertGreaterThan(0, $length, "Chunk $index should not be empty");
    }
  }

  /**
   * Tests HTML chunking preserves content integrity.
   *
   * Every generated paragraph carries a unique id and text, so a paragraph
   * that is missing from the joined chunks is detected individually.
   *
   * @covers ::splitText
   * @covers ::forLanguage
   */
  public function testHtmlContentIntegrityPreserved(): void {
    $originalHtml = '';
    for ($i = 1; $i <= 50; $i++) {
      $originalHtml .= "<p id=\"para-$i\">Paragraph $i with unique content for testing.</p>";
    }

    $reassembled = implode('', $this->splitHtmlIntoChunks($originalHtml, 500));

    // The closing quote after the id and the space after the number keep
    // "para-1" from matching "para-10" and similar prefixes.
    for ($i = 1; $i <= 50; $i++) {
      self::assertStringContainsString("id=\"para-$i\"", $reassembled, "Paragraph $i should be preserved");
      self::assertStringContainsString("Paragraph $i with unique content", $reassembled);
    }
  }

}
