<?php

namespace Drupal\taxonomy_overview;

use Wamania\Snowball\StemmerFactory;

/**
 * Provides utilities for normalizing and grouping taxonomy terms.
 *
 * This class uses stemming and string similarity to normalize terms
 * and cluster them into groups of similar words. It helps in detecting
 * and organizing duplicate or near-duplicate taxonomy terms.
 */
class TagsOverviewTermNormalizer {

  /**
   * Largest edit distance at which two normalized terms share a group.
   *
   * Resolved with late static binding so a subclass can tune the threshold.
   */
  protected const MAX_DISTANCE = 2;

  /**
   * The stemmer instance used to reduce words to their root form.
   *
   * @var \Wamania\Snowball\Stemmer
   */
  protected $stemmer;

  /**
   * Constructs a TagsOverviewTermNormalizer object.
   *
   * @param string $language
   *   The language code passed to StemmerFactory::create() to build the
   *   stemmer (default: 'en').
   */
  public function __construct($language = 'en') {
    $this->stemmer = StemmerFactory::create($language);
  }

  /**
   * Normalizes a term into a canonical string.
   *
   * - Converts the term to lowercase (multibyte-aware, UTF-8).
   * - Splits it on whitespace, discarding empty tokens so surrounding or
   *   repeated whitespace never leaks into the result.
   * - Applies stemming to reduce each token to its root form.
   * - Sorts the stems as plain strings to make word order irrelevant.
   * - Joins them back into a single space-separated string.
   *
   * @param string $term
   *   The original taxonomy term label. Cast to string before processing.
   *
   * @return string
   *   The normalized representation of the term; an empty string when the
   *   label contains no non-whitespace characters.
   */
  public function normalize($term) {
    $lowered = mb_strtolower((string) $term, 'UTF-8');

    $tokens = preg_split('/\s+/u', $lowered, -1, PREG_SPLIT_NO_EMPTY);
    if ($tokens === FALSE) {
      // The UTF-8 aware split failed (e.g. malformed input); fall back to a
      // byte-wise split instead of handing FALSE to array_map().
      $tokens = preg_split('/\s+/', $lowered, -1, PREG_SPLIT_NO_EMPTY) ?: [];
    }

    $stems = array_map(fn($token) => $this->stemmer->stem($token), $tokens);

    // SORT_STRING: the default SORT_REGULAR compares numeric-looking stems as
    // numbers ("10" equals "1e1"), which would leave the result dependent on
    // the original word order.
    sort($stems, SORT_STRING);

    return implode(' ', $stems);
  }

  /**
   * Groups a set of taxonomy terms by similarity.
   *
   * Each term is normalized and then added to the first existing group whose
   * key is within static::MAX_DISTANCE edits (Levenshtein) of the normalized
   * form. If no group matches, a new group keyed by that normalized form is
   * created. Matching is greedy, so the result depends on the order of
   * $terms.
   *
   * @param array $terms
   *   An associative array of taxonomy term labels, keyed by term ID (tid).
   *
   * @return array
   *   A nested array keyed by the normalized form of the first term placed in
   *   each group. Each value is an array of original labels keyed by term ID.
   *   Note that PHP stores numeric-looking keys (e.g. "123") as integers.
   */
  public function groupSimilarTerms(array $terms) {
    $groups = [];

    foreach ($terms as $tid => $original) {
      $base = $this->normalize($original);
      $target = NULL;

      // Iterate over keys only: no by-reference loop variable, so no dangling
      // reference is left pointing into $groups.
      foreach (array_keys($groups) as $groupKey) {
        // Array keys that look numeric come back as integers; compare as
        // strings.
        if ($this->isWithinDistance((string) $groupKey, $base)) {
          $target = $groupKey;
          break;
        }
      }

      if ($target === NULL) {
        $target = $base;
      }
      $groups[$target][$tid] = $original;
    }

    return $groups;
  }

  /**
   * Checks whether two normalized strings are close enough to be grouped.
   *
   * The comparison is byte-based, as levenshtein() is; a differing multibyte
   * character can therefore count as more than one edit.
   *
   * @param string $a
   *   First normalized string.
   * @param string $b
   *   Second normalized string.
   *
   * @return bool
   *   TRUE when the edit distance is between 0 and static::MAX_DISTANCE.
   */
  protected function isWithinDistance($a, $b) {
    $limit = static::MAX_DISTANCE;

    // The edit distance can never be smaller than the length difference, so
    // clearly different strings are rejected without running levenshtein().
    if (abs(strlen($a) - strlen($b)) > $limit) {
      return FALSE;
    }

    $distance = levenshtein($a, $b);

    // Before PHP 8.0 levenshtein() returns -1 for arguments longer than 255
    // bytes; that sentinel must not be mistaken for a close match.
    return $distance >= 0 && $distance <= $limit;
  }

}
