<?php

namespace Drupal\sanitize_placeholder\Commands;

use Consolidation\AnnotatedCommand\CommandData;
use Drupal\Component\Datetime\TimeInterface;
use Drupal\Core\Cache\CacheTagsInvalidatorInterface;
use Drupal\Core\Config\ConfigFactoryInterface;
use Drupal\Core\Entity\EntityFieldManagerInterface;
use Drupal\Core\Entity\EntityInterface;
use Drupal\Core\Entity\EntityStorageInterface;
use Drupal\Core\Entity\EntityTypeInterface;
use Drupal\Core\Entity\EntityTypeManagerInterface;
use Drupal\Core\Entity\FieldableEntityInterface;
use Drupal\Core\Field\FieldDefinitionInterface;
use Drupal\sanitize_placeholder\Service\ThematicFaker;
use Drupal\sanitize_placeholder\Strategy\StrategyManager;
use Drush\Attributes as CLI;
use Drush\Commands\DrushCommands;
use Psr\Log\LoggerInterface;
use function drupal_register_shutdown_function;

/**
 * Drush commands for sanitize_placeholder.
 */
final class SanitizePlaceholderCommands extends DrushCommands {

  /**
   * Pre-sanitize fingerprints per rule key "entity.bundle.field".
   *
   * Filled by preSqlSanitize() before sql:sanitize runs and read by the
   * shutdown callback registered in postSqlSanitize() to decide which fields
   * changed. Reset to an empty array when the pre-sanitize snapshot fails, in
   * which case no field is reported as changed.
   *
   * @var array<string, array{count:int, empties:int, samples:array<int,string>}>
   */
  private array $preSanitizeFingerprints = [];

  /**
   * Constructs the command service.
   *
   * All dependencies are stored as read-only promoted properties.
   *
   * @param \Drupal\Core\Entity\EntityTypeManagerInterface $etm
   *   Entity type manager; provides entity storages and type definitions.
   * @param \Drupal\Core\Entity\EntityFieldManagerInterface $efm
   *   Entity field manager; provides per-bundle field definitions.
   * @param \Drupal\Core\Config\ConfigFactoryInterface $configFactory
   *   Config factory; used to read sanitize_placeholder.settings.
   * @param \Drupal\sanitize_placeholder\Service\ThematicFaker $faker
   *   Thematic Faker wrapper; seeded at the start of every fake() run.
   * @param \Drupal\sanitize_placeholder\Strategy\StrategyManager $strategyManager
   *   Strategy manager; resolves a rule's strategy ID to a value generator.
   * @param \Drupal\Core\Cache\CacheTagsInvalidatorInterface $cacheInvalidator
   *   Cache tag invalidator used to clear UI/render caches after replacements.
   * @param \Drupal\Component\Datetime\TimeInterface $time
   *   Time service; the request time is used as the post-sanitize seed.
   */
  public function __construct(
    private readonly EntityTypeManagerInterface $etm,
    private readonly EntityFieldManagerInterface $efm,
    private readonly ConfigFactoryInterface $configFactory,
    private readonly ThematicFaker $faker,
    private readonly StrategyManager $strategyManager,
    private readonly CacheTagsInvalidatorInterface $cacheInvalidator,
    private readonly TimeInterface $time,
  ) {
    // Let DrushCommands perform its own initialization.
    parent::__construct();
  }

  /**
   * Apply configured fake data rules to entities.
   *
   * Reads the "replacements" rules from sanitize_placeholder.settings, narrows
   * them with the entity/bundle/field options and, for every remaining rule,
   * loads up to "limit" entities, generates a value with the rule's strategy,
   * truncates it to the applicable maximum length and saves the entity when
   * the current field value matches the requested scope.
   *
   * Rules without an entity type or field name are skipped with a warning.
   * When a rule names a bundle, the field definition is resolved once for
   * that bundle and the rule is skipped if the field does not exist. When a
   * rule has no bundle, the definition is resolved per entity using the
   * entity's own bundle (cached per bundle for the duration of the rule).
   *
   * @param array $options
   *   Command options: entity, bundle, field, scope, limit, seed.
   *
   * @throws \Drupal\Component\Plugin\Exception\InvalidPluginDefinitionException
   *   If an entity storage cannot be loaded.
   * @throws \Drupal\Component\Plugin\Exception\PluginNotFoundException
   *   If an entity type is unknown.
   * @throws \Drupal\Core\Entity\EntityStorageException
   *   If an entity cannot be saved.
   */
  #[CLI\Command(name: 'sp:fake', aliases: ['sp:fake-fields'])]
  public function fake(
    array $options = [
      'entity' => '',
      'bundle' => '',
      'field' => '',
      'scope' => 'sanitized',
      'limit' => 100,
      'seed' => NULL,
    ],
  ): void {
    // Callers (e.g. the post-sanitize hook) may pass a partial options array.
    $options += [
      'entity' => '',
      'bundle' => '',
      'field' => '',
      'scope' => 'sanitized',
      'limit' => 100,
      'seed' => NULL,
    ];

    $seed = $options['seed'] !== NULL ? (int) $options['seed'] : NULL;
    $this->faker->seed($seed);

    $rules = $this->configFactory->get('sanitize_placeholder.settings')->get('replacements') ?? [];
    $rules = is_array($rules) ? $rules : [];
    $rules = $this->filterRules(
      $rules,
      (string) $options['entity'],
      (string) $options['bundle'],
      (string) $options['field'],
    );

    if (empty($rules)) {
      $this->io()->text('No replacement rules configured.');
      return;
    }

    $limit = (int) $options['limit'];
    $scope = (string) $options['scope'];

    foreach ($rules as $rule) {
      $rule = is_array($rule) ? $rule : [];
      $entityTypeId = (string) ($rule['entity'] ?? '');
      $bundle = (string) ($rule['bundle'] ?? '');
      $fieldName = (string) ($rule['field'] ?? '');
      $strategyId = (string) ($rule['strategy'] ?? '');

      // A rule without a target cannot be applied; skip it instead of
      // failing on an empty entity type ID.
      if ($entityTypeId === '' || $fieldName === '') {
        $this->io()->warning('Skipping incomplete replacement rule: "entity" and "field" are required.');
        continue;
      }

      $storage = $this->etm->getStorage($entityTypeId);
      $label = sprintf('%s.%s.%s → %s', $entityTypeId, $bundle, $fieldName, $strategyId);

      // Field definitions resolved so far, keyed by bundle. FALSE marks a
      // bundle on which the field does not exist.
      $fieldDefs = [];
      if ($bundle !== '') {
        // The definition is the same for every entity of this rule, so
        // resolve it once and bail out before loading anything.
        $fieldDefs[$bundle] = $this->getFieldDefinition($entityTypeId, $bundle, $fieldName) ?? FALSE;
        if (!$fieldDefs[$bundle]) {
          $this->io()->warning("Skipping $label: field definition not found.");
          continue;
        }
      }

      $ids = $this->findEntityIds($storage, $entityTypeId, $bundle, $limit);
      $total = count($ids);
      $this->io()->title("$label ($total entities)");

      $updated = 0;
      $strategy = $this->strategyManager->get($strategyId);

      foreach ($ids as $id) {
        /** @var \Drupal\Core\Entity\EntityInterface|null $entity */
        $entity = $storage->load($id);
        if (!$entity) {
          continue;
        }

        // Rules without a bundle apply to every bundle, so look the field up
        // on the bundle this entity actually belongs to.
        $entityBundle = $bundle !== '' ? $bundle : (string) $entity->bundle();
        $fieldDefs[$entityBundle] ??= ($this->getFieldDefinition($entityTypeId, $entityBundle, $fieldName) ?? FALSE);
        $fieldDef = $fieldDefs[$entityBundle];
        if (!$fieldDef) {
          continue;
        }

        if (!$this->matchScope($entity, $fieldName, $scope)) {
          continue;
        }

        $value = $strategy->generate($entity, $fieldDef);

        // Enforce max lengths before saving.
        $value = $this->enforceLength($value, $fieldDef, $entityTypeId, $fieldName, $strategyId);

        $this->writeValue($entity, $fieldName, $value);
        $entity->save();
        $updated++;
      }

      $this->io()->success("Updated $updated entities for {$entityTypeId}.{$bundle}.{$fieldName}.");
      $this->output()->writeln('');
    }
  }

  /**
   * Record sample field values before sql:sanitize modifies the database.
   *
   * The snapshot is stored on the instance so the post-command hook can
   * compare it with the state after sanitizing. A failure is logged and
   * reported, and leaves an empty snapshot behind.
   *
   * @param \Consolidation\AnnotatedCommand\CommandData $commandData
   *   Command metadata.
   *
   * @hook pre-command sql:sanitize
   * @priority 10000
   */
  public function preSqlSanitize(CommandData $commandData): void {
    try {
      // Look at no more than 200 entities per configured field.
      $snapshot = $this->snapshotFieldFingerprints(200);
    }
    catch (\Throwable $failure) {
      $canLog = property_exists($this, 'logger') && $this->logger instanceof LoggerInterface;
      if ($canLog) {
        $context = [
          '@class' => get_class($failure),
          '@message' => $failure->getMessage(),
          'exception' => $failure,
        ];
        $this->logger->error('sanitize_placeholder: pre-snapshot failed: @class - @message', $context);
      }
      $this->io()->error('sanitize_placeholder: could not compute pre-sanitize field snapshot; continuing without diffing.');
      $snapshot = [];
    }

    $this->preSanitizeFingerprints = $snapshot;
  }

  /**
   * Apply replacements after `drush sql:sanitize` finishes.
   *
   * Does nothing when sql:sanitize reported a non-zero result. Otherwise the
   * replacement run is registered as a shutdown function so that it executes
   * after the remaining post-command hooks.
   *
   * @param int|null $result
   *   Result code from the command execution.
   * @param \Consolidation\AnnotatedCommand\CommandData $commandData
   *   Command metadata.
   *
   * @hook post-command sql:sanitize
   * @priority -10000
   */
  public function postSqlSanitize(?int $result, CommandData $commandData): void {
    // Normalize result when Drush provides NULL.
    $resultCode = $result ?? 0;

    // If sql:sanitize failed, do nothing.
    $status = $commandData->annotationData()->get('output') ?? 0;
    if (($resultCode !== 0) || (is_int($status) && $status !== 0)) {
      return;
    }

    $cfg = $this->configFactory->get('sanitize_placeholder.settings');
    $limit = (int) ($cfg->get('post_hook_limit') ?? 5000);

    // Use a changing seed so values get regenerated every sanitize run.
    $seed = (int) $this->time->getRequestTime();

    // Defer to the very end so we run after other post hooks.
    drupal_register_shutdown_function(function () use ($limit, $seed): void {
      $this->applyAfterSanitize($limit, $seed);
    });
  }

  /**
   * Runs the post-sanitize replacement pass; never throws from fake().
   *
   * Takes an "after" snapshot and compares it with the pre-sanitize one. Fields
   * that look changed are re-faked with scope "all"; when nothing looks
   * changed, or the snapshot cannot be computed, a single run with scope
   * "sanitized" over all rules is used instead. UI caches are invalidated in
   * every case; the success message is only printed when the run succeeded.
   *
   * @param int $limit
   *   Maximum number of entities to process per rule.
   * @param int $seed
   *   Faker seed for this run.
   */
  private function applyAfterSanitize(int $limit, int $seed): void {
    $fallbackRun = [
      'entity' => '',
      'bundle' => '',
      'field'  => '',
      'scope'  => 'sanitized',
      'limit'  => $limit,
      'seed'   => $seed,
    ];
    $fallbackLog = 'sanitize_placeholder post-hook fallback failed during fake(): @class - @message';
    $fallbackIo = 'sanitize_placeholder: fallback run failed after sql:sanitize. See logs.';
    $fallbackIoLog = 'sanitize_placeholder: failed to emit IO error message during fallback failure.';

    try {
      $after = $this->snapshotFieldFingerprints(200);
    }
    catch (\Throwable $e) {
      $this->logHookError('sanitize_placeholder: after-snapshot failed: @class - @message', $e);
      $this->emitQuietly(
        fn () => $this->io()->error('sanitize_placeholder: could not compute post-sanitize field snapshot; falling back to scope "sanitized".'),
        'sanitize_placeholder: failed to emit IO error message during after-snapshot failure.',
      );

      // Without a snapshot there is nothing to diff; use the scope heuristic.
      $ok = $this->runFakeGuarded([$fallbackRun], $fallbackLog, $fallbackIo, $fallbackIoLog);
      $this->finishAfterSanitize($ok);
      return;
    }

    $changedTargets = $this->collectChangedTargets($after);

    if ($changedTargets !== []) {
      $this->emitQuietly(fn () => $this->io()->section(
        'sanitize_placeholder: applying replacements after sql:sanitize ' .
        '(mode: changed fields only, seed: ' . $seed . ')'
      ));

      // Run once per changed field with scope=all (table was changed).
      $runs = array_map(
        static fn (array $target): array => $target + ['scope' => 'all', 'limit' => $limit, 'seed' => $seed],
        $changedTargets,
      );
      $ok = $this->runFakeGuarded(
        $runs,
        'sanitize_placeholder post-hook failed during fake(): @class - @message',
        'sanitize_placeholder: error while applying replacements after sql:sanitize. See logs.',
        'sanitize_placeholder: failed to emit IO error message during changed-targets failure.',
      );
    }
    else {
      // Fallback: nothing obviously changed — keep the previous behavior.
      $this->emitQuietly(fn () => $this->io()->section(
        'sanitize_placeholder: no changed fields detected; running with scope "sanitized" (seed: ' . $seed . ')'
      ));
      $ok = $this->runFakeGuarded([$fallbackRun], $fallbackLog, $fallbackIo, $fallbackIoLog);
    }

    $this->finishAfterSanitize($ok);
  }

  /**
   * Lists configured targets whose fingerprints differ before/after sanitize.
   *
   * Rules without any of entity/bundle/field, and rules lacking a fingerprint
   * on either side, are ignored.
   *
   * @param array<string, array{count:int, empties:int, samples:array<int,string>}> $after
   *   Fingerprints taken after sql:sanitize.
   *
   * @return array<int, array{entity:string, bundle:string, field:string}>
   *   Targets that look sanitized.
   */
  private function collectChangedTargets(array $after): array {
    $rules = (array) ($this->configFactory->get('sanitize_placeholder.settings')->get('replacements') ?? []);
    $changed = [];

    foreach ($rules as $r) {
      $key = sprintf('%s.%s.%s', $r['entity'] ?? '', $r['bundle'] ?? '', $r['field'] ?? '');
      if ($key === '..') {
        continue;
      }

      $beforeFp = $this->preSanitizeFingerprints[$key] ?? NULL;
      $afterFp = $after[$key] ?? NULL;
      if (!$beforeFp || !$afterFp) {
        continue;
      }

      if ($this->looksSanitizedByDiff($beforeFp, $afterFp)) {
        $changed[] = [
          'entity' => (string) $r['entity'],
          'bundle' => (string) $r['bundle'],
          'field'  => (string) $r['field'],
        ];
      }
    }

    return $changed;
  }

  /**
   * Executes fake() for each option set, converting failures into reports.
   *
   * Stops at the first failing run, logs the exception and prints an error.
   *
   * @param array<int, array<string, mixed>> $runs
   *   Option arrays to pass to fake(), in order.
   * @param string $logMessage
   *   Log message template for a failing run (uses @class and @message).
   * @param string $ioMessage
   *   Error text shown on the console for a failing run.
   * @param string $ioFailureLog
   *   Log message used when the console error itself cannot be printed.
   *
   * @return bool
   *   TRUE when every run completed, FALSE otherwise.
   */
  private function runFakeGuarded(array $runs, string $logMessage, string $ioMessage, string $ioFailureLog): bool {
    try {
      foreach ($runs as $options) {
        $this->fake($options);
      }
      return TRUE;
    }
    catch (\Throwable $e) {
      $this->logHookError($logMessage, $e);
      $this->emitQuietly(fn () => $this->io()->error($ioMessage), $ioFailureLog);
      return FALSE;
    }
  }

  /**
   * Invalidates UI caches and, on success, prints the completion message.
   *
   * @param bool $succeeded
   *   Whether the replacement run finished without an error. A failed run has
   *   already reported its error, so no success message is printed for it.
   */
  private function finishAfterSanitize(bool $succeeded): void {
    $this->invalidateUiCaches();
    if ($succeeded) {
      // Best-effort only; output problems must not surface during shutdown.
      $this->emitQuietly(fn () => $this->io()->success('sanitize_placeholder: completed after sql:sanitize.'));
    }
  }

  /**
   * Runs an output callback and swallows anything it throws.
   *
   * @param callable $emit
   *   Callback producing console output.
   * @param string|null $failureLog
   *   Optional message to log when the callback throws.
   */
  private function emitQuietly(callable $emit, ?string $failureLog = NULL): void {
    try {
      $emit();
    }
    catch (\Throwable) {
      if ($failureLog !== NULL) {
        $this->logHookError($failureLog);
      }
    }
  }

  /**
   * Logs an error if a PSR-3 logger is available on this command.
   *
   * @param string $message
   *   Log message; may contain @class and @message when $e is given.
   * @param \Throwable|null $e
   *   Optional exception supplying the placeholder values and context.
   */
  private function logHookError(string $message, ?\Throwable $e = NULL): void {
    if (!property_exists($this, 'logger') || !($this->logger instanceof LoggerInterface)) {
      return;
    }
    if ($e === NULL) {
      $this->logger->error($message);
      return;
    }
    $this->logger->error($message, [
      '@class' => get_class($e),
      '@message' => $e->getMessage(),
      'exception' => $e,
    ]);
  }

  /**
   * Narrow the configured rules to those matching the given limits.
   *
   * An empty string disables the corresponding limit. With no limit at all
   * the rules are returned untouched; otherwise the result is re-indexed.
   *
   * @param array $rules
   *   All rules from configuration.
   * @param string $entityLimit
   *   Optional entity type ID to include.
   * @param string $bundleLimit
   *   Optional bundle to include.
   * @param string $fieldLimit
   *   Optional field machine name to include.
   *
   * @return array
   *   Filtered rules.
   */
  private function filterRules(array $rules, string $entityLimit, string $bundleLimit, string $fieldLimit): array {
    // Keep only the limits that were actually supplied, keyed by rule key.
    $wanted = array_filter(
      ['entity' => $entityLimit, 'bundle' => $bundleLimit, 'field' => $fieldLimit],
      static fn (string $limit): bool => $limit !== '',
    );
    if ($wanted === []) {
      return $rules;
    }

    $matches = static function (array $rule) use ($wanted): bool {
      foreach ($wanted as $ruleKey => $expected) {
        if (($rule[$ruleKey] ?? '') !== $expected) {
          return FALSE;
        }
      }
      return TRUE;
    };

    return array_values(array_filter($rules, $matches));
  }

  /**
   * Find entity IDs to process.
   *
   * Runs an entity query without access checks, limited to $limit results and
   * restricted to $bundle when one is given and the entity type has a bundle
   * key. Results are ordered by the entity type's ID key (when it has one) so
   * that repeated runs with the same limit select the same entities.
   *
   * IDs are returned exactly as the query reports them. They are not cast to
   * integers because entity IDs may be strings, and casting those would turn
   * them into 0 and make the entities impossible to load.
   *
   * @param \Drupal\Core\Entity\EntityStorageInterface $storage
   *   Storage handler.
   * @param string $entityTypeId
   *   Entity type ID.
   * @param string $bundle
   *   Bundle (optional).
   * @param int $limit
   *   Max number of IDs to return.
   *
   * @return array<int, int|string>
   *   Entity IDs, re-indexed from zero.
   *
   * @throws \Drupal\Component\Plugin\Exception\PluginNotFoundException
   */
  private function findEntityIds(EntityStorageInterface $storage, string $entityTypeId, string $bundle, int $limit): array {
    $query = $storage->getQuery()->accessCheck(FALSE)->range(0, $limit);
    $entityType = $this->etm->getDefinition($entityTypeId);
    if ($bundle !== '' && $this->hasBundleKey($entityType)) {
      $query->condition($entityType->getKey('bundle'), $bundle);
    }

    // Same ordering as the fingerprint snapshot query.
    $idKey = $entityType->getKey('id');
    if (is_string($idKey) && $idKey !== '') {
      $query->sort($idKey);
    }

    $ids = $query->execute();
    return is_array($ids) ? array_values($ids) : [];
  }

  /**
   * Tell whether an entity type declares a usable bundle key.
   *
   * @param \Drupal\Core\Entity\EntityTypeInterface $def
   *   Entity type definition.
   *
   * @return bool
   *   TRUE if the "bundle" entity key is a non-empty string, FALSE otherwise.
   */
  private function hasBundleKey(EntityTypeInterface $def): bool {
    $bundleKey = $def->getKey('bundle');
    if (!is_string($bundleKey)) {
      return FALSE;
    }
    return $bundleKey !== '';
  }

  /**
   * Get a field definition for an entity/bundle/field combination.
   *
   * An empty bundle is replaced by the entity type ID, which is the bundle
   * name Drupal uses for entity types without bundles. This way configurable
   * fields on such entity types are found as well; base fields are returned
   * either way.
   *
   * @param string $entityTypeId
   *   Entity type ID.
   * @param string $bundle
   *   Bundle, or an empty string when the rule does not name one.
   * @param string $fieldName
   *   Field machine name.
   *
   * @return \Drupal\Core\Field\FieldDefinitionInterface|null
   *   Field definition or NULL if not found.
   */
  private function getFieldDefinition(string $entityTypeId, string $bundle, string $fieldName): ?FieldDefinitionInterface {
    $lookupBundle = $bundle !== '' ? $bundle : $entityTypeId;
    $defs = $this->efm->getFieldDefinitions($entityTypeId, $lookupBundle);
    return $defs[$fieldName] ?? NULL;
  }

  /**
   * Decide whether an entity's field qualifies for replacement.
   *
   * Scope "all" always qualifies, "empty" qualifies when the current value is
   * NULL or an empty string, and "sanitized" additionally accepts string
   * values that look like placeholders left behind by a sanitize step. Any
   * other scope never qualifies.
   *
   * @param \Drupal\Core\Entity\EntityInterface $entity
   *   Entity.
   * @param string $fieldName
   *   Field machine name.
   * @param string $scope
   *   Scope: all|empty|sanitized.
   *
   * @return bool
   *   TRUE if the entity/field should be processed.
   */
  private function matchScope(EntityInterface $entity, string $fieldName, string $scope): bool {
    $current = $this->readValue($entity, $fieldName);
    $isBlank = $current === '' || $current === NULL;

    // NULL means "keep evaluating the sanitized heuristics below".
    $verdict = match ($scope) {
      'all' => TRUE,
      'empty' => $isBlank,
      'sanitized' => NULL,
      // Unknown scope: be conservative.
      default => FALSE,
    };
    if ($verdict !== NULL) {
      return $verdict;
    }

    // Many sanitizers blank fields. Consider empty/NULL as "sanitized".
    if ($isBlank) {
      return TRUE;
    }
    if (!is_string($current)) {
      return FALSE;
    }

    // Common placeholder tokens used by sanitize steps.
    if (preg_match('/\b(lorem|ipsum|redacted|removed|sanitized)\b/i', $current) === 1) {
      return TRUE;
    }

    // E-mail-ish values: decided solely by example/invalid/test markers.
    if (str_contains($current, '@')) {
      $address = mb_strtolower($current, 'UTF-8');
      foreach (['example.', 'invalid', 'localhost', 'test'] as $marker) {
        if (str_contains($address, $marker)) {
          return TRUE;
        }
      }
      return FALSE;
    }

    // Site-specific placeholders often seen after sanitize runs.
    $knownPlaceholders = ['john', 'doe', 'john_doe', 'example co'];
    if (in_array(mb_strtolower($current, 'UTF-8'), $knownPlaceholders, TRUE)) {
      return TRUE;
    }

    // Very generic fake-y patterns: "user123" or "Name X.".
    return preg_match('/^user[-_ ]?\d+$/i', $current) === 1
      || preg_match('/^[A-Z][a-z]+ [A-Z]\.?$/', $current) === 1;
  }

  /**
   * Read a single-value field from an entity.
   *
   * For fieldable entities the first field item is inspected: its "value"
   * property is returned when the item has one, otherwise the item's string
   * representation. A fieldable entity that does not have the field yields
   * NULL instead of raising an exception. Entities that are not fieldable are
   * read through their get() method when they provide one.
   *
   * @param \Drupal\Core\Entity\EntityInterface $entity
   *   Entity.
   * @param string $fieldName
   *   Field machine name.
   *
   * @return mixed
   *   The scalar value or NULL if empty.
   */
  private function readValue(EntityInterface $entity, string $fieldName): mixed {
    // hasField() only exists on fieldable entities; calling it on any other
    // entity would be a fatal error.
    if ($entity instanceof FieldableEntityInterface) {
      if (!$entity->hasField($fieldName)) {
        return NULL;
      }
      $list = $entity->get($fieldName);
      if ($list->isEmpty()) {
        return NULL;
      }
      $first = $list->first();
      if (!$first) {
        return NULL;
      }
      $v = $first->getValue();
      if (is_array($v) && array_key_exists('value', $v)) {
        return $v['value'];
      }
      // Field items cannot be cast with (string); use the typed data string
      // representation for items without a "value" property.
      return $first->getString();
    }

    return method_exists($entity, 'get') ? $entity->get($fieldName) : NULL;
  }

  /**
   * Write a scalar-like value back to the entity field.
   *
   * Text fields are written together with a NULL format (and an empty summary
   * for text_with_summary). list_string fields are only written when the
   * value is one of the allowed keys or labels; otherwise the field is left
   * untouched. Every other field type receives the value in its "value"
   * property. Entities that are not fieldable, or do not have the field, are
   * written through their set() method when they provide one.
   *
   * @param \Drupal\Core\Entity\EntityInterface $entity
   *   Entity.
   * @param string $fieldName
   *   Field machine name.
   * @param string $value
   *   Value to write.
   */
  private function writeValue(EntityInterface $entity, string $fieldName, string $value): void {
    // hasField() only exists on fieldable entities; guard the call so other
    // entities reach the generic set() branch instead of a fatal error.
    if ($entity instanceof FieldableEntityInterface && $entity->hasField($fieldName)) {
      $def = $entity->getFieldDefinition($fieldName);
      $type = $def->getType();

      switch ($type) {
        case 'text':
        case 'text_long':
          $entity->set($fieldName, ['value' => $value, 'format' => NULL]);
          return;

        case 'text_with_summary':
          $entity->set($fieldName, ['value' => $value, 'summary' => '', 'format' => NULL]);
          return;

        case 'list_string':
          $allowed = (array) ($def->getSettings()['allowed_values'] ?? []);
          // Accept the value as either an allowed key or an allowed label.
          $isAllowed = array_key_exists($value, $allowed) || in_array($value, $allowed, TRUE);
          if ($allowed && !$isAllowed) {
            return;
          }
          // Fall through to the generic writer below.
          break;

        // No default: everything else uses the generic writer below.
      }

      // Generic writer for scalar-like fields
      // (string, integer, email, uri, etc.).
      $entity->set($fieldName, ['value' => $value]);
      return;
    }

    if (method_exists($entity, 'set')) {
      $entity->set($fieldName, $value);
    }
  }

  /**
   * Truncate a generated value to the tightest applicable maximum length.
   *
   * Candidate limits are the field's max_length setting and, for usernames
   * (the user "name" field or the "username" strategy), the
   * sanitize_placeholder.settings:username_max_length value. Only positive
   * numeric limits count; the smallest one wins. Without any usable limit
   * the value is returned unchanged. Never throws.
   *
   * @param string $value
   *   Original value.
   * @param \Drupal\Core\Field\FieldDefinitionInterface $fieldDef
   *   Field definition.
   * @param string $entityTypeId
   *   Entity type ID.
   * @param string $fieldName
   *   Field machine name.
   * @param string $strategyId
   *   Strategy ID used to generate the value.
   *
   * @return string
   *   Possibly truncated value.
   */
  private function enforceLength(string $value, FieldDefinitionInterface $fieldDef, string $entityTypeId, string $fieldName, string $strategyId): string {
    // Field-level max_length (string-like fields).
    $candidates = [$fieldDef->getSetting('max_length')];

    // Module config for username lengths.
    $targetsUsername = $strategyId === 'username' || ($entityTypeId === 'user' && $fieldName === 'name');
    if ($targetsUsername) {
      $candidates[] = $this->configFactory->get('sanitize_placeholder.settings')->get('username_max_length');
    }

    $limits = [];
    foreach ($candidates as $candidate) {
      if (is_numeric($candidate) && (int) $candidate > 0) {
        $limits[] = (int) $candidate;
      }
    }
    if ($limits === []) {
      return $value;
    }

    // Every collected limit is positive, so the minimum is as well.
    $max = min($limits);

    // Multibyte-safe truncate.
    return mb_strlen($value, 'UTF-8') > $max
      ? mb_substr($value, 0, $max, 'UTF-8')
      : $value;
  }

  /**
   * Invalidate the cache tags that may hold stale output after replacements.
   *
   * Invalidates the "rendered", "user_list" and module settings tags. A
   * failure is logged as a warning (when a logger is available) and otherwise
   * ignored.
   */
  private function invalidateUiCaches(): void {
    $tags = [
      'rendered',
      'user_list',
      'config:sanitize_placeholder.settings',
    ];

    try {
      $this->cacheInvalidator->invalidateTags($tags);
    }
    catch (\Throwable $failure) {
      // Best effort only — log and continue.
      $loggerUsable = property_exists($this, 'logger') && $this->logger instanceof LoggerInterface;
      if (!$loggerUsable) {
        return;
      }
      $context = [
        '@class' => get_class($failure),
        '@message' => $failure->getMessage(),
        'exception' => $failure,
      ];
      $this->logger->warning('sanitize_placeholder: cache invalidation failed: @class - @message', $context);
    }
  }

  /**
   * Collect a fingerprint for every configured entity/bundle/field rule.
   *
   * For each rule with an entity type and a field name, up to $perFieldLimit
   * entities (ordered by ID key when available) are loaded and their field
   * value is read. The fingerprint records how many entities were inspected,
   * how many had an empty value, and up to 50 distinct normalized sample
   * values. Inspection of a rule stops once the 50th sample is collected.
   *
   * @param int|null $perFieldLimit
   *   Max entities to sample per field; defaults to 200 when NULL.
   *
   * @return array<string, array{count:int, empties:int, samples:array<int,string>}>
   *   A map keyed by "entity.bundle.field" with simple stats and sample values.
   *
   * @throws \Drupal\Component\Plugin\Exception\InvalidPluginDefinitionException
   * @throws \Drupal\Component\Plugin\Exception\PluginNotFoundException
   */
  private function snapshotFieldFingerprints(?int $perFieldLimit = NULL): array {
    $sampleSize = $perFieldLimit ?? 200;
    $result = [];
    $configured = (array) ($this->configFactory->get('sanitize_placeholder.settings')->get('replacements') ?? []);

    foreach ($configured as $rule) {
      $type = (string) ($rule['entity'] ?? '');
      $bundle = (string) ($rule['bundle'] ?? '');
      $field = (string) ($rule['field'] ?? '');
      if ($type === '' || $field === '') {
        // Nothing to sample without a target entity type and field.
        continue;
      }

      $storage = $this->etm->getStorage($type);
      $definition = $this->etm->getDefinition($type);
      $idKey = $definition->getKey('id');

      $query = $storage->getQuery()->accessCheck(FALSE)->range(0, $sampleSize);
      if ($bundle !== '' && $this->hasBundleKey($definition)) {
        $query->condition($definition->getKey('bundle'), $bundle);
      }
      if (is_string($idKey) && $idKey !== '') {
        // Stable ordering keeps before/after snapshots comparable.
        $query->sort($idKey);
      }
      $found = $query->execute();
      $entityIds = is_array($found) ? array_values($found) : [];

      $stats = ['count' => 0, 'empties' => 0, 'samples' => []];
      // Normalized values already recorded, used as a lookup set.
      $seen = [];

      foreach ($entityIds as $entityId) {
        $entity = $storage->load($entityId);
        if (!($entity instanceof EntityInterface)) {
          continue;
        }
        $stats['count']++;

        $raw = $this->readValue($entity, $field);
        if ($raw === NULL || $raw === '') {
          $stats['empties']++;
          continue;
        }

        $sample = $this->normalizeSampleValue((string) $raw);
        if (isset($seen[$sample])) {
          continue;
        }
        $seen[$sample] = TRUE;
        $stats['samples'][] = $sample;
        if (count($stats['samples']) >= 50) {
          break;
        }
      }

      $result[sprintf('%s.%s.%s', $type, $bundle, $field)] = $stats;
    }

    return $result;
  }

  /**
   * Heuristic: compare two fingerprints and guess whether sanitize ran.
   *
   * Both fingerprints need at least 5 inspected entities. The field counts as
   * sanitized when at least 25% of the "after" entities are empty and at
   * least 5 more are empty than before, or when the Jaccard similarity of
   * the distinct sample sets is below 0.5.
   *
   * @param array{count:int, empties:int, samples:array<int,string>} $before
   *   Fingerprint before sanitize.
   * @param array{count:int, empties:int, samples:array<int,string>} $after
   *   Fingerprint after sanitize.
   *
   * @return bool
   *   TRUE if the change suggests the field was sanitized.
   */
  private function looksSanitizedByDiff(array $before, array $after): bool {
    $enoughData = ($before['count'] ?? 0) >= 5 && ($after['count'] ?? 0) >= 5;
    if (!$enoughData) {
      return FALSE;
    }

    // Signal 1: the field was blanked on a noticeable share of entities.
    $newEmpties = $after['empties'] - $before['empties'];
    $emptyRate = $after['empties'] / max(1, $after['count']);
    if ($newEmpties >= 5 && $emptyRate >= 0.25) {
      return TRUE;
    }

    // Signal 2: the sets of distinct sample values barely overlap.
    $old = array_unique($before['samples']);
    $new = array_unique($after['samples']);
    if ($old === [] && $new === []) {
      return FALSE;
    }

    $shared = count(array_intersect($old, $new));
    $combined = count(array_unique(array_merge($old, $new)));
    $similarity = $combined === 0 ? 1.0 : $shared / $combined;

    return $similarity < 0.5;
  }

  /**
   * Normalize a sample value to mute random noise (digits, whitespace, case).
   *
   * The value is trimmed and lower-cased, every run of digits becomes "#"
   * (for e-mail addresses this covers both the local part and the domain),
   * and every run of whitespace becomes a single space.
   *
   * @param string $value
   *   The original value.
   *
   * @return string
   *   Normalized sample string.
   */
  private function normalizeSampleValue(string $value): string {
    $lowered = mb_strtolower(trim($value), 'UTF-8');

    // Patterns are applied in order: digits first, then whitespace.
    $normalized = preg_replace(['/\d+/', '/\s+/'], ['#', ' '], $lowered);

    return (string) $normalized;
  }

}
