# deduplicator.py
import os
import re
import sys
import math
from collections import defaultdict
import logging

# --- Configuration ---
LOG_FILENAME_SUFFIX = "_successful.log"
# Use the same logging setup as the main script for consistency (optional)
APP_LOG_FILE = "deduplicator.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)-8s - %(message)s",
    handlers=[
        logging.FileHandler(APP_LOG_FILE, mode="w", encoding="utf-8"),
        logging.StreamHandler(sys.stdout),  # Also print to console
    ],
)
logger = logging.getLogger(__name__)
# --- End Configuration ---

# --- Helper Functions ---


def _format_size(size_bytes):
    """Formats bytes into a human-readable string."""
    if size_bytes is None or not isinstance(size_bytes, (int, float)) or size_bytes < 0:
        return "Unknown size"
    if size_bytes == 0:
        return "0 B"
    if "math" not in sys.modules:
        return f"{size_bytes} B"
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    try:
        i = int(math.floor(math.log(size_bytes, 1024)))
        i = max(0, min(i, len(size_name) - 1))  # Clamp index to the available units
        p = math.pow(1024, i)
        s = round(size_bytes / p, 2)
        return f"{s} {size_name[i]}"
    except (ValueError, OverflowError):
        return f"{size_bytes} B"

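# Example values (illustrative, a quick sanity check of the formatter above):
#   _format_size(0)        -> "0 B"
#   _format_size(1536)     -> "1.5 KB"
#   _format_size(10485760) -> "10.0 MB"
#   _format_size(-1)       -> "Unknown size"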

def parse_success_log_line(line):
    """Parses a line from the success log (Filename | Size: X). Returns (filename, size_str) or None."""
    # Expects format like: "My_File_1.zip | Size: 10.5 MB"
    parts = line.split(" | Size:", 1)
    if len(parts) == 2:
        filename = parts[0].strip()
        size_str = parts[1].strip()
        return filename, size_str
    else:
        logger.warning(f"Could not parse success log line: {line}")
        return None, None

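# Example (illustrative): parse_success_log_line("My_File_1.zip | Size: 10.5 MB")
# returns ("My_File_1.zip", "10.5 MB"); a line missing the " | Size:" separator
# returns (None, None).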

def get_base_name_and_suffix_index(filename):
    """Extracts base name, suffix index (like _1, _2), and extension."""
    match = re.match(r"^(.*?)_(\d+)(\.[^.]+)$", filename)  # Matches base_N.ext
    if match:
        base = match.group(1)
        index = int(match.group(2))
        ext = match.group(3)
        return base, index, ext
    else:
        # No suffix found, check if it has an extension
        base, ext = os.path.splitext(filename)
        if ext:
            return base, 0, ext  # Treat no suffix as index 0
        else:
            return filename, 0, ""  # No extension either

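# Examples (illustrative):
#   get_base_name_and_suffix_index("report_2.pdf") -> ("report", 2, ".pdf")
#   get_base_name_and_suffix_index("report.pdf")   -> ("report", 0, ".pdf")
#   get_base_name_and_suffix_index("README")       -> ("README", 0, "")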

# --- Core Logic Functions ---


def deduplicate_success_log(log_filepath):
    """Reads the success log, removes duplicate entries based on base name and size, and overwrites the log."""
    if not os.path.exists(log_filepath):
        logger.error(f"Success log file not found: {log_filepath}")
        return

    logger.info(f"Processing success log: {log_filepath}")
    # { base_name_key (base + ext): [ (filename, size_str, original_line), ... ], ... }
    base_name_groups = defaultdict(list)
    kept_lines = []
    removed_count = 0

    try:
        with open(log_filepath, "r", encoding="utf-8") as f:
            original_lines = f.readlines()

        # First pass: group entries by base name + extension
        for line in original_lines:
            line = line.strip()
            if not line:
                continue
            filename, size_str = parse_success_log_line(line)
            if filename and size_str:
                base, _index, ext = get_base_name_and_suffix_index(filename)
                base_name_key = base + ext  # Group by base + extension
                base_name_groups[base_name_key].append((filename, size_str, line))
            else:
                # Keep unparseable lines so a rewrite never silently drops them
                kept_lines.append(line)

        # Second pass: decide which lines to keep
        for base_key, items in base_name_groups.items():
            if len(items) <= 1:
                # Only one entry for this base name, keep it
                kept_lines.append(items[0][2])
                continue

            # Multiple entries: group by size to find exact duplicates
            # { (base_name_key, size_str): [ (filename, suffix_index, original_line), ... ], ... }
            size_groups = defaultdict(list)
            for filename, size_str, original_line in items:
                base, index, ext = get_base_name_and_suffix_index(filename)
                size_groups[(base_key, size_str)].append(
                    (filename, index, original_line)
                )

            # Process each size group for this base name
            for (bk, sz), size_items in size_groups.items():
                if len(size_items) <= 1:
                    # Only one item with this specific size, keep it
                    kept_lines.append(size_items[0][2])
                else:
                    # Found exact duplicates (same base name, same size)
                    logger.info(
                        f"Found {len(size_items)} duplicates for '{bk}' with size '{sz}': {[item[0] for item in size_items]}"
                    )
                    # Decide which one to keep: prefer non-suffixed (index 0), then lowest index
                    best_to_keep = min(
                        size_items, key=lambda x: x[1]
                    )  # Find item with lowest index
                    kept_lines.append(best_to_keep[2])
                    logger.info(f"  Keeping log entry: {best_to_keep[0]}")
                    removed_count += len(size_items) - 1

        # Overwrite the log file if changes were made
        if removed_count > 0:
            logger.warning(
                f"Overwriting log file {log_filepath} to remove {removed_count} duplicate entries."
            )
            # Sort kept lines for consistency (optional)
            kept_lines.sort()
            with open(log_filepath, "w", encoding="utf-8") as f:
                for line in kept_lines:
                    f.write(line + "\n")
            logger.info("Success log deduplication complete.")
        else:
            logger.info("No duplicate entries found in the success log.")

    except Exception as e:
        logger.error(f"Error processing success log {log_filepath}: {e}", exc_info=True)


def deduplicate_filesystem(target_dir_base):
    """Scans the target directory for duplicate files (base name + size) and deletes them."""
    if not os.path.isdir(target_dir_base):
        logger.error(f"Target directory not found: {target_dir_base}")
        return

    logger.info(f"Scanning directory for duplicates: {target_dir_base}")
    # { (dirname, base_name_key): [ (filepath, size, suffix_index), ... ], ... }
    potential_duplicates = defaultdict(list)
    files_to_delete = []
    total_bytes_saved = 0

    # Walk through the directory structure
    for root, dirs, files in os.walk(target_dir_base):
        # Skip Selenium temp worker directories, pruning them so os.walk does
        # not descend into their subdirectories either
        if os.path.basename(root).startswith("worker_") and "selenium_temp" in root:
            logger.debug(f"Skipping Selenium temp dir: {root}")
            dirs[:] = []
            continue

        logger.debug(f"Scanning directory: {root}")
        for filename in files:
            filepath = os.path.join(root, filename)
            try:
                size = os.path.getsize(filepath)
                base, index, ext = get_base_name_and_suffix_index(filename)
                base_name_key = base + ext  # Group by base + extension
                dir_key = os.path.dirname(filepath)  # Group by directory as well
                potential_duplicates[(dir_key, base_name_key)].append(
                    (filepath, size, index)
                )
            except OSError as e:
                logger.warning(f"Could not process file {filepath}: {e}")

    # Identify duplicates based on size within each group
    for (dir_key, base_key), items in potential_duplicates.items():
        if len(items) <= 1:
            continue  # No potential duplicates for this base name

        # Group by size
        size_groups = defaultdict(list)
        for filepath, size, index in items:
            size_groups[size].append((filepath, index))

        # Check groups with same size
        for size, size_items in size_groups.items():
            if len(size_items) > 1:
                # Found files with same base name AND same size
                logger.info(
                    f"Found {len(size_items)} potential duplicates for '{base_key}' (Size: {_format_size(size)}) in {os.path.relpath(dir_key, target_dir_base)}:"
                )
                for filepath, index in size_items:
                    logger.info(f"  - {os.path.basename(filepath)} (Index: {index})")

                # Decide which one to keep: prefer non-suffixed (index 0), then lowest index
                size_items.sort(key=lambda x: x[1])  # Sort by index
                file_to_keep = size_items[0][0]
                logger.info(f"  Keeping file: {os.path.basename(file_to_keep)}")

                # Mark others for deletion
                for filepath_to_delete, _ in size_items[1:]:
                    files_to_delete.append((filepath_to_delete, size))
                    total_bytes_saved += size

    # Perform deletions with confirmation
    if not files_to_delete:
        logger.info("No duplicate files found on filesystem.")
        return

    logger.warning(f"\n--- Found {len(files_to_delete)} duplicate files to delete ---")
    for filepath, size in files_to_delete:
        logger.warning(f"  - {filepath} ({_format_size(size)})")

    confirm = (
        input(f"\nProceed with deleting these {len(files_to_delete)} files? (y/N): ")
        .strip()
        .lower()
    )
    if confirm == "y":
        deleted_count = 0
        failed_count = 0
        logger.info("Proceeding with deletion...")
        for filepath, size in files_to_delete:
            try:
                os.remove(filepath)
                logger.info(f"Deleted: {filepath}")
                deleted_count += 1
            except OSError as e:
                logger.error(f"Failed to delete {filepath}: {e}")
                failed_count += 1
        logger.info(
            f"Deletion complete. Deleted: {deleted_count}, Failed: {failed_count}. Saved approx: {_format_size(total_bytes_saved)}"
        )
    else:
        logger.info("Deletion cancelled by user.")


# --- Main Execution ---
if __name__ == "__main__":
    logger.info("--- Deduplicator Script Started ---")
    base_name = input(
        "Enter the base name used for logs/directory (e.g., CSP_764_2): "
    ).strip()

    if not base_name:
        print("Error: Base name cannot be empty.")
        sys.exit(1)

    log_filepath = f"{base_name}{LOG_FILENAME_SUFFIX}"
    target_dir = base_name  # Assuming output dir name matches base name
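    # Example (illustrative): entering "CSP_764_2" processes the log file
    # "CSP_764_2_successful.log" and scans the "CSP_764_2" directory.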

    print("-" * 30)
    deduplicate_success_log(log_filepath)
    print("-" * 30)
    deduplicate_filesystem(target_dir)
    print("-" * 30)

    logger.info("--- Deduplicator Script Finished ---")
