import re
import json
import traceback
from pathlib import Path

from bs4 import BeautifulSoup
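
# Compares asset entries in an exported HTML table against files already on
# disk and writes the apparently missing items to a JSON report.
# Flow: parse table rows -> sanitize names -> scan download dir -> diff -> report.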

# --- Configuration ---
HTML_FILE = Path("Asset Store.htm")  # Exported HTML table of assets
DOWNLOAD_BASE_DIR = Path("CSP_764")  # VERIFY THIS PATH
OUTPUT_FILE = Path("missing_files_report.json")  # JSON report of missing items

# --- Column indices (verified against Asset Store.htm) ---
NAME_COLUMN_INDEX = 0  # "Name" column
DL_COLUMN_INDEX = 2  # "DL" (download link) column
# ----------------------------------------------------------

# Known asset extensions. Currently informational only: the comparison below
# matches on base names (stems), not extensions.
COMMON_EXTENSIONS = [
    ".zip",
    ".rar",
    ".sut",
    ".clip",
    ".7z",
    ".png",
    ".jpg",
    ".jpeg",
    ".txt",
    ".unknown",
    ".mp4",
    ".webm",
    ".pdf",
    ".unitypackage",
    ".brushset",
    ".abr",
    ".tpl",
    ".ai",
]


# --- Sanitization Function (Crucial: Must match downloader's logic) ---
def sanitize_filename(name: str) -> str:
    """Remove or replace characters that are invalid in filenames.

    This must mirror the downloader's sanitization exactly, otherwise names
    derived from the HTML will never match files on disk.
    """
    if not name:
        return "_unknown_"
    sanitized = re.sub(r'[\\/*?:"<>|]', "", name)  # drop characters invalid on Windows
    sanitized = re.sub(r"\s+", "_", sanitized)  # collapse whitespace runs to underscores
    sanitized = sanitized.strip("._")  # trim leading/trailing dots and underscores
    if not sanitized:
        return "_unknown_"
    return sanitized
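
# Example transformations (illustrative):
#   sanitize_filename("My Brush: Set v2.0") -> "My_Brush_Set_v2.0"
#   sanitize_filename("???")                -> "_unknown_"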


# --- Helper Functions ---


def extract_items_from_html(html_file: Path) -> list[tuple[str, str]]:
    """
    Parses the HTML file (assuming a table structure) to find asset names
    from the 'Name' column and their corresponding download links from the 'DL' column.
    Returns a list of tuples: (extracted_name, download_url)
    """
    items = []
    print(f"--- Parsing HTML file: {html_file.resolve()} ---")
    if not html_file.is_file():
        print(f"Error: HTML file not found: {html_file.resolve()}")
        return items

    try:
        with open(html_file, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "html.parser")

        rows = soup.find_all("tr")
        print(f"Found {len(rows)} table rows (<tr>).")
        if not rows:
            print(
                "Warning: No table rows found. Cannot extract items based on table structure."
            )
            return items

        processed_hrefs = set()
        max_expected_index = max(NAME_COLUMN_INDEX, DL_COLUMN_INDEX)

        for i, row in enumerate(rows):
            cells = row.find_all("td")
            if len(cells) <= max_expected_index:
                continue

            # Indexing is safe here: row length was validated above.
            asset_name = cells[NAME_COLUMN_INDEX].get_text(strip=True)

            dl_href = None
            link_tag = cells[DL_COLUMN_INDEX].find("a", href=True)
            if link_tag:
                temp_href = link_tag["href"].strip()
                # Ignore in-page anchors and javascript pseudo-links.
                if temp_href and not temp_href.startswith(("#", "javascript:")):
                    dl_href = temp_href

            if asset_name and dl_href and dl_href not in processed_hrefs:
                if asset_name == dl_href:
                    print(
                        f"Warning: Row {i}, Name is the same as DL href, skipping: {dl_href}"
                    )
                    continue
                items.append((asset_name, dl_href))
                processed_hrefs.add(dl_href)

    except FileNotFoundError:
        print(f"Error: HTML file not found during open: {html_file.resolve()}")
        return []
    except Exception as e:
        print(f"Error parsing HTML file {html_file}: {e}")
        traceback.print_exc()

    print(
        f"Extracted {len(items)} potential asset items (Name/DL link pairs) from table rows."
    )
    print("--- Finished parsing HTML ---")
    return items
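
# Assumed row layout, based on the column indices above (illustrative only):
#   <tr><td>Asset Name</td><td>...</td><td><a href="https://...">DL</a></td></tr>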


def scan_download_directory(base_dir: Path) -> set[str]:
    """
    Recursively scans the download directory and returns a set of
    base filenames (without extensions) found.
    """
    existing_base_filenames = set()
    print(f"--- Scanning download directory: {base_dir.resolve()} ---")
    if not base_dir.is_dir():
        print(f"Error: Download base directory not found: {base_dir.resolve()}")
        return existing_base_filenames

    file_count = 0
    # File types that are never assets: logs, partial downloads, and this
    # script's own inputs/outputs. Compared case-insensitively.
    excluded_suffixes = {
        ".txt",
        ".log",
        ".crdownload",
        ".part",
        ".tmp",
        ".py",
        ".htm",
        ".html",
        ".json",
    }
    for item_path in base_dir.rglob("*"):
        if item_path.is_file():
            if item_path.suffix.lower() in excluded_suffixes:
                continue
            file_count += 1
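            # Note: Path.stem strips only the final suffix, so "pack.tar.gz"
            # yields "pack.tar" and would not match a sanitized name of "pack".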
            base_name = item_path.stem
            if base_name:
                existing_base_filenames.add(base_name)

    print(f"Scanned {file_count} potential asset files.")
    print(f"Found {len(existing_base_filenames)} unique base filenames in {base_dir}.")
    print(f"--- Finished scanning directory ---")
    return existing_base_filenames


# --- Main Script Execution ---
if __name__ == "__main__":
    print("--- Starting Comparison (HTML vs Disk Files) ---")

    html_items = extract_items_from_html(HTML_FILE)
    if not html_items:
        print("No potential items extracted from HTML. Cannot perform comparison.")
        raise SystemExit(1)

    existing_files_base_set = scan_download_directory(DOWNLOAD_BASE_DIR)
    if not existing_files_base_set:
        print(
            f"Warning: No files found in {DOWNLOAD_BASE_DIR}. All HTML items will be marked as missing."
        )

    missing_items_details = []
    found_count = 0
    checked_sanitized_names = set()

    print("\n--- Checking HTML items against files on disk ---")
    for original_name, dl_href in html_items:
        sanitized_name = sanitize_filename(original_name)
        if (
            not sanitized_name
            or sanitized_name == "_unknown_"
            or sanitized_name in checked_sanitized_names
        ):
            continue
        checked_sanitized_names.add(sanitized_name)
        is_found = sanitized_name in existing_files_base_set
        if is_found:
            found_count += 1
        else:
            missing_items_details.append(
                {
                    "name_html": original_name,
                    "sanitized_name": sanitized_name,
                    "download_url": dl_href,
                }
            )  # Store as dictionary for JSON

    # Sort the list of missing item dictionaries by 'name_html'
    missing_items_details.sort(key=lambda item: item["name_html"].lower())

    # --- Prepare data for JSON output ---
    total_items_checked = len(checked_sanitized_names)
    missing_count = len(missing_items_details)

    output_data = {
        "report_summary": {
            "total_items_checked_html": total_items_checked,
            "items_found_on_disk": found_count,
            "items_missing_on_disk": missing_count,
            "source_html_file": str(HTML_FILE.resolve()),
            "scanned_directory": str(DOWNLOAD_BASE_DIR.resolve()),
        },
        "missing_items": missing_items_details,  # Already a list of dictionaries
    }
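
    # Illustrative report shape (example values, not real output):
    # {
    #     "report_summary": {"total_items_checked_html": 120, ...},
    #     "missing_items": [{"name_html": "...", "sanitized_name": "...", "download_url": "..."}]
    # }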

    # --- Report summary to console ---
    print(f"\n--- Comparison Summary ---")
    print(f"Total unique sanitized names checked from HTML: {total_items_checked}")
    print(f"Items FOUND as files on disk (based on base name): {found_count}")
    print(
        f"Items from HTML potentially MISSING on disk: {missing_count} out of {total_items_checked}"
    )

    # --- Write JSON data to the output file ---
    try:
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            # indent=4 pretty-prints; ensure_ascii=False keeps non-ASCII names readable
            json.dump(output_data, f, indent=4, ensure_ascii=False)
        print(
            f"Missing files report written in JSON format to: {OUTPUT_FILE.resolve()}"
        )
    except Exception as e:
        print(f"Error writing JSON output file {OUTPUT_FILE}: {e}")

    print("--- Comparison Finished ---")
