# parser.py
import requests
from bs4 import BeautifulSoup
import re
import os
import time
from urllib.parse import urljoin, urlparse
import lxml  # Imported only so startup fails early if the lxml parser backend (used by BeautifulSoup below) is missing

# Import ONLY app_logger from logger
from logger import app_logger

# --- Constants ---
VALID_SCHEMES = {"http", "https"}
# Add more extensions if needed
DIRECT_LINK_EXTENSIONS = {
    ".zip",
    ".rar",
    ".7z",
    ".tar",
    ".gz",
    ".bz2",
    ".xz",
    ".exe",
    ".msi",
    ".dmg",
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".bmp",
    ".webp",
    ".svg",
    ".tif",
    ".tiff",
    ".mp4",
    ".mkv",
    ".avi",
    ".mov",
    ".wmv",
    ".webm",
    ".mp3",
    ".wav",
    ".ogg",
    ".flac",
    ".aac",
    ".pdf",
    ".doc",
    ".docx",
    ".xls",
    ".xlsx",
    ".ppt",
    ".pptx",
    ".txt",
    ".iso",
    ".img",
    ".sut",
    ".clip",
    ".brush",
    ".abr",
    ".tpl",  # Art specific
    ".psd",
    ".csp",
    ".sai",
    ".sai2",
    ".kra",  # Art specific
    ".unitypackage",
    ".blend",
    ".fbx",
    ".obj",
    ".mtl",  # 3D/Game Dev
    ".otf",
    ".ttf",
    ".woff",
    ".woff2",  # Fonts
}
# Add more hostnames known for non-direct links
PAGE_LIKE_HOSTNAMES = {
    "rentry.org",
    "rentry.co",
    "pastebin.com",
    "github.com",
    "gitlab.com",
    "bitbucket.org",
    "youtube.com",
    "youtu.be",
    "vimeo.com",
    "twitter.com",
    "x.com",
    "reddit.com",
    "tumblr.com",
    "facebook.com",
    "instagram.com",
    "imgur.com",  # Often gallery pages unless i.imgur.com
    "drive.google.com",
    "dropbox.com",
    "onedrive.live.com",
    "box.com",
    "mega.nz",
    "mediafire.com",
    "pixeldrain.com",  # Handled by Selenium check mostly
    "gofile.io",
    "anonfiles.com",
    "krakenfiles.com",
    "sendspace.com",
    "zippyshare.com",
    "1fichier.com",
    "itch.io",
    "artstation.com",
    "gumroad.com",
    "patreon.com",
    "subscribestar.adult",
    "fanbox.cc",
    "boosty.to",
    "booth.pm",
    "dlsite.com",
}


# --- Utility Functions ---


def sanitize_filename(name):
    """Removes or replaces characters invalid for Windows/Linux filenames."""
    if not name:
        return f"download_{int(time.time())}"
    reserved_names = {
        "CON",
        "PRN",
        "AUX",
        "NUL",
        "COM1",
        "COM2",
        "COM3",
        "COM4",
        "COM5",
        "COM6",
        "COM7",
        "COM8",
        "COM9",
        "LPT1",
        "LPT2",
        "LPT3",
        "LPT4",
        "LPT5",
        "LPT6",
        "LPT7",
        "LPT8",
        "LPT9",
    }
    sanitized = re.sub(r'[<>:"/\\|?*\x00-\x1F]', "_", name)
    sanitized = re.sub(r"\s+", "_", sanitized)
    sanitized = sanitized.strip("._ ")

    base_name_check, _ = os.path.splitext(sanitized)
    if base_name_check.upper() in reserved_names:
        sanitized = "_" + sanitized

    if not sanitized:
        sanitized = re.sub(r"[^\w\-. ]", "_", name)
        sanitized = re.sub(r"\s+", "_", sanitized).strip("._ ")
        if not sanitized:
            return f"download_{int(time.time())}"
        base_name_check, _ = os.path.splitext(sanitized)
        if base_name_check.upper() in reserved_names:
            sanitized = "_" + sanitized

    max_len = 200
    if len(sanitized) > max_len:
        base, ext = os.path.splitext(sanitized)
        if len(ext) < 10:
            sanitized = base[: max_len - len(ext)] + ext
        else:
            sanitized = sanitized[:max_len]
        sanitized = sanitized.strip("._ ")

    return sanitized if sanitized else f"download_{int(time.time())}"


def get_final_desired_filename(desired_base, actual_filename):
    """Constructs the final filename, preferring desired base + actual extension."""
    sanitized_base = sanitize_filename(desired_base) if desired_base else ""

    if not actual_filename:
        return (
            sanitized_base + ".unknown"
            if sanitized_base
            else f"download_{int(time.time())}.unknown"
        )

    _, actual_ext = os.path.splitext(actual_filename)
    actual_ext_lower = actual_ext.lower()

    temp_extensions = {".part", ".crdownload", ".tmp", ".download"}
    if actual_ext_lower in temp_extensions:
        actual_ext = ""
        actual_ext_lower = ""

    if not sanitized_base:
        return sanitize_filename(actual_filename)

    _, desired_ext = os.path.splitext(sanitized_base)
    desired_ext_lower = desired_ext.lower()

    # If desired name already has the correct (non-temp) extension, use it
    if desired_ext_lower and desired_ext_lower == actual_ext_lower:
        return sanitized_base

    # If desired name has wrong extension, remove it and add actual (if valid)
    if desired_ext_lower and actual_ext_lower and desired_ext_lower != actual_ext_lower:
        base_without_ext, _ = os.path.splitext(sanitized_base)
        # Check if stripping ext made it empty
        if not base_without_ext:
            base_without_ext = sanitized_base
        return sanitize_filename(
            base_without_ext + actual_ext
        )  # Use actual_ext (original case)

    # If desired name has no extension, add actual (if valid)
    if not desired_ext_lower and actual_ext_lower:
        return sanitize_filename(
            sanitized_base + actual_ext
        )  # Use actual_ext (original case)

    # Otherwise (e.g., actual has no valid ext), return sanitized base
    return sanitized_base


def is_likely_direct_link(url):
    """Heuristic to guess if a URL points directly to a downloadable file."""
    try:
        parsed = urlparse(url)
        if parsed.scheme not in VALID_SCHEMES:
            return False
        hostname = parsed.hostname.lower() if parsed.hostname else ""
        path = parsed.path.lower()
        if not hostname:
            return False

        for page_host in PAGE_LIKE_HOSTNAMES:
            if hostname == page_host or hostname.endswith(f".{page_host}"):
                if hostname == "i.imgur.com":
                    break
                if hostname == "pixeldrain.com" and path.startswith("/f/"):
                    break
                app_logger.debug(f"URL classified as PAGE (known hostname): {url}")
                return False

        _, ext = os.path.splitext(path)
        if ext in DIRECT_LINK_EXTENSIONS:
            app_logger.debug(f"URL classified as DIRECT (extension match): {url}")
            return True

        app_logger.debug(f"URL classification UNDETERMINED (defaulting to PAGE): {url}")
        return False

    except Exception as e:
        app_logger.warning(f"Error in is_likely_direct_link check for {url}: {e}")
        return False


# --- Core Parsing Logic ---


def fetch_rentry_html(url, headers):
    """Fetches HTML content from a given URL."""
    app_logger.info(f"Fetching Rentry page: {url}")
    try:
        response = requests.get(url, headers=headers, timeout=45)
        app_logger.debug(
            f"Fetch status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}"
        )
        response.raise_for_status()
        content_type = response.headers.get("Content-Type", "").lower()
        if "text/html" not in content_type:
            app_logger.warning(f"Expected HTML from {url}, got {content_type}")
        app_logger.info(f"Rentry page fetched successfully: {url}")
        return response.text
    except requests.exceptions.Timeout:
        app_logger.error(f"Timeout occurred while fetching Rentry page {url}.")
        return None
    except requests.exceptions.RequestException as e:
        app_logger.error(f"Could not fetch Rentry page {url}. Exception: {e}")
        return None
    except Exception as e:
        app_logger.error(
            f"Unexpected error fetching Rentry page {url}: {e}", exc_info=True
        )
        return None


def parse_rentry_items(html_content, base_url):
    """
    Parses Rentry HTML using logic similar to parser_working.py.
    Returns dict: {category_name: [(original_name, url, base_filename), ...]} or None.
    """
    if not html_content:
        app_logger.error("parse_rentry_items called with no HTML content.")
        return None

    app_logger.info(f"Parsing Rentry HTML for categories and items from {base_url}...")
    try:
        soup = BeautifulSoup(html_content, "lxml")
        content_article = soup.find("article")
        if not content_article:
            app_logger.error("Could not find main <article> tag in HTML.")
            return None

        categorized_items = {}
        current_category = "Uncategorized"
        items_found_count = 0

        # --- Logic based on parser_working.py ---
        # Iterate through all relevant tags (headers and tables) in document order
        # This assumes headers define the category for subsequent tables until the next header
        for tag in content_article.find_all(["h3", "h4", "div", "table"]):
            # --- Category Handling ---
            if tag.name in ["h3", "h4"]:
                category_text = tag.get_text(strip=True)
                if category_text:
                    current_category = sanitize_filename(category_text)
                    if not current_category:
                        current_category = f"Category_{tag.name}"
                    app_logger.debug(
                        f"Switched to category: '{current_category}' (from '{category_text}')"
                    )
                    # Ensure category exists even if no items follow immediately
                    if current_category not in categorized_items:
                        categorized_items[current_category] = []

            # --- Table Handling ---
            table_to_process = None
            # Check if the tag itself is the table we want
            if tag.name == "table" and "ntable" in tag.get("class", []):
                # Avoid processing tables inside wrappers if the wrapper was already handled
                parent_wrapper = tag.find_parent("div", class_="ntable-wrapper")
                if not parent_wrapper:  # Process if it's a standalone table
                    table_to_process = tag
            # Check if it's the wrapper div containing the table
            elif tag.name == "div" and "ntable-wrapper" in tag.get("class", []):
                table_to_process = tag.find("table", class_="ntable")

            if table_to_process:
                tbody = table_to_process.find("tbody") or table_to_process
                for row in tbody.find_all("tr"):
                    cells = row.find_all("td")
                    if len(cells) >= 3:
                        name_tag = cells[0].find("strong") or cells[0]
                        original_name = (
                            name_tag.get_text(strip=True) if name_tag else None
                        )
                        if not original_name or original_name == "****":
                            continue

                        # Find ALL 'DL' links specifically in the 3rd cell
                        dl_links_in_cell = cells[2].find_all(
                            "a", string="DL", href=True
                        )

                        if dl_links_in_cell:
                            # Handle multiple DL links - take the first one found
                            if len(dl_links_in_cell) > 1:
                                app_logger.warning(
                                    f"Multiple 'DL' links found for item '{original_name}'. Using the first one: {dl_links_in_cell[0]['href']}"
                                )

                            dl_link_tag = dl_links_in_cell[0]  # Take the first link
                            href = dl_link_tag["href"].strip()
                            if href:
                                absolute_url = urljoin(base_url, href)
                                base_filename = sanitize_filename(original_name)
                                item_data = (original_name, absolute_url, base_filename)

                                # Ensure category exists before appending
                                if current_category not in categorized_items:
                                    categorized_items[current_category] = []
                                categorized_items[current_category].append(item_data)
                                items_found_count += 1
                                # app_logger.debug(f"  Added Item: Cat='{current_category}', Name='{original_name}'") # Verbose
                        # else:
                        #      app_logger.debug(f"Skipping row for '{original_name}': No 'DL' link found in 3rd cell.")

        # Cleanup empty categories
        empty_categories = [
            cat for cat, items in categorized_items.items() if not items
        ]
        for cat in empty_categories:
            del categorized_items[cat]
            app_logger.debug(f"Removed empty category: '{cat}'")

        if items_found_count == 0:
            app_logger.error(
                "Parsing finished, but no items were extracted. Check the page's HTML "
                "structure against the parser's assumptions (ntable rows with 'DL' links)."
            )
            return {}

        app_logger.info(
            f"Parsing complete. Found {items_found_count} items in {len(categorized_items)} categories."
        )
        return categorized_items

    except Exception as e:
        app_logger.error(f"Critical error during HTML parsing: {e}", exc_info=True)
        return None


# --- Main Execution / Testing ---
if __name__ == "__main__":
    print("--- Running parser.py in test mode ---")

    class MockConfig:  # Minimal config for logger testing
        def get(self, section, option, fallback=None):
            return fallback

        def getboolean(self, section, option, fallback=False):
            return fallback

    try:
        import logger as main_logger

        main_logger.setup_logging(MockConfig())
        print("[Parser Test] Logger setup complete.")
    except ImportError:
        print("[Parser Test] Warning: logger.py not found.")
    except Exception as log_e:
        print(f"[Parser Test] Warning: Error setting up logger: {log_e}")

    print("\n--- Testing sanitize_filename ---")
    test_names = [
        "Valid Name.zip",
        'Invalid<>:"/\\|?*Chars.rar',
        " Leading/Trailing Spaces ",
        "Lots   of   spaces.txt",
        "file/with/slashes.jpg",
        "very_long_filename_" + ("a" * 250) + ".ext",
        "",
        None,
        "Control\x08Chars.zip",
        ".hiddenfile",
        "name.with.dots.tar.gz",
        "con",
        "PRN.txt",
        "LPT1",
    ]
    for name in test_names:
        print(f"Original: '{name}' -> Sanitized: '{sanitize_filename(name)}'")

    print("\n--- Testing get_final_desired_filename ---")
    test_cases = [
        ("My Image", "photo.jpeg"),
        ("archive", "data.zip"),
        ("document.pdf", "document.pdf"),
        ("video.mp4", "different_video.mkv"),
        ("no_extension", "file.with.ext.png"),
        ("archive.zip", "archive.zip.part"),
        ("image", ""),
        ("", "actual_name.gif"),
        ("base_with.dots", "actual_name.dots.jpg"),
        ("base_with.dots.ext", "actual_name.dots.ext"),
    ]
    for desired, actual in test_cases:
        print(
            f"Desired: '{desired}', Actual: '{actual}' -> Final: '{get_final_desired_filename(desired, actual)}'"
        )

    print("\n--- Testing is_likely_direct_link ---")
    test_urls = [
        "https://example.com/files/archive.zip",
        "http://images.com/img.png",
        "https://rentry.org/somepage",
        "https://github.com/user/repo",
        "https://mega.nz/file/abc123xyz",
        "https://pixeldrain.com/u/XyZaBc",
        "https://pixeldrain.com/l/GhIjKl",
        "https://pixeldrain.com/f/123456",
        "https://i.imgur.com/image.jpeg",
        "https://imgur.com/gallery/abcde",
        "ftp://invalid.com/file.txt",
        "https://example.com/page?download=true",
        "https://example.com/short",
        "https://example.com/",
        "https://domain.with.dots.co.uk/path/file.pdf",
        "http://192.168.1.100/localfile.mkv",
    ]
    for url in test_urls:
        print(f"URL: '{url}' -> Likely Direct: {is_likely_direct_link(url)}")

    TEST_HTML_FILE = "Asset Store.htm"  # Use the correct file name
    TEST_BASE_URL = "https://rentry.co/CSP_764"  # Use .co as per canonical in file
    if os.path.exists(TEST_HTML_FILE):
        print(f"\n--- Loading test HTML from: {TEST_HTML_FILE} ---")
        try:
            with open(TEST_HTML_FILE, "r", encoding="utf-8") as f:
                test_html = f.read()
            print(f"HTML loaded ({len(test_html)} bytes). Parsing...")
            parsed_data = parse_rentry_items(test_html, TEST_BASE_URL)
            if parsed_data is not None:
                print("\n--- Parsed Data Summary (from test file) ---")
                total_items = sum(len(v) for v in parsed_data.values())
                print(
                    f"Parsing successful. Found {total_items} items in {len(parsed_data)} categories."
                )
                for category, items in parsed_data.items():
                    print(f"  Category: '{category}' ({len(items)} items)")
                    if items:
                        print(f"    Sample: {items[0][0]} -> {items[0][1]}")
                print("---------------------------------------------")
            else:
                print("\nParsing failed or returned no data from the test file.")
        except Exception as e:
            print(f"\nError during parser test: {e}", exc_info=True)
    else:
        print(f"\nTest HTML file '{TEST_HTML_FILE}' not found. Skipping parse test.")
    print("\n--- Parser Test Complete ---")
