import re
import json
import traceback
from pathlib import Path

from bs4 import BeautifulSoup
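
# Compares asset entries in an exported HTML table against files already on
# disk and writes the apparently missing items to a JSON report.
# Flow: parse table rows -> sanitize names -> scan download dir -> diff -> report.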

# --- Configuration ---
HTML_FILE = Path("Asset Store.htm")  # Exported HTML table of assets
DOWNLOAD_BASE_DIR = Path("CSP_764")  # VERIFY THIS PATH
OUTPUT_FILE = Path("missing_files_report.json")  # JSON report of missing items

# --- Column indices (verified against Asset Store.htm) ---
NAME_COLUMN_INDEX = 0  # "Name" column
DL_COLUMN_INDEX = 2  # "DL" (download link) column
# ----------------------------------------------------------

# Known asset extensions. Currently informational only: the comparison below
# matches on base names (stems), not extensions.
COMMON_EXTENSIONS = [
    ".zip",
    ".rar",
    ".sut",
    ".clip",
    ".7z",
    ".png",
    ".jpg",
    ".jpeg",
    ".txt",
    ".unknown",
    ".mp4",
    ".webm",
    ".pdf",
    ".unitypackage",
    ".brushset",
    ".abr",
    ".tpl",
    ".ai",
]


# --- Sanitization Function (Crucial: Must match downloader's logic) ---
def sanitize_filename(name: str) -> str:
    """Remove or replace characters that are invalid in filenames.

    This must mirror the downloader's sanitization exactly, otherwise names
    derived from the HTML will never match files on disk.
    """
    if not name:
        return "_unknown_"
    sanitized = re.sub(r'[\\/*?:"<>|]', "", name)  # drop characters invalid on Windows
    sanitized = re.sub(r"\s+", "_", sanitized)  # collapse whitespace runs to underscores
    sanitized = sanitized.strip("._")  # trim leading/trailing dots and underscores
    if not sanitized:
        return "_unknown_"
    return sanitized
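
# Example transformations (illustrative):
#   sanitize_filename("My Brush: Set v2.0") -> "My_Brush_Set_v2.0"
#   sanitize_filename("???")                -> "_unknown_"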


# --- Helper Functions ---


def extract_items_from_html(html_file: Path) -> list[tuple[str, str]]:
    """
    Parses the HTML file (assuming a table structure) to find asset names
    from the 'Name' column and their corresponding download links from the 'DL' column.
    Returns a list of tuples: (extracted_name, download_url)
    """
    items = []
    print(f"--- Parsing HTML file: {html_file.resolve()} ---")
    if not html_file.is_file():
        print(f"Error: HTML file not found: {html_file.resolve()}")
        return items

    try:
        with open(html_file, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "html.parser")

        rows = soup.find_all("tr")
        print(f"Found {len(rows)} table rows (<tr>).")
        if not rows:
            print(
                "Warning: No table rows found. Cannot extract items based on table structure."
            )
            return items

        processed_hrefs = set()
        max_expected_index = max(NAME_COLUMN_INDEX, DL_COLUMN_INDEX)

        for i, row in enumerate(rows):
            cells = row.find_all("td")
            if len(cells) <= max_expected_index:
                continue

            # Indexing is safe here: row length was validated above.
            asset_name = cells[NAME_COLUMN_INDEX].get_text(strip=True)

            dl_href = None
            link_tag = cells[DL_COLUMN_INDEX].find("a", href=True)
            if link_tag:
                temp_href = link_tag["href"].strip()
                # Ignore in-page anchors and javascript pseudo-links.
                if temp_href and not temp_href.startswith(("#", "javascript:")):
                    dl_href = temp_href

            if asset_name and dl_href and dl_href not in processed_hrefs:
                if asset_name == dl_href:
                    print(
                        f"Warning: Row {i}, Name is the same as DL href, skipping: {dl_href}"
                    )
                    continue
                items.append((asset_name, dl_href))
                processed_hrefs.add(dl_href)

    except FileNotFoundError:
        print(f"Error: HTML file not found during open: {html_file.resolve()}")
        return []
    except Exception as e:
        print(f"Error parsing HTML file {html_file}: {e}")
        traceback.print_exc()

    print(
        f"Extracted {len(items)} potential asset items (Name/DL link pairs) from table rows."
    )
    print("--- Finished parsing HTML ---")
    return items
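
# Assumed row layout, based on the column indices above (illustrative only):
#   <tr><td>Asset Name</td><td>...</td><td><a href="https://...">DL</a></td></tr>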


def scan_download_directory(base_dir: Path) -> set[str]:
    """
    Recursively scans the download directory and returns a set of
    base filenames (without extensions) found.
    """
    existing_base_filenames = set()
    print(f"--- Scanning download directory: {base_dir.resolve()} ---")
    if not base_dir.is_dir():
        print(f"Error: Download base directory not found: {base_dir.resolve()}")
        return existing_base_filenames

    file_count = 0
    # File types that are never assets: logs, partial downloads, and this
    # script's own inputs/outputs. Compared case-insensitively.
    excluded_suffixes = {
        ".txt",
        ".log",
        ".crdownload",
        ".part",
        ".tmp",
        ".py",
        ".htm",
        ".html",
        ".json",
    }
    for item_path in base_dir.rglob("*"):
        if item_path.is_file():
            if item_path.suffix.lower() in excluded_suffixes:
                continue
            file_count += 1
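            # Note: Path.stem strips only the final suffix, so "pack.tar.gz"
            # yields "pack.tar" and would not match a sanitized name of "pack".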
            base_name = item_path.stem
            if base_name:
                existing_base_filenames.add(base_name)

    print(f"Scanned {file_count} potential asset files.")
    print(f"Found {len(existing_base_filenames)} unique base filenames in {base_dir}.")
    print(f"--- Finished scanning directory ---")
    return existing_base_filenames


# --- Main Script Execution ---
if __name__ == "__main__":
    print("--- Starting Comparison (HTML vs Disk Files) ---")

    html_items = extract_items_from_html(HTML_FILE)
    if not html_items:
        print("No potential items extracted from HTML. Cannot perform comparison.")
        raise SystemExit(1)

    existing_files_base_set = scan_download_directory(DOWNLOAD_BASE_DIR)
    if not existing_files_base_set:
        print(
            f"Warning: No files found in {DOWNLOAD_BASE_DIR}. All HTML items will be marked as missing."
        )

    missing_items_details = []
    found_count = 0
    checked_sanitized_names = set()

    print("\n--- Checking HTML items against files on disk ---")
    for original_name, dl_href in html_items:
        sanitized_name = sanitize_filename(original_name)
        if (
            not sanitized_name
            or sanitized_name == "_unknown_"
            or sanitized_name in checked_sanitized_names
        ):
            continue
        checked_sanitized_names.add(sanitized_name)
        is_found = sanitized_name in existing_files_base_set
        if is_found:
            found_count += 1
        else:
            missing_items_details.append(
                {
                    "name_html": original_name,
                    "sanitized_name": sanitized_name,
                    "download_url": dl_href,
                }
            )  # Store as dictionary for JSON

    # Sort the list of missing item dictionaries by 'name_html'
    missing_items_details.sort(key=lambda item: item["name_html"].lower())

    # --- Prepare data for JSON output ---
    total_items_checked = len(checked_sanitized_names)
    missing_count = len(missing_items_details)

    output_data = {
        "report_summary": {
            "total_items_checked_html": total_items_checked,
            "items_found_on_disk": found_count,
            "items_missing_on_disk": missing_count,
            "source_html_file": str(HTML_FILE.resolve()),
            "scanned_directory": str(DOWNLOAD_BASE_DIR.resolve()),
        },
        "missing_items": missing_items_details,  # Already a list of dictionaries
    }
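
    # Illustrative report shape (example values, not real output):
    # {
    #     "report_summary": {"total_items_checked_html": 120, ...},
    #     "missing_items": [{"name_html": "...", "sanitized_name": "...", "download_url": "..."}]
    # }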

    # --- Report summary to console ---
    print(f"\n--- Comparison Summary ---")
    print(f"Total unique sanitized names checked from HTML: {total_items_checked}")
    print(f"Items FOUND as files on disk (based on base name): {found_count}")
    print(
        f"Items from HTML potentially MISSING on disk: {missing_count} out of {total_items_checked}"
    )

    # --- Write JSON data to the output file ---
    try:
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            # indent=4 pretty-prints; ensure_ascii=False keeps non-ASCII names readable
            json.dump(output_data, f, indent=4, ensure_ascii=False)
        print(
            f"Missing files report written in JSON format to: {OUTPUT_FILE.resolve()}"
        )
    except Exception as e:
        print(f"Error writing JSON output file {OUTPUT_FILE}: {e}")

    print("--- Comparison Finished ---")
