
    	hsG                        d dl Z d dlmZmZ d dlZd dlZd dlZd dlZd dlm	Z	m
Z
mZ d dlZd dlmZ ddhZh dZh dZd	 Zd
 Zd Zd Zd Zedk    r ed            G d d          Z	 d dlZ ej         e                        ed           n1# e$ r  ed           Y ne$ rZ ede            Y dZ[ndZ[ww xY w ed           g dZeD ]Z  ede  d ee            d            ed           g dZ!e!D ]$\  Z"Z# ede" de# d ee"e#           d           % ed           g d Z$e$D ]Z% ed!e% d" ee%                      d#Z&d$Z'ej(        )                    e&          rW ed%e& d&           	  e*e&d'd()          5 Z+e+,                                Z-ddd           n# 1 swxY w Y    ed* e.e-           d+            ee-e'          Z/e/ ed,            e0d- e/1                                D                       Z2 ed.e2 d/ e.e/           d0           e/3                                D ]K\  Z4Z3 ed1e4 d2 e.e3           d3           e3r) ed4e3d          d           d5e3d          d6                     L ed7           n ed8           n1# e$ rZ5 ed9e5 d:;           Y dZ5[5ndZ5[5ww xY w ed<e& d=            ed>           dS dS )?    N)BeautifulSoupTag)urljoinurlparseunquote)
app_loggerhttphttps>;   .7z.gz.xz.aac.abr.avi.bmp.bz2.csp.dmg.doc.exe.fbx.gif.img.iso.jpg.kra.mkv.mov.mp3.mp4.msi.mtl.obj.ogg.otf.pdf.png.ppt.psd.rar.sai.sut.svg.tar.tif.tpl.ttf.txt.wav.wmv.xls.zip.clip.docx.flac.jpeg.pptx.sai2.tiff.webm.webp.woff.xlsx.blend.brush.woff2.unitypackage>&   x.combox.comitch.iomega.nzbooth.pmyoutu.be	boosty.to	fanbox.cc	gofile.io	imgur.com	rentry.co	vimeo.com
dlsite.com
github.com
gitlab.com
reddit.com
rentry.org
tumblr.comdropbox.comgumroad.compatreon.comtwitter.comyoutube.com1fichier.comfacebook.compastebin.comanonfiles.combitbucket.orginstagram.commediafire.comsendspace.comartstation.comzippyshare.comkrakenfiles.comdrive.google.comonedrive.live.comsubscribestar.adultpixeldrain.comc                    | s#dt          t          j                               S h d}t          j        dd|           }t          j        dd|          }|                    d          }t
          j                            |          \  }}|                                |v rd|z   }|st          j        dd|           }t          j        dd|                              d          }|s#dt          t          j                               S t
          j                            |          \  }}|                                |v rd|z   }d}t          |          |k    rrt
          j                            |          \  }}t          |          d	k     r|d
|t          |          z
           |z   }n
|d
|         }|                    d          }|r|n"dt          t          j                               S )zCRemoves or replaces characters invalid for Windows/Linux filenames.	download_>   AUXCONNULPRNCOM1COM2COM3COM4COM5COM6COM7COM8COM9LPT1LPT2LPT3LPT4LPT5LPT6LPT7LPT8LPT9z[<>:"/\\|?*\x00-\x1F]_z\s+z._ z	[^\w\-. ]   
   N)
inttimeresubstripospathsplitextupperlen)namereserved_names	sanitizedbase_name_checkr   max_lenbaseexts           0c:\Users\brownies\Downloads\rentrytest\parser.pysanitize_filenamer   |   s    .-3ty{{++---  N0 /d;;IvsI..I&&I)))44OQ.00)O	 (F<d33	F63	2288??	 	21s49;;//111W--i88  ""n44iIG
9~~G$$Y//	cs88b==1wS112S8II!(7(+IOOE**	!E99'E3ty{{3C3C'E'EE    c                 \   | rt          |           nd}|s+|r|dz   n#dt          t          j                               dS t          j                            |          \  }}|                                }h d}||v rd}d}|st          |          S t          j                            |          \  }}|                                }|r||k    r|S |r@|r>||k    r8t          j                            |          \  }	}|	s|}	t          |	|z             S |s|rt          ||z             S |S )zJConstructs the final filename, preferring desired base + actual extension. z.unknownrm   >   .tmp.part	.download.crdownload)r   r   r   r   r   r   lower)
desired_baseactual_filenamesanitized_baser   
actual_extactual_ext_lowertemp_extensionsdesired_extdesired_ext_lowerbase_without_exts
             r   get_final_desired_filenamer      s   8DL&|444"N 
 8NZ''7S--777	
 G$$_55MAz!''))CCCO?**
 2 111W%%n55NA{#))++  .2BBB  
- 
2CGW2W2W g..~>>! 	.- z)
 
 	

  
!1 
 Z'
 
 	

 r   c                    	 t          |           }|j        t          vrdS |j        r|j                                        nd}|j                                        }|sdS t          D ]_}||k    s|                    d|           r?|dk    r n8|dk    r|                    d          r nt          j
        d|              dS `t          j                            |          \  }}|t          v rt          j
        d|             d	S t          j
        d
|             dS # t          $ r%}t          j        d|  d|            Y d}~dS d}~ww xY w)zCHeuristic to guess if a URL points directly to a downloadable file.Fr   .zi.imgur.comrk   z/f/z)URL classified as PAGE (known hostname): z,URL classified as DIRECT (extension match): Tz6URL classification UNDETERMINED (defaulting to PAGE): z)Error in is_likely_direct_link check for : N)r   schemeVALID_SCHEMEShostnamer   r   PAGE_LIKE_HOSTNAMESendswith
startswithr   debugr   r   DIRECT_LINK_EXTENSIONS	Exceptionwarning)urlparsedr   r   	page_hostr   r   es           r   is_likely_direct_linkr      s   #}--5.4oE6?((***2{  "" 	5, 	 	I9$$(9(9/i//(J(J$},,E///DOOE4J4J/E !RS!R!RSSSuu % !!$''3(((QCQQRRR4WRUWWXXXu   QsQQaQQRRRuuuuus0   D$ =D$  A$D$ AD$ D$ $
E.EEc                    t          j        d|             	 t          j        | |d          }t          j        d|j         d|j                            d                      |                                 |j                            dd                                          }d|vrt          j	        d	|  d
|            t          j        d|             |j
        S # t          j        j        $ r t          j        d|  d           Y dS t          j        j        $ r%}t          j        d|  d|            Y d}~dS d}~wt          $ r'}t          j        d|  d| d           Y d}~dS d}~ww xY w)z&Fetches HTML content from a given URL.zFetching Rentry page: -   )headerstimeoutzFetch status: z, Content-Type: zContent-Typer   z	text/htmlzExpected HTML from z, got z"Rentry page fetched successfully: z,Timeout occurred while fetching Rentry page r   NzCould not fetch Rentry page z. Exception: z&Unexpected error fetching Rentry page r   Texc_info)r   inforequestsgetr   status_coder   raise_for_statusr   r   text
exceptionsTimeouterrorRequestExceptionr   )r   r   responsecontent_typer   s        r   fetch_rentry_htmlr   
  s   O2S22333<WbAAAiX1ii8CSCWCWXfCgCgii	
 	
 	
 	!!###'++NB??EEGGl**NSNNNNOOOBSBBCCC}&   NNNNOOOtt/   MMM!MMNNNttttt   ?S??A??$	
 	
 	
 	
 ttttt	s*   C
C$ $,E:E:&EE:E55E:c           	         | st          j        d           dS t          j        d| d           	 t          | d          }|                    d          }|st          j        d           dS i }d}d	}|                    g d
          D ]K}|j        dv rW|                    d          }|r?t          |          }|s
d|j         }t          j	        d| d| d           ||vrg ||<   d}	|j        dk    r4d|
                    dg           v r|                    dd          }
|
s|}	n:|j        dk    r/d|
                    dg           v r|                    dd          }	|	rj|	                    d          p|	}|                    d          D ]<}|                    d          }t          |          dk    r|d	                             d          p|d	         }|r|                    d          nd}|r|dk    rr|d                             dd d!          }|rt          |          d"k    r&t          j        d#| d$|d	         d%                     |d	         }|d%                                         }|rMt          ||          }t          |          }|||f}||vrg ||<   ||                             |           |d"z  }>Md& |                                D             }|D ]}||= t          j	        d'| d(           |d	k    rt          j        d)           i S t          j        d*| d+t          |           d,           |S # t$          $ r$}t          j        d-| d.           Y d}~dS d}~ww xY w)/z
    Parses Rentry HTML using logic similar to parser_working.py.
    Returns dict: {category_name: [(original_name, url, base_filename), ...]} or None.
    z/parse_rentry_items called with no HTML content.Nz2Parsing Rentry HTML for categories and items from z...lxmlarticlez*Could not find main <article> tag in HTML.Uncategorizedr   )h3h4divtable)r   r   T)r   	Category_zSwitched to category: 'z	' (from 'z')r   ntableclassr   zntable-wrapper)class_tbodytrtd   strongz****   aDL)stringhref   z$Multiple 'DL' links found for item 'z'. Using the first one: r   c                     g | ]	\  }}||
S  r   ).0catitemss      r   
<listcomp>z&parse_rentry_items.<locals>.<listcomp>  s1     
 
 
C5

 
 
r   zRemoved empty category: ''zdParsing finished, but no items extracted. Check HTML structure ('Asset Store.htm') and parser logic.zParsing complete. Found 
 items in  categories.z$Critical error during HTML parsing: r   )r   r   r   r   findfind_allr   get_textr   r   r   find_parentr   r   r   r   appendr   r   )html_contentbase_urlsoupcontent_articlecategorized_itemscurrent_categoryitems_found_counttagcategory_texttable_to_processparent_wrapperr   rowcellsname_tagoriginal_namedl_links_in_celldl_link_tagr   absolute_urlbase_filename	item_dataempty_categoriesr   r   s                            r   parse_rentry_itemsr  %  s   
  JKKKtOVVVVWWWf\622))I.. 	IJJJ4*
 #++,H,H,HII >	7 >	7Cx<'' #4 8 8  	A'8'G'G$+ B+Asx+A+A($^2B^^]^^^   (/@@@>@)*:;  $x7""x3777B3G3G'G'G!$?O!P!P% +'*$U""'73777B;O;O'O'O#&88GH8#E#E  #7(--g66J:J >>$// !7 !7CLL..E5zzQ#(8==#:#:#FeAh=EOH--D-9994 &  - %0G0G$ ,18+<+<4 ,= , ,( , 7"#344q88 * 2 %P=  %P  %P  sC  DE  sF  GM  sN  %P  %P!" !" !" +;1*=K#.v#6#<#<#>#>D# 	7/6x/F/F0A-0P0P-:L-,X	 $4;L#L#LJL$56F$G 12B C J J9 U U U 1Q 6 1
 
"3"9"9";";
 
 
 $ 	A 	AC!#&????@@@@!!v   Ih'8hhCHYDZDZhhh	
 	
 	
 !    CCCdSSSSttttts$   ;M /KM 3)M 
N'NN__main__z&--- Running parser.py in test mode ---c                       e Zd ZddZddZdS )
MockConfigNc                     |S Nr   selfsectionoptionfallbacks       r   r   zMockConfig.get      Or   Fc                     |S r  r   r  s       r   
getbooleanzMockConfig.getboolean  r  r   r  )F)__name__
__module____qualname__r   r  r   r   r   r  r    s<        	 	 	 		 	 	 	 	 	r   r  z$[Parser Test] Logger setup complete.z+[Parser Test] Warning: logger.py not found.z0[Parser Test] Warning: Error setting up logger: z"
--- Testing sanitize_filename ---)zValid Name.zipzInvalid<>:"/\|?*Chars.rarz Leading/Trailing Spaces zLots   of   spaces.txtzfile/with/slashes.jpga  very_long_filename_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.extr   NzControlChars.zipz.hiddenfilezname.with.dots.tar.gzconzPRN.txtr{   zOriginal: 'z' -> Sanitized: 'r   z+
--- Testing get_final_desired_filename ---)
)zMy Imagez
photo.jpeg)archivezdata.zip)document.pdfr  )z	video.mp4zdifferent_video.mkv)no_extensionzfile.with.ext.png)zarchive.zipzarchive.zip.part)imager   )r   zactual_name.gif)zbase_with.dotszactual_name.dots.jpg)zbase_with.dots.extzactual_name.dots.extz
Desired: 'z', Actual: 'z' -> Final: 'z&
--- Testing is_likely_direct_link ---)z%https://example.com/files/archive.zipzhttp://images.com/img.pngzhttps://rentry.org/somepagezhttps://github.com/user/repozhttps://mega.nz/file/abc123xyzzhttps://pixeldrain.com/u/XyZaBczhttps://pixeldrain.com/l/GhIjKlzhttps://pixeldrain.com/f/123456zhttps://i.imgur.com/image.jpegzhttps://imgur.com/gallery/abcdezftp://invalid.com/file.txtz&https://example.com/page?download=truezhttps://example.com/shortzhttps://example.com/z,https://domain.with.dots.co.uk/path/file.pdfz"http://192.168.1.100/localfile.mkvzURL: 'z' -> Likely Direct: zAsset Store.htmzhttps://rentry.co/CSP_764z
--- Loading test HTML from: z ---rzutf-8)encodingzHTML loaded (z bytes). Parsing...z-
--- Parsed Data Summary (from test file) ---c              #   4   K   | ]}t          |          V  d S r  )r   )r   vs     r   	<genexpr>r%    s(      !G!GQ#a&&!G!G!G!G!G!Gr   zParsing successful. Found r   r   z  Category: 'z' (z items)z    Sample: z -> r   z----------------------------------------------z7
Parsing failed or returned no data from the test file.z
Error during parser test: Tr   z
Test HTML file 'z!' not found. Skipping parse test.z
--- Parser Test Complete ---)6r   bs4r   r   r   r   r   sysurllib.parser   r   r   r   loggerr   r   r   r   r   r   r   r   r  r  printr  main_loggersetup_loggingImportErrorr   log_e
test_namesr   
test_casesdesiredactual	test_urlsr   TEST_HTML_FILETEST_BASE_URLr   existsopenfread	test_htmlr   parsed_datasumvaluestotal_itemsr   categoryr   r   r   r   <module>r@     s    " " " " " " " " 				 				  



 3 3 3 3 3 3 3 3 3 3        !< < < |' ' ' Z6F 6F 6Fr. . .b  H  6p p ph z	E
2333       J$$$$!!**,,///45555 = = =;<<<<< J J JHHHIIIIIIIIJ 
E
/000  J   O OMDMM3D3DT3J3JMMMNNNN	E
8999  J & 
 
qqqfqqC]C]^egmCnCnqqq	
 	
 	
 	
 
E
3444  I$  N NLsLL0E0Ec0J0JLLMMMM&N/M	w~~n%% VC~CCCDDD	EncG<<< %FFHH	% % % % % % % % % % % % % % %EE##i..EEEFFF,,YFFK&FGGG!c!G!G+2D2D2F2F!G!G!GGGfffKHXHXfff   (3'8'8':': M MOHeEJ(JJss5zzJJJKKK MKU1Xa[KKeAhqkKKLLLEFFFFPQQQ 	E 	E 	EE444tDDDDDDDDD	E 	T>TTTUUU	E
*+++++U s[   ,'B C&C*B==CJ, F6*J, 6F::J, =F:>C-J, ,K1KK