From 509ef2a164a299f10d8179ed963860db3b390740 Mon Sep 17 00:00:00 2001
From: Simatwa Caleb
Date: Sat, 3 Jun 2023 16:08:45 +0300
Subject: [PATCH 1/9] Make a package

---
 .gitignore              |  4 ++
 README.md               | 81 +++++++++++++++++++++++++++++++++++++++--
 install.sh              |  3 ++
 setup.py                | 33 +++++++++++++++++
 wclone/__init__.py      |  5 +++
 wclone/__main__.py      |  2 +
 app.py => wclone/app.py | 77 ++++++++++++++++++++++++++-------------
 7 files changed, 175 insertions(+), 30 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 install.sh
 create mode 100644 setup.py
 create mode 100644 wclone/__init__.py
 create mode 100644 wclone/__main__.py
 rename app.py => wclone/app.py (77%)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3933979
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+wclone.egg-info
+build
+*__pycache__
+.vscode
diff --git a/README.md b/README.md
index 6657538..022b76a 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,86 @@
 # Basic website cloner
 
 **Description:**
+
 * This script is a basic website cloner that downloads all the files from a website and saves them in a folder.
 * It works by scraping all the links on a page, saving the linked files locally, and replacing the links with their local paths.
 
-**Usage:**
-```bash
-python3 website_cloner.py
+## Installation
+
+### From [PyPI](https://pypi.org)
+
+```sh
+pip3 install wclone
+```
+
+### From source
+
+```sh
+git clone https://github.com/Simatwa/website-cloner.git
+cd website-cloner
+bash install.sh
+```
+
+## Usage
+
+```sh
+$ wclone <host>
 ```
 
 **More info:**
-* In order to use the tor network, you need to download the [tor package](https://www.torproject.org/dist/torbrowser/11.0.9/tor-win32-0.4.6.10.zip) and start the tor service.
\ No newline at end of file
+
+* To use the Tor network, you need to download the [Tor package](https://www.torproject.org/dist/torbrowser/11.0.9/tor-win32-0.4.6.10.zip) and start the Tor service.
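+
+> An illustrative run routed through Tor (assumes a Tor service listening on the
+> default `localhost:9050`; `example.com` is a placeholder host):
+
+```sh
+$ wclone example.com --use-tor
+```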
+
+<details>
+
+<summary>
+
+> For further info run `$ wclone --help`
+
+</summary>
+
+```
+usage: wclone [-h] [-v]
+              [-tp PROXY]
+              [-o FOLDER]
+              [-w PATH]
+              [--use-tor]
+              host
+
+Basic website cloner written in Python
+
+positional arguments:
+  host        URL pointing to a
+              website
+
+options:
+  -h, --help
+              show this help
+              message and exit
+  -v, --version
+              show program's
+              version number and
+              exit
+  -tp PROXY, --tor-proxy PROXY
+              Proxy server url
+              without schema -
+              localhost:9050
+  -o FOLDER, --output FOLDER
+              Folder for saving
+              contents - host
+  -w PATH, --workspace PATH
+              Directory for saving
+              contents -
+              /storage/emulated/0/
+              git/Smartwa/
+              website-cloner
+  --use-tor   Use tor proxy -
+              False
+```
+
+</details>
+ + + + + diff --git a/install.sh b/install.sh new file mode 100644 index 0000000..a627af0 --- /dev/null +++ b/install.sh @@ -0,0 +1,3 @@ +#!/usr/bin/bash +pip3 install . +wclone -h \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..2ac67be --- /dev/null +++ b/setup.py @@ -0,0 +1,33 @@ +from setuptools import setup +from wclone import __version__,__repo__,__maintainer__,__author__,__description__ + +setup( + name="wclone", + version=__version__, + license="MIT License", + author=__author__, + maintainer=__maintainer__, + maintainer_email="smartwacaleb@gmail.com", + description=__description__, + packages=["wclone"], + url=__repo__, + project_urls={"Bug Report": f"{__repo__}/issues/new"}, + install_requires=["bs4", "requests"], + long_description=open("README.md", encoding="utf-8").read(), + long_description_content_type="text/markdown", + python_requires=">=3.8", + classifiers=[ + "License :: OSI Approved :: MIT License", + "Intended Audience :: Developers", + "Topic :: Software Development :: Libraries :: Python Modules", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + ], + entry_points = { + "console_scripts" : [ + ("wclone = wclone.app:main"), + ] + } +) \ No newline at end of file diff --git a/wclone/__init__.py b/wclone/__init__.py new file mode 100644 index 0000000..458d0e7 --- /dev/null +++ b/wclone/__init__.py @@ -0,0 +1,5 @@ +__version__="1.0.0" +__maintainer__="Simatwa" +__author__="ZKAW" +__description__="Basic website cloner written in Python" +__repo__=f"https://github.com/{__author__}/website-cloner" \ No newline at end of file diff --git a/wclone/__main__.py b/wclone/__main__.py new file mode 100644 index 0000000..40d7ebf --- /dev/null +++ b/wclone/__main__.py @@ -0,0 +1,2 @@ +from .app import main +main() \ No newline at end of file diff --git a/app.py b/wclone/app.py similarity index 77% rename from app.py rename to wclone/app.py index 03284be..715bce0 100644 --- a/app.py +++ b/wclone/app.py @@ -1,30 +1,15 @@ +#!/usr/bin/python3 import requests import functools import shutil import codecs import sys import os +import logging from bs4 import BeautifulSoup from urllib.parse import urljoin, urlparse - -# URL of the web page you want to extract data from -url = "https://google.com" -use_tor_network = False - - -if len(sys.argv) > 1: url = sys.argv[1] -output_folder = urlparse(url).netloc - -# initialize a session -session = requests.session() -if use_tor_network: - session.request = functools.partial(session.request, timeout=30) - session.proxies = {'http': 'socks5h://localhost:9050', - 'https': 'socks5h://localhost:9050'} - -# define workspace from script location -workspace = os.path.dirname(os.path.realpath(__file__)) +from . 
import __version__,__description__
 
 class Extractor:
     def __init__(self, url):
@@ -42,7 +27,8 @@ def get_page_content(self, url):
             content = session.get(url)
             content.encoding = 'utf-8'
             return content.text
-        except: return None
+        except Exception as e:
+            sys.exit(logging.critical(get_excep(e)))
 
     # get the script files
     def scrap_scripts(self):
@@ -203,7 +189,7 @@ def download_file(self, url, output_path):
         response = session.get(url)
         with open(output_path, "wb") as file:
             file.write(response.content)
-        print(f"Downloaded {file_name} to {os.path.relpath(output_path)}")
+        logging.info(f"Downloaded {file_name} to {os.path.relpath(output_path)}")
 
         return True
 
@@ -224,12 +210,51 @@ def save_html(self):
         with codecs.open(output_path, 'w', 'utf-8') as file:
             file.write(prettyHTML)
             file.close()
-        print(f"Saved index.html to {os.path.relpath(output_path)}")
+        logging.info(f"Saved index.html to {os.path.relpath(output_path)}")
 
         return True
 
-extractor = Extractor(url)
-
-print(f"Extracting files from {url}\n")
-extractor.run()
-print(f"\nTotal extracted files: {len(extractor.scraped_urls)}")
\ No newline at end of file
+def main():
+    import argparse
+    parser = argparse.ArgumentParser(description=__description__)
+    parser.add_argument('host',help="URL pointing to a website")
+    parser.add_argument('-v','--version',action="version",version=f"%(prog)s {__version__}")
+    parser.add_argument("-tp","--tor-proxy",metavar="PROXY",help="Proxy server url without schema - %(default)s",default="localhost:9050")
+    parser.add_argument("-o","--output",metavar="FOLDER",help="Folder for saving contents - host")
+    parser.add_argument("-w","--workspace",metavar="PATH",help="Directory for saving contents - %(default)s",default=os.getcwd())
+    parser.add_argument('--use-tor',action='store_true',help="Use tor proxy - %(default)s")
+    args = parser.parse_args()
+
+    global url,workspace,get_excep,output_folder,session
+
+    logging.basicConfig(format="[%(asctime)s : %(levelname)s] - %(message)s",datefmt="%H:%M:%S",level=logging.INFO)
+    # URL of the web page you want to extract data from
+    url = args.host
+    if not url.startswith('http'):
+        url="https://"+url
+
+    #define workspace from script location
+    workspace = args.workspace
+
+    get_excep = lambda e: e.args[1] if len(e.args)>1 else e
+
+    output_folder = args.output or urlparse(url).netloc
+    # initialize a session
+    session = requests.session()
+    if args.use_tor:
+        session.request = functools.partial(session.request, timeout=30)
+        session.proxies = {'http': f'socks5h://{args.tor_proxy}','https': f'socks5h://{args.tor_proxy}'}
+
+    try:
+        logging.info(f"Extracting files from '{url}'")
+        extractor = Extractor(url)
+        ectractor.run()
+        logging.info(f"\nTotal extracted files: {len(extractor.scraped_urls)}")
+    except (KeyboardInterrupt,EOFError) as e:
+        logging.warning(f'^{e} - Exitting"')
+    except Exception as e:
+        #logging.exception(e)
+        logging.critical(get_excep(e))
+
+if __name__=="__main__":
+    main()
\ No newline at end of file

From 524e7eef6e32385ecf3fa05cee41ca5ca3cfe90a Mon Sep 17 00:00:00 2001
From: Simatwa Caleb
Date: Sat, 3 Jun 2023 16:17:38 +0300
Subject: [PATCH 2/9] Make a package

---
 README.md     | 47 ++++++++++++++++++++---------------------------
 wclone/app.py |  2 +-
 2 files changed, 21 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index 022b76a..f032434 100644
--- a/README.md
+++ b/README.md
@@ -39,44 +39,37 @@ $ wclone
 ```
-usage: wclone [-h] [-v]
-              [-tp PROXY]
-              [-o FOLDER]
-              [-w PATH]
+usage: wclone [-h] [-v] [-tp PROXY]
+              [-o FOLDER] [-w PATH]
               [--use-tor]
               host
 
 Basic website cloner written in Python
 
 positional arguments:
   host        URL pointing to a
               website
 
 options:
   -h, --help
               show this help
               message and exit
   -v, --version
               show program's
               version number and
               exit
   -tp PROXY, --tor-proxy PROXY
               Proxy server url
               without schema -
               localhost:9050
   -o FOLDER, --output FOLDER
               Folder for saving
               contents - host
   -w PATH, --workspace PATH
               Directory for saving
               contents -
-              /storage/emulated/0/
-              git/Smartwa/
-              website-cloner
+              /data/data/
+              com.termux/files/
+              home/temp
   --use-tor   Use tor proxy -
               False
 ```
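+
+> e.g. an illustrative run saving into a custom folder and workspace
+> (`example.com`, `my-clone` and `~/clones` are placeholder values):
+
+```sh
+$ wclone example.com -o my-clone -w ~/clones
+```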
diff --git a/wclone/app.py b/wclone/app.py
index 715bce0..47772af 100644
--- a/wclone/app.py
+++ b/wclone/app.py
@@ -248,7 +248,7 @@ def main():
     try:
         logging.info(f"Extracting files from '{url}'")
         extractor = Extractor(url)
-        ectractor.run()
+        extractor.run()
         logging.info(f"\nTotal extracted files: {len(extractor.scraped_urls)}")
     except (KeyboardInterrupt,EOFError) as e:
         logging.warning(f'^{e} - Exitting"')

From 172eabb14e6f40f392e11cf73c38d8226352c20f Mon Sep 17 00:00:00 2001
From: Simatwa Caleb
Date: Fri, 9 Jun 2023 17:36:40 +0300
Subject: [PATCH 3/9] Request using cloudscraper

---
 dist/wclone-1.0.0.tar.gz | Bin 0 -> 4813 bytes
 setup.py                 |   2 +-
 wclone/__init__.py       |  10 +-
 wclone/__main__.py       |   3 +-
 wclone/app.py            | 314 +++++++++++++++++++++++++--------------
 5 files changed, 210 insertions(+), 119 deletions(-)
 create mode 100644 dist/wclone-1.0.0.tar.gz

diff --git a/dist/wclone-1.0.0.tar.gz b/dist/wclone-1.0.0.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..4072134aca57ab2b5c3fc192aa0bbeae6e35c761
GIT binary patch
literal 4813
[base85 binary data elided; the setup.py, wclone/__init__.py and wclone/__main__.py hunks of this patch are not recoverable from this copy]

diff --git a/wclone/app.py b/wclone/app.py
--- a/wclone/app.py
+++ b/wclone/app.py
@@ ... @@ def main():
-    get_excep = lambda e: e.args[1] if len(e.args)>1 else e
-
-    output_folder = args.output or urlparse(url).netloc
-    # initialize a session
-    session = requests.session()
-    if args.use_tor:
-        session.request = functools.partial(session.request, timeout=30)
-        session.proxies = {'http': f'socks5h://{args.tor_proxy}','https': f'socks5h://{args.tor_proxy}'}
-
-    try:
-        logging.info(f"Extracting files from '{url}'")
-        extractor = Extractor(url)
-        extractor.run()
-        logging.info(f"\nTotal extracted files: {len(extractor.scraped_urls)}")
-    except (KeyboardInterrupt,EOFError) as e:
-        logging.warning(f'^{e} - Exitting"')
-    except Exception as e:
-        #logging.exception(e)
-        logging.critical(get_excep(e))
-
-if __name__=="__main__":
-    main()
\ No newline at end of file
+    import argparse
+
+    parser = argparse.ArgumentParser(description=__description__)
+    parser.add_argument("host", help="URL pointing to a website")
+    parser.add_argument(
+        "-v", "--version", action="version", version=f"%(prog)s {__version__}"
+    )
+    parser.add_argument(
+        "-tp",
+        "--tor-proxy",
+        metavar="PROXY",
+        help="Proxy server url without schema - %(default)s",
+        default="localhost:9050",
+    )
+    parser.add_argument(
+        "-o", "--output", metavar="FOLDER", help="Folder for saving contents - host"
+    )
+    parser.add_argument(
+        "--headers", help="Path to .json file containing request headers"
+    )
+    parser.add_argument(
+        "--browser",
+        help="Browser name to be used - %(default)s",
+        choices=browsers,
+        default="firefox",
+        metavar="|".join(browsers),
+    )
+    parser.add_argument(
+        "--platform",
+        help="OS name to be used - %(default)s",
+        choices=platforms,
+        default="linux",
+        metavar="|".join(platforms),
+    )
+    parser.add_argument(
+        "-w",
+        "--workspace",
+        metavar="PATH",
+        help="Directory for saving contents - %(default)s",
+        default=os.getcwd(),
+    )
+    parser.add_argument(
+        "--use-tor", action="store_true", help="Use tor proxy - %(default)s"
+    )
+    args = parser.parse_args()
+
+    global url, workspace, get_excep, output_folder, session
+
+    logging.basicConfig(
+        format="[%(asctime)s : %(levelname)s] - %(message)s",
+        datefmt="%H:%M:%S",
+        level=logging.INFO,
+    )
+    # URL of the web page you want to extract data from
+    url = args.host
+    if not url.startswith("http"):
+        url = "https://" + url
+
+    # define workspace from script location
+    workspace = args.workspace
+
+    get_excep = lambda e: e.args[1] if len(e.args) > 1 else e
+
+    output_folder = args.output or urlparse(url).netloc
+    # initialize a session
+    session_def = requests.session()
+    if args.use_tor:
+        session_def.request = functools.partial(session_def.request, timeout=30)
+        session_def.proxies = {
+            "http": f"socks5h://{args.tor_proxy}",
+            "https": f"socks5h://{args.tor_proxy}",
+        }
+
+    if args.headers:
+        from json import load
+
+        with open(args.headers) as fh:
+            session_def.headers = load(fh)
+
+    session = cloudscraper.create_scraper(
+        sess=session_def,
+        browser={
+            "browser": args.browser,
+            "platform": args.platform,
+            "desktop": True,
+        },
+    )
+    logging.info(f"Extracting files from '{url}'")
+    extractor = Extractor(url)
+    extractor.run()
+    logging.info(f"\nTotal extracted files: {len(extractor.scraped_urls)}")
+
+
+if __name__ == "__main__":
+    main()

From 7923ec4e9c246663ae01525b4b55c64ec230d6bf Mon Sep 17 00:00:00 2001
From: Simatwa Caleb
Date: Fri, 9 Jun 2023 17:37:22 +0300
Subject: [PATCH 4/9] Request using cloudscraper

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 3933979..739d72b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ wclone.egg-info
 build
 *__pycache__
 .vscode
+dist

From 66d8c04a03b3d2e8581873682ab9f82422570f3c Mon Sep 17 00:00:00 2001
From: Simatwa Caleb
Date: Fri, 9 Jun 2023 19:06:06 +0300
Subject: [PATCH 5/9] Handle exception per download

---
 wclone/app.py | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/wclone/app.py b/wclone/app.py
index 4c3fc22..01a4d29 100644
--- a/wclone/app.py
+++ b/wclone/app.py
@@ -192,7 +192,7 @@ def url_to_local_path(self, url, keepQuery=False):
         return new_url
 
     # download file from URL
-    def download_file(self, url, output_path):
+    def download_file(self, url, output_path, count=0):
 
         # Remove query string and http from URL
         url = url.split("?")[0]
@@ -206,10 +206,24 @@ def download_file(self, url, output_path):
             os.makedirs(os.path.dirname(output_path))
 
         # Get file content and save it
-        response = session.get(url)
-        with open(output_path, "wb") as file:
-            file.write(response.content)
-        logging.info(f"Downloaded {file_name} to {os.path.relpath(output_path)}")
+        try:
+            response = session.get(url, stream=True)
+            if not response.ok:
+                logging.warning(f"Failed to get resource in URI '{url}'")
+                return True
+            with open(output_path, "wb") as file:
+                for chunk in response.iter_content(chunk_size=2048):
+                    file.write(chunk)
+            logging.info(
+                f"Downloaded {file_name} to {os.path.relpath(output_path)}"
+            )
+        except requests.exceptions.ConnectionError as e:
+            if count < 1:
+                logging.warning(f"{get_excep(e)} with uri '{url}' retrying once...")
+                count += 1
+                return self.download_file(url, output_path, count)
+            else:
+                logging.error(f"{get_excep(e)} with uri '{url}'")
 
         return True
@@ -331,6 +345,13 @@ def main():
 
         with open(args.headers) as fh:
             session_def.headers = load(fh)
+    else:
+        session_def.headers = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "User-Agent": "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36",
+            "Accept-Encoding": "gzip, deflate, br",
+            "Accept-Language": "en-US,en;q=0.9",
+        }
 
     session = cloudscraper.create_scraper(
         sess=session_def,
@@ -343,7 +364,7 @@ def main():
     logging.info(f"Extracting files from '{url}'")
     extractor = Extractor(url)
     extractor.run()
-    logging.info(f"\nTotal extracted files: {len(extractor.scraped_urls)}")
+    logging.info(f"Total extracted files: {len(extractor.scraped_urls)}")

From 943f91534a96a09287613697241f3f944ae3ca38 Mon Sep 17 00:00:00 2001
From: Simatwa Caleb
Date: Fri, 9 Jun 2023 19:13:55 +0300
Subject: [PATCH 6/9] Update to version 1.0.1

---
 wclone/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wclone/__init__.py b/wclone/__init__.py
index 66a2431..50e9787 100644
--- a/wclone/__init__.py
+++ b/wclone/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "1.0.0"
+__version__ = "1.0.1"
 __maintainer__ = "Smartwa"
 __author__ = "ZKAW"
 __description__ = "Basic website cloner written in Python"

From d6f80c069cc416dc2aca8ca34d0a4a2b22eeda66 Mon Sep 17 00:00:00 2001
From: Smartwa
Date: Fri, 9 Jun 2023 19:17:39 +0300
Subject: [PATCH 7/9] Create python-publish.yml

---
 .github/workflows/python-publish.yml | 39 ++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 .github/workflows/python-publish.yml

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
new file mode 100644
index 0000000..bdaab28
--- /dev/null
+++ b/.github/workflows/python-publish.yml
@@ -0,0 +1,39 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries

+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+ +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + - name: Publish package + uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} From 9e7c9370985a03b4f2db861ad5bd6054248388ac Mon Sep 17 00:00:00 2001 From: Simatwa Caleb Date: Sat, 10 Jun 2023 11:19:29 +0300 Subject: [PATCH 8/9] Add option to add cookies --- wclone/app.py | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/wclone/app.py b/wclone/app.py index 01a4d29..5d42a9f 100644 --- a/wclone/app.py +++ b/wclone/app.py @@ -10,6 +10,7 @@ from bs4 import BeautifulSoup from urllib.parse import urljoin, urlparse from . import __version__, __description__ +from json import load platforms = ["darwin", "ios", "android", "windows", "linux"] browsers = ["chrome", "firefox"] @@ -28,7 +29,7 @@ def run(self): def get_page_content(self, url): try: - content = session.get(url) + content = session.get(url, timeout=args.timeout) content.encoding = "utf-8" return content.text except Exception as e: @@ -255,7 +256,7 @@ def main(*args, **kwargs): try: return func(*args, **kwargs) except (KeyboardInterrupt, EOFError) as e: - logging.warning(f'^{e} - Exitting"') + logging.warning(f"^{e} - Exitting") except Exception as e: # logging.exception(e) logging.critical(get_excep(e)) @@ -285,7 +286,19 @@ def main(): "-o", "--output", metavar="FOLDER", help="Folder for saving contents - host" ) parser.add_argument( - "--headers", help="Path to .json file containing request headers" + "-t", + "--timeout", + help="Timeout while awaiting response (s) - %(default)s", + type=int, + default=20, + ) + parser.add_argument( + "--headers", + help="Path to .json file containing request headers", + metavar="PATH", + ) + parser.add_argument( + "--cookies", help="Path to .json file containing cookies", metavar="PATH" ) parser.add_argument( "--browser", @@ -311,9 +324,10 @@ def main(): parser.add_argument( "--use-tor", action="store_true", help="Use tor proxy - %(default)s" ) - args = parser.parse_args() - global url, workspace, get_excep, output_folder, session + global url, workspace, get_excep, output_folder, session, args + + args = parser.parse_args() logging.basicConfig( format="[%(asctime)s : %(levelname)s] - %(message)s", @@ -341,17 +355,20 @@ def main(): } if args.headers: - from json import load - with open(args.headers) as fh: session_def.headers = load(fh) else: - session_def.headers = { - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", - "User-Agent": "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36", - "Accept-Encoding": "gzip, deflate, br", - "Accept-Language": "en-US,en;q=0.9", - } + session_def.headers.update( + { + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", + "User-Agent": "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile 
Safari/537.36", + "Accept-Encoding": "gzip, deflate, br", + "Accept-Language": "en-US,en;q=0.9", + } + ) + if args.cookies: + with open(args.cookies) as fh: + session_def.cookies.update(load(fh)) session = cloudscraper.create_scraper( sess=session_def, @@ -364,7 +381,7 @@ def main(): logging.info(f"Extracting files from '{url}'") extractor = Extractor(url) extractor.run() - logging.info(f"Total extracted files: {len(extractor.scraped_urls)}") + logging.info(f"Total extracted files : {len(extractor.scraped_urls)}") if __name__ == "__main__": From b545d9b05b42ca52aff69e3b2c4f4338fdfcd24e Mon Sep 17 00:00:00 2001 From: Simatwa Caleb Date: Sat, 10 Jun 2023 12:42:36 +0300 Subject: [PATCH 9/9] Update to version 1.0.2 --- wclone/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wclone/__init__.py b/wclone/__init__.py index 50e9787..8f0480a 100644 --- a/wclone/__init__.py +++ b/wclone/__init__.py @@ -1,4 +1,4 @@ -__version__ = "1.0.1" +__version__ = "1.0.2" __maintainer__ = "Smartwa" __author__ = "ZKAW" __description__ = "Basic website cloner written in Python"
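
A note on the `--headers` and `--cookies` options added in patches 3 and 8: each expects a path to a flat JSON object, which is read with `json.load` and applied to the underlying requests session. A minimal illustrative sketch (the file names, header values and cookie values are placeholders, not part of the patches):

```sh
# Illustrative only: header and cookie values are placeholders.
cat > headers.json <<'EOF'
{
  "User-Agent": "Mozilla/5.0",
  "Accept-Language": "en-US,en;q=0.9"
}
EOF
cat > cookies.json <<'EOF'
{"session_id": "0123456789abcdef"}
EOF
wclone example.com --headers headers.json --cookies cookies.json
```

Note the asymmetry in the code above: `--headers` replaces the session's header set outright (`session_def.headers = load(fh)`), so a headers file should list every header you want sent, while `--cookies` merges into the existing cookie jar via `session_def.cookies.update(load(fh))`.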