Added extra arguments for threads and wait time.
AbstractGeek committed Feb 4, 2017
1 parent 3703c32 commit 89e7393
Showing 4 changed files with 64 additions and 24 deletions.
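
The four new options added here (-ct, -pt, -wt, -rt; see the comic_scraper.py diff below) expose the chapter/page thread counts and the retry behaviour on the command line. As a rough sketch of how an invocation might look once this commit is applied -- the URL, chapter range, and flag values are illustrative, not part of the commit:
```
comic-scraper -l ~/Comics/ -c 10:20 -ct 3 -pt 8 -wt 5 -rt 3 http://www.readcomics.tv/comic/spider-man-2016
```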
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,6 +1,6 @@
MIT License

-Copyright (c) 2016 Dinesh Natesan
+Copyright (c) 2017 Dinesh Natesan

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
24 changes: 19 additions & 5 deletions README.md
@@ -1,6 +1,6 @@
-# Comic-scraper (Comic Downloader)
-Downloads comics from various websites and creates cbz files from them.
-Currently supports just readcomics.tv
+# Comic-scraper (Comic/Manga Downloader)
+Downloads comics and manga from various websites and creates cbz files from them.
+Currently supports readcomics.tv, mangafox.me and mangahere.co

## Installation

@@ -13,7 +13,7 @@ pip install comic-scraper
### Via pip (local)
Clone a copy of the repository using the following command:
```
-git clone git@github.com:AbstractGeek/comic-scraper.git
+git clone https://github.com/AbstractGeek/comic-scraper.git
```

Open a terminal in the folder and run this (sudo might be necessary):
@@ -33,9 +33,10 @@ These can simply be installed by:
```
pip install -r requirements.txt
```
-That's it. Use comic_scraper.py to download comics
+That's it. Use comic_scraper.py to download comics and manga.

## Usage
+### Comics
Find your comic of interest in readcomics.tv. Copy the url of the comic page.
For example, if I wanted to download spider-man-2016, I would copy this url:
http://www.readcomics.tv/comic/spider-man-2016
@@ -56,3 +57,16 @@ For example, if I want to download chapters 10-20, I use the following command
comic-scraper -l ~/Comics/ -c 10:20 http://www.readcomics.tv/comic/spider-man-2016
```
Note: Only individual chapters or sequential chunks (start:stop) can currently be downloaded.

+### Manga
+The syntax for downloading manga is exactly the same as for comics. For example, if I wanted to download the Kingdom manga, I would copy the url from the mangafox (or mangahere) website: http://mangafox.me/manga/kingdom/
+
+To download all chapters of this manga, simply call the script and input the url.
+```
+comic-scraper http://mangafox.me/manga/kingdom/
+```
+
+To download selected chapters, add -c and input the chapter numbers. To set a custom location, add -l and input the location. Here is an example:
+```
+comic-scraper -l ~/Comics/ -c 1:100 http://mangafox.me/manga/kingdom/
+```
56 changes: 41 additions & 15 deletions comic_scraper/comic_scraper.py
@@ -11,31 +11,37 @@
 from random import shuffle, uniform
 from numpy import arange
 from time import sleep
+from copy import deepcopy
 
 
 class Comic:
-    def __init__(self, comic_url, root_dir):
+    def __init__(self, comic_url, program_args):
         self.url = comic_url
         self.name = comic_url.split('/')[-1] \
             if comic_url.split('/')[-1] else comic_url.split('/')[-2]
         # Set download location
         self.download_location = os.path.abspath(
-            os.path.join(root_dir, self.name))
+            os.path.join(program_args.location, self.name))
         if not os.path.exists(self.download_location):
             os.makedirs(self.download_location)
+        # Set threads and retry values
+        self.chapter_threads = program_args.chapterthreads
+        self.page_threads = program_args.pagethreads
+        self.wait_time = program_args.waittime
+        self.max_retries = program_args.retries
         # Get all chapters and mode of download
         self.all_chapters = self.get_chapters()
 
     def get_chapters(self):
         if 'mangafox' in self.url:
             self.mode = ['manga', 'mangafox']
-            chapters = self.manga_extract_chapters(self.url)
+            chapters = self.manga_extract_chapters()
         elif 'mangahere' in self.url:
             self.mode = ['manga', 'mangahere']
-            chapters = self.manga_extract_chapters(self.url)
+            chapters = self.manga_extract_chapters()
         elif 'readcomics' in self.url:
             self.mode = ['comic']
-            chapters = self.comic_extract_chapters(self.url)
+            chapters = self.comic_extract_chapters()
         else:
             raise ValueError('The scraper currently only supports mangafox, ',
                              'mangahere and readcomics.tv ',
@@ -55,10 +61,11 @@ def set_download_chapters(self, potential_keys=None):
             sorted(unsorted_chapters.items(), key=lambda t: t[0]))
         # Print downloading chapters
         print("Downloading the below chapters:")
-        print(keys)
+        print(sorted(keys))
 
     def download_comic(self):
-        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
+        with concurrent.futures.ThreadPoolExecutor(
+                max_workers=self.chapter_threads) as executor:
             future_to_chapter = {
                 executor.submit(chapter.download_chapter): chapter_num
                 for chapter_num, chapter in self.chapters_to_download.items()}
@@ -73,8 +80,9 @@ def download_comic(self):
                 else:
                     print('Downloaded: Chapter-%g' % (chapter_num))
 
-    def manga_extract_chapters(self, url):
+    def manga_extract_chapters(self):
         comic_name = self.name
+        url = self.url
         r = requests.get(url)
         soup = bsoup.BeautifulSoup(r.text, 'html.parser')
 
@@ -98,7 +106,8 @@ def manga_extract_chapters(self, url):
                 self, chapter_num, volume_num, chapter_link)
         return chapters
 
-    def comic_extract_chapters(self, url):
+    def comic_extract_chapters(self):
+        url = self.url
         comic = url.split('/')[-1]
         r = requests.get(url)
         soup = bsoup.BeautifulSoup(r.text, 'html.parser')
@@ -130,6 +139,10 @@ def __init__(self, comic, chapter_num, volume_num, chapter_url):
         self.chapter_num = chapter_num
         self.volume_num = volume_num
         self.chapter_url = chapter_url
+        # Threads and retry time
+        self.page_threads = comic.page_threads
+        self.wait_time = comic.wait_time
+        self.max_retries = comic.max_retries
 
     def download_chapter(self):
         ''' Download and convert it into a cbz file '''
@@ -144,7 +157,8 @@ def download_chapter(self):
             os.makedirs(self.chapter_location)
 
         # Download individual pages in parallel
-        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+        with concurrent.futures.ThreadPoolExecutor(
+                max_workers=self.page_threads) as executor:
             executor.map(download_func, pages)
 
         # Convert the folder to a comic book zip filename
@@ -178,8 +192,8 @@ def manga_get_pages(self):
         elif (self.comic_mode[1] == 'mangahere'):
             base_url = self.chapter_url
 
-        max_retries = 5
-        wait_retry_time = 5
+        max_retries = deepcopy(self.max_retries)
+        wait_retry_time = deepcopy(self.wait_time)
 
         while True:
             # Get javascript blocks
@@ -232,8 +246,8 @@ def manga_download_page(self, page):
         filename = os.path.join(self.chapter_location,
                                 '%0.3d.jpg' % (page_num))
 
-        max_retries = 10
-        wait_retry_time = 10
+        max_retries = deepcopy(self.max_retries)
+        wait_retry_time = deepcopy(self.wait_time)
 
         while True:
             r = requests.get(page_url)
@@ -303,11 +317,23 @@ def main():
     parser.add_argument(
         "-c", "--chapters", default=False,
         help="Specify chapters to download separated by : (10:20).")
+    parser.add_argument(
+        "-ct", "--chapterthreads", default=5, type=int,
+        help="Number of parallel chapter downloads.")
+    parser.add_argument(
+        "-pt", "--pagethreads", default=10, type=int,
+        help="Number of parallel page downloads per chapter.")
+    parser.add_argument(
+        "-wt", "--waittime", default=10, type=int,
+        help="Wait time (seconds) before retrying after an error.")
+    parser.add_argument(
+        "-rt", "--retries", default=10, type=int,
+        help="Number of retries before giving up.")
 
     args = parser.parse_args()
 
     for url in args.urls:
-        comic = Comic(url, args.location)
+        comic = Comic(url, args)
         print('Downloading comic: ' + comic.name)
 
         # Get chapters to download
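
The retry loops that consume max_retries and wait_retry_time are not shown in full in this view. As a minimal sketch of the pattern those two values drive -- the function name and error handling here are illustrative, not the project's actual code:
```
import requests
from time import sleep


def fetch_with_retries(url, max_retries=10, wait_retry_time=10):
    """Retry a GET request until it succeeds or the retry budget is spent."""
    retries_left = max_retries
    while True:
        try:
            response = requests.get(url)
            response.raise_for_status()
            return response
        except requests.RequestException:
            retries_left -= 1
            if retries_left <= 0:
                raise
            sleep(wait_retry_time)
```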
6 changes: 3 additions & 3 deletions setup.py
@@ -1,10 +1,10 @@
 from setuptools import setup
 
 setup(name='comic-scraper',
-      version='0.1',
+      version='0.5',
       description='Scrapes comics and creates cbz files',
       url='https://github.com/AbstractGeek/comic-scraper',
-      download_url='https://github.com/AbstractGeek/comic-scraper/tarball/0.1',
+      download_url='https://github.com/AbstractGeek/comic-scraper/tarball/0.5',
       author='Dinesh Natesan',
       author_email='abstractgeek@outlook.com',
       license='MIT',
@@ -14,7 +14,7 @@
           'Programming Language :: Python :: 3.5',
           'Topic :: Games/Entertainment',
       ],
-      keywords='comics scraper',
+      keywords='comics manga scraper',
       packages=['comic_scraper'],
       install_requires=[
           'beautifulsoup4',
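
With this change, Comic takes the whole parsed-argument object instead of just a download directory, so every Chapter inherits the thread and retry settings. A hypothetical programmatic use under the new signature -- the import path and attribute values are assumptions, not shown in this commit:
```
from argparse import Namespace

from comic_scraper.comic_scraper import Comic

# Mimic the argparse namespace that main() would normally build.
args = Namespace(location='.', chapterthreads=5, pagethreads=10,
                 waittime=10, retries=10)

comic = Comic('http://mangafox.me/manga/kingdom/', args)
print('Downloading comic: ' + comic.name)
```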
