diff --git a/CHANGELOG.md b/CHANGELOG.md index 82b0886..0ae96dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ v2023.2.14.0 + 添加新的配置项`sizeLimit`,限制单个 EPUB 总大小 +v2023.1.18.0 + ++ 修复外部脚本的加载问题 + v2022.8.20.0 + 添加 Selenium 支持 diff --git a/EpubCrawler/__init__.py b/EpubCrawler/__init__.py index 988bca3..4504c29 100644 --- a/EpubCrawler/__init__.py +++ b/EpubCrawler/__init__.py @@ -10,4 +10,4 @@ from . import util __author__ = "ApacheCN" __email__ = "apachecn@163.com" __license__ = "SATA" -__version__ = "2022.8.20.0" +__version__ = "2023.1.18.0" diff --git a/EpubCrawler/__main__.py b/EpubCrawler/__main__.py index 531e4e9..394b321 100644 --- a/EpubCrawler/__main__.py +++ b/EpubCrawler/__main__.py @@ -4,6 +4,7 @@ from urllib.parse import urljoin import sys import json +import warnings from pyquery import PyQuery as pq import time from os import path @@ -19,6 +20,8 @@ from .config import config from .sele_crawler import crawl_sele from .common import * +warnings.filterwarnings("ignore") + def get_toc_from_cfg(): if config['list'] and len(config['list']) > 0: return config['list'] @@ -34,6 +37,7 @@ def get_toc_from_cfg(): headers=config['headers'], timeout=config['timeout'], proxies=config['proxy'], + verify=False, ).content.decode(config['encoding'], 'ignore') return get_toc(html, config['url']) @@ -93,6 +97,7 @@ def tr_download_page(url, art, imgs): headers=config['headers'], timeout=config['timeout'], proxies=config['proxy'], + verify=False, ).content.decode(config['encoding'], 'ignore') print(f'{url} 下载成功') art.update(get_article(html, url)) @@ -105,7 +110,7 @@ def tr_download_page(url, art, imgs): time.sleep(config['wait']) -def update_config(user_cfg): +def update_config(cfg_fname, user_cfg): global get_toc global get_article @@ -124,7 +129,8 @@ def update_config(user_cfg): set_img_pool(ThreadPoolExecutor(config['imgThreads'])) if config['external']: - mod = load_module(config['external']) + ex_fname = path.join(path.dirname(cfg_fname), config['external']) + mod = load_module(ex_fname) get_toc = getattr(mod, 'get_toc', get_toc) get_article = getattr(mod, 'get_article', get_article) @@ -178,7 +184,7 @@ def main(): return user_cfg = json.loads(open(cfg_fname, encoding='utf-8').read()) - update_config(user_cfg) + update_config(cfg_fname, user_cfg) if config['selenium']: crawl_sele() diff --git a/EpubCrawler/config.py b/EpubCrawler/config.py index d1edeaa..7b2f518 100644 --- a/EpubCrawler/config.py +++ b/EpubCrawler/config.py @@ -22,8 +22,8 @@ config = { 'readTimeout': 60, 'imgSrc': ['data-src', 'data-original-src', 'src'], 'proxy': '', - 'textThreads': 5, - 'imgThreads': 5, + 'textThreads': 8, + 'imgThreads': 8, 'external': None, 'checkStatus': False, 'cache': True, diff --git a/EpubCrawler/img.py b/EpubCrawler/img.py index e00018e..5be8fbd 100644 --- a/EpubCrawler/img.py +++ b/EpubCrawler/img.py @@ -46,6 +46,7 @@ def tr_download_img(url, imgs, picname): retry=config['retry'], timeout=config['timeout'], proxies=config['proxy'], + verify=False, ).content print(f'{url} 下载成功') data = opti_img(data, config['optiMode'], config['colors']) or b'' diff --git a/publish.sh b/publish.sh new file mode 100644 index 0000000..20f4923 --- /dev/null +++ b/publish.sh @@ -0,0 +1,3 @@ +rm -rf dist +python setup.py sdist bdist_wheel +twine upload dist/* -u $(pip config get pypi.username) -p $(pip config get pypi.password) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 6e2b53a..2bb4be6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -requests +requests[socks] pyquery GenEpub imgyaso