From b10d74a67fb2513a71bdddd87c04171c498e9955 Mon Sep 17 00:00:00 2001
From: wizardforcel <562826179@qq.com>
Date: Tue, 29 Nov 2022 16:28:48 +0800
Subject: [PATCH 1/7] 2022-11-29 16:28:48

---
 publish.sh | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 publish.sh

diff --git a/publish.sh b/publish.sh
new file mode 100644
index 0000000..20f4923
--- /dev/null
+++ b/publish.sh
@@ -0,0 +1,3 @@
+rm -rf dist
+python setup.py sdist bdist_wheel
+twine upload dist/* -u $(pip config get pypi.username) -p $(pip config get pypi.password)
\ No newline at end of file

From 41122f4f9d4b93bcb774885d9fd9535498142562 Mon Sep 17 00:00:00 2001
From: wizardforcel <562826179@qq.com>
Date: Sat, 31 Dec 2022 22:49:43 +0800
Subject: [PATCH 2/7] 2022-12-31 22:49:43

---
 EpubCrawler/__main__.py | 2 ++
 EpubCrawler/img.py      | 1 +
 2 files changed, 3 insertions(+)

diff --git a/EpubCrawler/__main__.py b/EpubCrawler/__main__.py
index 834af74..a894c6b 100644
--- a/EpubCrawler/__main__.py
+++ b/EpubCrawler/__main__.py
@@ -34,6 +34,7 @@ def get_toc_from_cfg():
         headers=config['headers'],
         timeout=config['timeout'],
         proxies=config['proxy'],
+        verify=False,
     ).content.decode(config['encoding'], 'ignore')
     return get_toc(html, config['url'])
 
@@ -93,6 +94,7 @@ def tr_download_page(url, art, imgs):
             headers=config['headers'],
             timeout=config['timeout'],
             proxies=config['proxy'],
+            verify=False,
         ).content.decode(config['encoding'], 'ignore')
         print(f'{url} 下载成功')
         art.update(get_article(html, url))
diff --git a/EpubCrawler/img.py b/EpubCrawler/img.py
index e00018e..5be8fbd 100644
--- a/EpubCrawler/img.py
+++ b/EpubCrawler/img.py
@@ -46,6 +46,7 @@ def tr_download_img(url, imgs, picname):
             retry=config['retry'],
             timeout=config['timeout'],
             proxies=config['proxy'],
+            verify=False,
         ).content
         print(f'{url} 下载成功')
         data = opti_img(data, config['optiMode'], config['colors']) or b''

From bca45e86e74fd0c101b6c4720af5a424d392cf2c Mon Sep 17 00:00:00 2001
From: wizardforcel <562826179@qq.com>
Date: Sun, 1 Jan 2023 03:54:22 +0800
Subject: [PATCH 3/7] 2023-01-01 03:54:22

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 6e2b53a..2bb4be6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-requests
+requests[socks]
 pyquery
 GenEpub
 imgyaso

From b27263a33527d9062430d8d002122cd080e92da0 Mon Sep 17 00:00:00 2001
From: wizardforcel <562826179@qq.com>
Date: Sun, 1 Jan 2023 04:05:28 +0800
Subject: [PATCH 4/7] 2023-01-01 04:05:28

---
 EpubCrawler/__main__.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/EpubCrawler/__main__.py b/EpubCrawler/__main__.py
index a894c6b..fbcad71 100644
--- a/EpubCrawler/__main__.py
+++ b/EpubCrawler/__main__.py
@@ -4,6 +4,7 @@
 from urllib.parse import urljoin
 import sys
 import json
+import warnings
 from pyquery import PyQuery as pq
 import time
 from os import path
@@ -19,6 +20,8 @@ from .config import config
 from .sele_crawler import crawl_sele
 from .common import *
 
+warnings.filterwarnings("ignore")
+
 def get_toc_from_cfg():
     if config['list'] and len(config['list']) > 0:
         return config['list']

From b1c3bf0e503bd87eb1407192758eb3627430314a Mon Sep 17 00:00:00 2001
From: wizardforcel <562826179@qq.com>
Date: Sun, 1 Jan 2023 04:15:04 +0800
Subject: [PATCH 5/7] 2023-01-01 04:15:04

---
 EpubCrawler/config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/EpubCrawler/config.py b/EpubCrawler/config.py
index f28a839..b411f37 100644
--- a/EpubCrawler/config.py
+++ b/EpubCrawler/config.py
@@ -22,8 +22,8 @@ config = {
     'readTimeout': 60,
     'imgSrc': ['data-src', 'data-original-src', 'src'],
     'proxy': '',
-    'textThreads': 5,
-    'imgThreads': 5,
+    'textThreads': 8,
+    'imgThreads': 8,
     'external': None,
     'checkStatus': False,
     'cache': True,

From 16a9371b5772cc47e4c31e56c6f62c8f33c773eb Mon Sep 17 00:00:00 2001
From: wizardforcel <562826179@qq.com>
Date: Wed, 18 Jan 2023 12:23:53 +0800
Subject: [PATCH 6/7] 2023-01-18 12:23:53

---
 EpubCrawler/__main__.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/EpubCrawler/__main__.py b/EpubCrawler/__main__.py
index fbcad71..0aa6cd8 100644
--- a/EpubCrawler/__main__.py
+++ b/EpubCrawler/__main__.py
@@ -110,7 +110,7 @@ def tr_download_page(url, art, imgs):
         time.sleep(config['wait'])
 
 
-def update_config(user_cfg):
+def update_config(cfg_fname, user_cfg):
     global get_toc
     global get_article
 
@@ -129,7 +129,8 @@ def update_config(user_cfg):
         set_img_pool(ThreadPoolExecutor(config['imgThreads']))
 
     if config['external']:
-        mod = load_module(config['external'])
+        ex_fname = path.join(path.dirname(cfg_fname), config['external'])
+        mod = load_module(ex_fname)
         get_toc = getattr(mod, 'get_toc', get_toc)
         get_article = getattr(mod, 'get_article', get_article)
 
@@ -148,7 +149,7 @@ def main():
         return
 
     user_cfg = json.loads(open(cfg_fname, encoding='utf-8').read())
-    update_config(user_cfg)
+    update_config(cfg_fname, user_cfg)
 
     if config['selenium']:
         crawl_sele()

From d620ce7b6bbd27b81f64665782a5e82f04549d62 Mon Sep 17 00:00:00 2001
From: wizardforcel <562826179@qq.com>
Date: Wed, 18 Jan 2023 12:25:46 +0800
Subject: [PATCH 7/7] 2023-01-18 12:25:46

---
 CHANGELOG.md            | 4 ++++
 EpubCrawler/__init__.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3e43034..e1948cb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # 历史记录
 
+v2023.1.18.0
+
++ 修复外部脚本的加载问题
+
 v2022.8.20.0
 
 + 添加 Selenium 支持
diff --git a/EpubCrawler/__init__.py b/EpubCrawler/__init__.py
index 988bca3..4504c29 100644
--- a/EpubCrawler/__init__.py
+++ b/EpubCrawler/__init__.py
@@ -10,4 +10,4 @@ from . import util
 __author__ = "ApacheCN"
 __email__ = "apachecn@163.com"
 __license__ = "SATA"
-__version__ = "2022.8.20.0"
+__version__ = "2023.1.18.0"
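
The fix recorded in the CHANGELOG entry for v2023.1.18.0 (修复外部脚本的加载问题, i.e. fixing how the external script is loaded) is PATCH 6/7: config['external'] is now resolved against the directory of the config file rather than the current working directory, so a script placed next to its config file is found no matter where the crawler is launched from. Below is a minimal Python sketch of that resolution logic, assuming EpubCrawler's load_module helper behaves roughly like the importlib-based loader shown here; the file names are hypothetical.

import json
import importlib.util
from os import path

def load_module(fname):
    # Assumed stand-in for EpubCrawler.util.load_module: import a module from a file path.
    spec = importlib.util.spec_from_file_location(
        path.splitext(path.basename(fname))[0], fname)
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod

# Hypothetical config file with its external script sitting next to it.
cfg_fname = 'books/example/config.json'
user_cfg = json.loads(open(cfg_fname, encoding='utf-8').read())

if user_cfg.get('external'):
    # Before PATCH 6/7: load_module(user_cfg['external']) resolved the path against the CWD.
    # After PATCH 6/7: the path is joined onto the config file's directory first.
    ex_fname = path.join(path.dirname(cfg_fname), user_cfg['external'])
    mod = load_module(ex_fname)
    get_toc = getattr(mod, 'get_toc', None)
    get_article = getattr(mod, 'get_article', None)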