
2023-02-14 13:10:54

wizardforcel 2023-02-14 13:10:54 +08:00
commit e153e2c4c4
7 changed files with 21 additions and 7 deletions

@@ -4,6 +4,10 @@ v2023.2.14.0
+ Added a new config option `sizeLimit` to cap the total size of a single EPUB
v2023.1.18.0
+ Fixed an issue with loading external scripts
v2022.8.20.0
+ Added Selenium support
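
The new `sizeLimit` option caps the total size of a single generated EPUB. As a rough illustration of how such a cap could be enforced while collecting pages, here is a minimal sketch; it is not the project's actual implementation, and the names `split_by_size`, `pages`, and `size_limit_bytes` are assumptions:

# Illustrative only: split collected pages into volumes once a byte cap is reached.
# `pages` yields (title, html, images) tuples; `size_limit_bytes` mirrors `sizeLimit`.
def split_by_size(pages, size_limit_bytes):
    volume, current_size = [], 0
    for title, html, images in pages:
        page_size = len(html.encode('utf-8')) + sum(len(img) for img in images)
        if volume and current_size + page_size > size_limit_bytes:
            yield volume  # the caller writes this batch out as one EPUB volume
            volume, current_size = [], 0
        volume.append((title, html, images))
        current_size += page_size
    if volume:
        yield volume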

@@ -10,4 +10,4 @@ from . import util
 __author__ = "ApacheCN"
 __email__ = "apachecn@163.com"
 __license__ = "SATA"
-__version__ = "2022.8.20.0"
+__version__ = "2023.1.18.0"

@@ -4,6 +4,7 @@
 from urllib.parse import urljoin
 import sys
 import json
+import warnings
 from pyquery import PyQuery as pq
 import time
 from os import path
@@ -19,6 +20,8 @@ from .config import config
 from .sele_crawler import crawl_sele
 from .common import *
 
+warnings.filterwarnings("ignore")
+
 def get_toc_from_cfg():
     if config['list'] and len(config['list']) > 0:
         return config['list']
@@ -34,6 +37,7 @@ def get_toc_from_cfg():
         headers=config['headers'],
         timeout=config['timeout'],
         proxies=config['proxy'],
+        verify=False,
     ).content.decode(config['encoding'], 'ignore')
     return get_toc(html, config['url'])
@@ -93,6 +97,7 @@ def tr_download_page(url, art, imgs):
         headers=config['headers'],
         timeout=config['timeout'],
         proxies=config['proxy'],
+        verify=False,
     ).content.decode(config['encoding'], 'ignore')
     print(f'{url} downloaded successfully')
     art.update(get_article(html, url))
@@ -105,7 +110,7 @@ def tr_download_page(url, art, imgs):
     time.sleep(config['wait'])
 
-def update_config(user_cfg):
+def update_config(cfg_fname, user_cfg):
     global get_toc
     global get_article
@@ -124,7 +129,8 @@ def update_config(user_cfg):
     set_img_pool(ThreadPoolExecutor(config['imgThreads']))
 
     if config['external']:
-        mod = load_module(config['external'])
+        ex_fname = path.join(path.dirname(cfg_fname), config['external'])
+        mod = load_module(ex_fname)
         get_toc = getattr(mod, 'get_toc', get_toc)
         get_article = getattr(mod, 'get_article', get_article)
@@ -178,7 +184,7 @@ def main():
         return
 
     user_cfg = json.loads(open(cfg_fname, encoding='utf-8').read())
-    update_config(user_cfg)
+    update_config(cfg_fname, user_cfg)
 
     if config['selenium']:
         crawl_sele()
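
Passing `verify=False` makes requests skip TLS certificate checks, which normally triggers an InsecureRequestWarning on every call; the blanket `warnings.filterwarnings("ignore")` added above silences that noise. A narrower alternative, shown here only as a sketch and not part of this commit, suppresses just that warning class:

import warnings

import requests
from urllib3.exceptions import InsecureRequestWarning

# Ignore only the warning raised by verify=False; other warnings stay visible.
warnings.simplefilter('ignore', InsecureRequestWarning)

# Example request; the URL and timeout are placeholders.
resp = requests.get('https://example.com', verify=False, timeout=8)
print(resp.status_code)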

@@ -22,8 +22,8 @@ config = {
     'readTimeout': 60,
     'imgSrc': ['data-src', 'data-original-src', 'src'],
     'proxy': '',
-    'textThreads': 5,
-    'imgThreads': 5,
+    'textThreads': 8,
+    'imgThreads': 8,
     'external': None,
     'checkStatus': False,
     'cache': True,

@@ -46,6 +46,7 @@ def tr_download_img(url, imgs, picname):
         retry=config['retry'],
         timeout=config['timeout'],
         proxies=config['proxy'],
+        verify=False,
     ).content
     print(f'{url} downloaded successfully')
     data = opti_img(data, config['optiMode'], config['colors']) or b''

publish.sh Normal file

@@ -0,0 +1,3 @@
+rm -rf dist
+python setup.py sdist bdist_wheel
+twine upload dist/* -u $(pip config get pypi.username) -p $(pip config get pypi.password)

@@ -1,4 +1,4 @@
-requests
+requests[socks]
 pyquery
 GenEpub
 imgyaso
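
Switching the dependency from requests to requests[socks] pulls in PySocks, so the `proxy` setting can point at a SOCKS proxy rather than only an HTTP one. A minimal sketch of what this enables (the proxy address and target URL are placeholders):

import requests

# socks5h:// routes DNS resolution through the proxy as well; the address is a placeholder.
proxies = {
    'http': 'socks5h://127.0.0.1:1080',
    'https': 'socks5h://127.0.0.1:1080',
}
resp = requests.get('https://example.com', proxies=proxies, timeout=8)
print(resp.status_code)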