mirror of https://github.com/apachecn/epub-crawler.git
commit e153e2c4c4 (2023-02-14 13:10:54)

@@ -4,6 +4,10 @@ v2023.2.14.0
+   Added new config option `sizeLimit` to cap the total size of a single EPUB

v2023.1.18.0
+   Fixed loading of external scripts

v2022.8.20.0
+   Added Selenium support

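The `sizeLimit` option itself is not implemented in the hunks shown here. As a rough illustration only (the class, names, and byte units below are assumptions, not taken from the epub-crawler code), a crawler can enforce such a cap by tracking how many bytes it has already collected and refusing content that would push the EPUB over the limit:

# Hypothetical sketch of a total-size cap; not the project's actual implementation.
class SizeBudget:
    def __init__(self, size_limit):
        self.size_limit = size_limit  # assumed to be a byte count, e.g. 100 * 1024 * 1024
        self.used = 0

    def try_add(self, data: bytes) -> bool:
        # Reject the chunk if it would exceed the configured limit.
        if self.size_limit and self.used + len(data) > self.size_limit:
            return False
        self.used += len(data)
        return True
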
@@ -10,4 +10,4 @@ from . import util
 __author__ = "ApacheCN"
 __email__ = "apachecn@163.com"
 __license__ = "SATA"
-__version__ = "2022.8.20.0"
+__version__ = "2023.1.18.0"

@@ -4,6 +4,7 @@
 from urllib.parse import urljoin
 import sys
 import json
+import warnings
 from pyquery import PyQuery as pq
 import time
 from os import path

@@ -19,6 +20,8 @@ from .config import config
 from .sele_crawler import crawl_sele
 from .common import *
 
+warnings.filterwarnings("ignore")
+
 def get_toc_from_cfg():
     if config['list'] and len(config['list']) > 0:
         return config['list']

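The blanket `warnings.filterwarnings("ignore")` added here most likely exists to silence the InsecureRequestWarning that urllib3 emits for every request made with `verify=False` below; that is an inference from context, not something stated in the commit. A narrower filter that keeps other warnings visible would look like this (illustrative only):

# Assumed alternative: suppress only urllib3's certificate warning
# instead of ignoring every warning in the process.
import warnings
from urllib3.exceptions import InsecureRequestWarning

warnings.filterwarnings('ignore', category=InsecureRequestWarning)
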
@@ -34,6 +37,7 @@ def get_toc_from_cfg():
         headers=config['headers'],
         timeout=config['timeout'],
+        proxies=config['proxy'],
         verify=False,
     ).content.decode(config['encoding'], 'ignore')
     return get_toc(html, config['url'])
 
@@ -93,6 +97,7 @@ def tr_download_page(url, art, imgs):
         headers=config['headers'],
         timeout=config['timeout'],
+        proxies=config['proxy'],
         verify=False,
     ).content.decode(config['encoding'], 'ignore')
     print(f'{url} 下载成功')
     art.update(get_article(html, url))

@@ -105,7 +110,7 @@ def tr_download_page(url, art, imgs):
     time.sleep(config['wait'])
 
 
-def update_config(user_cfg):
+def update_config(cfg_fname, user_cfg):
     global get_toc
     global get_article
 
@@ -124,7 +129,8 @@ def update_config(user_cfg):
     set_img_pool(ThreadPoolExecutor(config['imgThreads']))
 
     if config['external']:
-        mod = load_module(config['external'])
+        ex_fname = path.join(path.dirname(cfg_fname), config['external'])
+        mod = load_module(ex_fname)
         get_toc = getattr(mod, 'get_toc', get_toc)
         get_article = getattr(mod, 'get_article', get_article)
 
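This hunk is the external-script fix noted in the changelog: the script named by `config['external']` is now resolved relative to the directory of the config file (`cfg_fname`) rather than the current working directory, so running the crawler from another folder no longer breaks it. `load_module` is a helper from the project's own code; its implementation is not shown here, but a loader of this kind is typically a thin wrapper over importlib (the sketch below is an assumption, not the project's code):

# Assumed sketch of a load-a-module-from-a-file helper, similar in spirit to load_module.
import importlib.util
from os import path

def load_module_from_path(fname):
    name = path.splitext(path.basename(fname))[0]
    spec = importlib.util.spec_from_file_location(name, fname)
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod
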
@@ -178,7 +184,7 @@ def main():
         return
 
     user_cfg = json.loads(open(cfg_fname, encoding='utf-8').read())
-    update_config(user_cfg)
+    update_config(cfg_fname, user_cfg)
 
     if config['selenium']:
         crawl_sele()

@@ -22,8 +22,8 @@ config = {
     'readTimeout': 60,
     'imgSrc': ['data-src', 'data-original-src', 'src'],
     'proxy': '',
-    'textThreads': 5,
-    'imgThreads': 5,
+    'textThreads': 8,
+    'imgThreads': 8,
     'external': None,
     'checkStatus': False,
     'cache': True,

@@ -46,6 +46,7 @@ def tr_download_img(url, imgs, picname):
         retry=config['retry'],
         timeout=config['timeout'],
+        proxies=config['proxy'],
         verify=False,
     ).content
     print(f'{url} 下载成功')
     data = opti_img(data, config['optiMode'], config['colors']) or b''

publish.sh (new file)
@@ -0,0 +1,3 @@
+rm -rf dist
+python setup.py sdist bdist_wheel
+twine upload dist/* -u $(pip config get pypi.username) -p $(pip config get pypi.password)

@@ -1,4 +1,4 @@
-requests
+requests[socks]
 pyquery
 GenEpub
 imgyaso
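
Changing the dependency to `requests[socks]` installs the PySocks extra, so the `proxies=config['proxy']` arguments added in the hunks above can also point at a SOCKS proxy. Note that `requests` expects `proxies` to be a dict mapping scheme to proxy URL; presumably `config['proxy']` is normalized into that shape elsewhere in the project (an assumption based on the string default `'proxy': ''`). For illustration:

# Hypothetical example of turning a single proxy URL from the config
# into the dict shape that requests expects.
proxy_url = 'socks5://127.0.0.1:1080'  # the kind of value a user might set as config['proxy']
proxies = {'http': proxy_url, 'https': proxy_url} if proxy_url else None
# requests.get(url, proxies=proxies, verify=False, ...) would then route
# traffic through the SOCKS proxy, provided PySocks is installed.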