mirror of https://github.com/apachecn/epub-crawler.git synced 2025-06-06 17:56:57 +00:00

commit e153e2c4c4
wizardforcel, 2023-02-14 13:10:54 +08:00
7 changed files with 21 additions and 7 deletions

@@ -4,6 +4,10 @@ v2023.2.14.0
 + Added new config option `sizeLimit` to limit the total size of a single EPUB
 
+v2023.1.18.0
+
++ Fixed loading of external scripts
+
 v2022.8.20.0
 
 + Added Selenium support

@@ -10,4 +10,4 @@ from . import util
 __author__ = "ApacheCN"
 __email__ = "apachecn@163.com"
 __license__ = "SATA"
-__version__ = "2022.8.20.0"
+__version__ = "2023.1.18.0"

@@ -4,6 +4,7 @@
 from urllib.parse import urljoin
 import sys
 import json
+import warnings
 from pyquery import PyQuery as pq
 import time
 from os import path
@@ -19,6 +20,8 @@ from .config import config
 from .sele_crawler import crawl_sele
 from .common import *
 
+warnings.filterwarnings("ignore")
+
 def get_toc_from_cfg():
     if config['list'] and len(config['list']) > 0:
         return config['list']
@@ -34,6 +37,7 @@ def get_toc_from_cfg():
         headers=config['headers'],
         timeout=config['timeout'],
         proxies=config['proxy'],
+        verify=False,
     ).content.decode(config['encoding'], 'ignore')
     return get_toc(html, config['url'])
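
Each fetch in this commit gains `verify=False`, which requests interprets as skipping TLS certificate verification, and the `warnings.filterwarnings("ignore")` added above silences the `InsecureRequestWarning` that urllib3 would otherwise print for every such request. A minimal standalone sketch of the pattern (the URL is illustrative, not from the project):

    import warnings
    import requests

    # Silence urllib3's InsecureRequestWarning triggered by verify=False.
    warnings.filterwarnings("ignore")

    # verify=False skips certificate validation, which helps with self-signed
    # or broken certificates but removes protection against MITM attacks.
    html = requests.get('https://example.com', verify=False, timeout=10).text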
@@ -93,6 +97,7 @@ def tr_download_page(url, art, imgs):
         headers=config['headers'],
         timeout=config['timeout'],
         proxies=config['proxy'],
+        verify=False,
     ).content.decode(config['encoding'], 'ignore')
     print(f'{url} 下载成功')
     art.update(get_article(html, url))
@@ -105,7 +110,7 @@ def tr_download_page(url, art, imgs):
         time.sleep(config['wait'])
 
-def update_config(user_cfg):
+def update_config(cfg_fname, user_cfg):
     global get_toc
     global get_article
@@ -124,7 +129,8 @@ def update_config(user_cfg):
     set_img_pool(ThreadPoolExecutor(config['imgThreads']))
 
     if config['external']:
-        mod = load_module(config['external'])
+        ex_fname = path.join(path.dirname(cfg_fname), config['external'])
+        mod = load_module(ex_fname)
         get_toc = getattr(mod, 'get_toc', get_toc)
         get_article = getattr(mod, 'get_article', get_article)
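
`update_config` now takes the config file's path so that the `external` hook script is resolved relative to the config file's directory rather than the current working directory. A minimal sketch of the resolution, with hypothetical paths:

    from os import path

    cfg_fname = '/home/user/books/config.json'  # hypothetical config file location
    external = 'my_rules.py'                    # hypothetical config['external'] value

    # Anchoring to the config file's directory makes the external script load
    # correctly no matter where the crawler is launched from.
    ex_fname = path.join(path.dirname(cfg_fname), external)
    print(ex_fname)  # /home/user/books/my_rules.py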
@@ -178,7 +184,7 @@ def main():
         return
     user_cfg = json.loads(open(cfg_fname, encoding='utf-8').read())
-    update_config(user_cfg)
+    update_config(cfg_fname, user_cfg)
 
     if config['selenium']:
         crawl_sele()

@@ -22,8 +22,8 @@ config = {
     'readTimeout': 60,
     'imgSrc': ['data-src', 'data-original-src', 'src'],
     'proxy': '',
-    'textThreads': 5,
-    'imgThreads': 5,
+    'textThreads': 8,
+    'imgThreads': 8,
     'external': None,
     'checkStatus': False,
     'cache': True,
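
The default `textThreads` and `imgThreads` move from 5 to 8; they size the thread pools used for article and image downloads (the image pool is created with `set_img_pool(ThreadPoolExecutor(config['imgThreads']))` above). A minimal sketch of the underlying pattern, with illustrative values:

    from concurrent.futures import ThreadPoolExecutor

    text_pool = ThreadPoolExecutor(8)  # textThreads: concurrent page downloads
    img_pool = ThreadPoolExecutor(8)   # imgThreads: concurrent image downloads

    # Submitted tasks run up to 8 at a time per pool.
    futures = [text_pool.submit(len, url) for url in ['url1', 'url2']]
    results = [f.result() for f in futures]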

@@ -46,6 +46,7 @@ def tr_download_img(url, imgs, picname):
         retry=config['retry'],
         timeout=config['timeout'],
         proxies=config['proxy'],
+        verify=False,
     ).content
     print(f'{url} 下载成功')
     data = opti_img(data, config['optiMode'], config['colors']) or b''

publish.sh (new file)
@@ -0,0 +1,3 @@
+rm -rf dist
+python setup.py sdist bdist_wheel
+twine upload dist/* -u $(pip config get pypi.username) -p $(pip config get pypi.password)
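
The new publish.sh rebuilds the sdist and wheel and uploads them with twine; the credentials are read from pip's own config store, so they would presumably be set once beforehand with `pip config set pypi.username <name>` and `pip config set pypi.password <password>` (the `pypi.*` keys are this script's own convention, not options pip itself recognizes).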

@@ -1,4 +1,4 @@
-requests
+requests[socks]
 pyquery
 GenEpub
 imgyaso
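
Switching the dependency to `requests[socks]` pulls in the PySocks extra, so the `proxies=config['proxy']` plumbing seen above can point at a SOCKS proxy as well as a plain HTTP one. A minimal sketch with an illustrative local proxy address:

    import requests

    # socks5:// (or socks5h:// to resolve DNS through the proxy) only works
    # once the PySocks extra from requests[socks] is installed.
    proxies = {
        'http': 'socks5://127.0.0.1:1080',
        'https': 'socks5://127.0.0.1:1080',
    }
    resp = requests.get('https://example.com', proxies=proxies, timeout=10)
    print(resp.status_code)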