Mirror of https://github.com/apachecn/epub-crawler.git (synced 2025-06-06 17:56:57 +00:00)

Commit e153e2c4c4, 2023-02-14 13:10:54
Changelog: the commit retroactively inserts the missing v2023.1.18.0 section; the v2023.2.14.0 entry above it and the v2022.8.20.0 entries below it are unchanged context.

```diff
@@ -4,6 +4,10 @@ v2023.2.14.0
 + Added new config option `sizeLimit` to cap the total size of a single EPUB
 
+v2023.1.18.0
+
++ Fixed loading of external scripts
+
 v2022.8.20.0
 
 + Added Selenium support
 
```
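The `sizeLimit` option named in the pre-existing v2023.2.14.0 entry is not implemented anywhere in this diff. As a rough, purely hypothetical sketch of what such a cap involves, assuming the limit is expressed in bytes and checked against a running total of downloaded content:

```python
# Hypothetical sketch only; not the project's actual sizeLimit code.
def size_limit_reached(total_bytes, size_limit):
    # A falsy sizeLimit (0 or None) is treated as "no limit".
    return bool(size_limit) and total_bytes >= size_limit

size_limit = 50 * 1024 * 1024   # assumed unit: bytes (a 50 MiB cap)
total = 0
for chunk in (b'<html>...</html>', b'\x89PNG...'):  # placeholder article/image data
    if size_limit_reached(total + len(chunk), size_limit):
        break  # stop adding to this EPUB once the cap would be exceeded
    total += len(chunk)
```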
The package version is bumped to match the new changelog section:

```diff
@@ -10,4 +10,4 @@ from . import util
 __author__ = "ApacheCN"
 __email__ = "apachecn@163.com"
 __license__ = "SATA"
-__version__ = "2022.8.20.0"
+__version__ = "2023.1.18.0"
```
The crawler's main module gains a `warnings` import:

```diff
@@ -4,6 +4,7 @@
 from urllib.parse import urljoin
 import sys
 import json
+import warnings
 from pyquery import PyQuery as pq
 import time
 from os import path
```
followed by a module-level filter that suppresses all warnings at import time:

```diff
@@ -19,6 +20,8 @@ from .config import config
 from .sele_crawler import crawl_sele
 from .common import *
 
+warnings.filterwarnings("ignore")
+
 def get_toc_from_cfg():
     if config['list'] and len(config['list']) > 0:
         return config['list']
```
Certificate verification is then disabled on the TOC request in `get_toc_from_cfg()`:

```diff
@@ -34,6 +37,7 @@ def get_toc_from_cfg():
         headers=config['headers'],
         timeout=config['timeout'],
         proxies=config['proxy'],
+        verify=False,
     ).content.decode(config['encoding'], 'ignore')
     return get_toc(html, config['url'])
 
```
and likewise on the page request in `tr_download_page()` (the `下载成功` message means "download succeeded"):

```diff
@@ -93,6 +97,7 @@ def tr_download_page(url, art, imgs):
         headers=config['headers'],
         timeout=config['timeout'],
         proxies=config['proxy'],
+        verify=False,
     ).content.decode(config['encoding'], 'ignore')
     print(f'{url} 下载成功')
     art.update(get_article(html, url))
```
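`verify=False` turns off TLS certificate checking, so pages behind self-signed or expired certificates no longer abort the crawl; the price is that urllib3 emits an `InsecureRequestWarning` per request, which is exactly what the new module-level `warnings.filterwarnings("ignore")` suppresses. A minimal standalone reproduction (the URL is a placeholder):

```python
import warnings
import requests

# Without this filter, every verify=False request prints an
# InsecureRequestWarning from urllib3 to stderr.
warnings.filterwarnings("ignore")

# Placeholder URL; verify=False skips certificate validation entirely.
html = requests.get('https://example.com', verify=False).content
```

A narrower alternative would be `urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)`, which silences only that warning instead of every warning the process raises.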
`update_config()` now takes the config file's path as well:

```diff
@@ -105,7 +110,7 @@ def tr_download_page(url, art, imgs):
     time.sleep(config['wait'])
 
 
-def update_config(user_cfg):
+def update_config(cfg_fname, user_cfg):
     global get_toc
     global get_article
 
```
and uses it to resolve the external script relative to the config file rather than the current working directory:

```diff
@@ -124,7 +129,8 @@ def update_config(user_cfg):
     set_img_pool(ThreadPoolExecutor(config['imgThreads']))
 
     if config['external']:
-        mod = load_module(config['external'])
+        ex_fname = path.join(path.dirname(cfg_fname), config['external'])
+        mod = load_module(ex_fname)
         get_toc = getattr(mod, 'get_toc', get_toc)
         get_article = getattr(mod, 'get_article', get_article)
 
```
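This is the fix behind the "Fixed loading of external scripts" changelog entry: `config['external']` used to be passed to `load_module` as-is, so a relative path like `ext.py` resolved against whatever directory the crawler happened to be launched from; now it resolves against the config file's own directory. The project's `load_module` helper is not shown in this diff; a standard-library sketch of how such a helper is typically written (the names and paths below are assumptions):

```python
import importlib.util
from os import path

def load_module(fname):
    # Load a Python source file by path and return the module object;
    # a typical stand-in for the crawler's load_module helper.
    name = path.splitext(path.basename(fname))[0]
    spec = importlib.util.spec_from_file_location(name, fname)
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod

cfg_fname = 'books/mybook/config.json'   # placeholder config path
external = 'ext.py'                      # placeholder value of config['external']
ex_fname = path.join(path.dirname(cfg_fname), external)  # -> 'books/mybook/ext.py'
```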
The call site in `main()` passes the path through:

```diff
@@ -178,7 +184,7 @@ def main():
         return
 
     user_cfg = json.loads(open(cfg_fname, encoding='utf-8').read())
-    update_config(user_cfg)
+    update_config(cfg_fname, user_cfg)
 
     if config['selenium']:
         crawl_sele()
```
@ -22,8 +22,8 @@ config = {
|
|||||||
'readTimeout': 60,
|
'readTimeout': 60,
|
||||||
'imgSrc': ['data-src', 'data-original-src', 'src'],
|
'imgSrc': ['data-src', 'data-original-src', 'src'],
|
||||||
'proxy': '',
|
'proxy': '',
|
||||||
'textThreads': 5,
|
'textThreads': 8,
|
||||||
'imgThreads': 5,
|
'imgThreads': 8,
|
||||||
'external': None,
|
'external': None,
|
||||||
'checkStatus': False,
|
'checkStatus': False,
|
||||||
'cache': True,
|
'cache': True,
|
||||||
|
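`textThreads` and `imgThreads` size the page and image download pools; the earlier `update_config` hunk already showed the image pool being created with `set_img_pool(ThreadPoolExecutor(config['imgThreads']))`. A simplified illustration of the pattern, with a stub worker and placeholder URLs:

```python
from concurrent.futures import ThreadPoolExecutor

config = {'textThreads': 8, 'imgThreads': 8}

def download_page(url):
    # Stub standing in for the crawler's per-page worker (tr_download_page).
    return f'fetched {url}'

urls = [f'https://example.com/page/{i}' for i in range(20)]  # placeholders
with ThreadPoolExecutor(config['textThreads']) as pool:
    for res in pool.map(download_page, urls):
        print(res)
```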
@ -46,6 +46,7 @@ def tr_download_img(url, imgs, picname):
|
|||||||
retry=config['retry'],
|
retry=config['retry'],
|
||||||
timeout=config['timeout'],
|
timeout=config['timeout'],
|
||||||
proxies=config['proxy'],
|
proxies=config['proxy'],
|
||||||
|
verify=False,
|
||||||
).content
|
).content
|
||||||
print(f'{url} 下载成功')
|
print(f'{url} 下载成功')
|
||||||
data = opti_img(data, config['optiMode'], config['colors']) or b''
|
data = opti_img(data, config['optiMode'], config['colors']) or b''
|
||||||
|
A three-line `publish.sh` (new file) automates releases: clear `dist`, build sdist and wheel, upload with twine.

```diff
@@ -0,0 +1,3 @@
+rm -rf dist
+python setup.py sdist bdist_wheel
+twine upload dist/* -u $(pip config get pypi.username) -p $(pip config get pypi.password)
```
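The script reads the PyPI credentials from pip's config store instead of hardcoding them, so they would be seeded once beforehand with something like `pip config set pypi.username <name>` and `pip config set pypi.password <token>` (pip treats `pypi` as just another config section and simply stores and returns the values). Note that this leaves the password in a plain-text pip config file; twine's usual `~/.pypirc` or keyring integration is the more conventional home for upload credentials.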
Finally, the `requests` requirement gains the `socks` extra:

```diff
@@ -1,4 +1,4 @@
-requests
+requests[socks]
 pyquery
 GenEpub
 imgyaso
```
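The `socks` extra pulls in PySocks, presumably so the crawler's `proxy` setting can point at a SOCKS proxy as well as an HTTP one. With PySocks installed, requests accepts `socks5://` URLs (or `socks5h://`, which also routes DNS lookups through the proxy) in its `proxies` mapping; the address below is a placeholder:

```python
import requests

# Placeholder SOCKS5 proxy address; requires `pip install requests[socks]`.
proxies = {
    'http': 'socks5h://127.0.0.1:1080',
    'https': 'socks5h://127.0.0.1:1080',
}
html = requests.get('https://example.com', proxies=proxies).content
```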