#!/usr/bin/env python3.7
# -*- coding: utf-8 -*-
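# Crawl a table of contents and its article pages, then pack the result
# into an EPUB (via GenEpub), optionally splitting it into size-limited parts.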
from urllib.parse import urljoin
import sys
import json
import yaml
import warnings
from pyquery import PyQuery as pq
import time
from os import path
import re
from concurrent.futures import ThreadPoolExecutor
import hashlib
from readability import Document
from GenEpub import gen_epub
from . import *
from .util import *
from .img import *
from .config import config
from .sele_crawler import crawl_sele
from .common import *
warnings.filterwarnings("ignore")
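
# Resolve the table of contents: an explicit `list` in the config wins;
# otherwise the index page at `url` is fetched and parsed by get_toc().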
def get_toc_from_cfg():
    if config['list'] and len(config['list']) > 0:
        return config['list']

    if not config['url']:
        print('URL not specified')
        sys.exit()

    html = request_retry(
        'GET', config['url'],
        retry=config['retry'],
        check_status=config['checkStatus'],
        headers=config['headers'],
        timeout=config['timeout'],
        proxies=config['proxy'],
        verify=False,
    ).content.decode(config['encoding'], 'ignore')
    return get_toc(html, config['url'])
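
# Extract the TOC from an index page: elements matching the `link` selector
# contribute their href (fragment stripped, resolved against `base`,
# de-duplicated); elements without an href contribute their text, which
# main() later turns into a section heading with empty content.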
def get_toc(html, base):
    root = pq(html)
    if config['remove']:
        root(config['remove']).remove()
    el_links = root(config['link'])
    vis = set()
    res = []
    for i in range(len(el_links)):
        el_link = el_links.eq(i)
        url = el_link.attr('href')
        if not url:
            text = el_link.text().strip()
            res.append(text)
            continue
        url = re.sub(r'#.*$', '', url)
        if base:
            url = urljoin(base, url)
        if not url.startswith('http'):
            continue
        if url in vis: continue
        vis.add(url)
        res.append(url)
    return res
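
# Wrapper used by the thread pool: a failure on one page is logged
# instead of propagating, so the remaining downloads keep running.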
def tr_download_page_safe(url, art, imgs):
    try:
        tr_download_page(url, art, imgs)
    except Exception as ex:
        print(f'{url} download failed: {ex}')
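
# Download a single article into `art` (a dict shared with main()),
# keyed in the local cache by the MD5 of its URL. Retries up to
# config['retry'] times when checkBlank is set and the extracted
# title or content comes back empty.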
def tr_download_page(url, art, imgs):
    hash_ = hashlib.md5(url.encode('utf-8')).hexdigest()
    cache = load_article(hash_)
    if cache is not None and config['cache']:
        print(f'{url} found in local cache')
        art.update(cache)
        art['content'] = process_img(
            art['content'], imgs,
            page_url=url,
            img_prefix='../Images/',
        )
        return

    for i in range(config['retry']):
        html = request_retry(
            'GET', url,
            retry=config['retry'],
            check_status=config['checkStatus'],
            headers=config['headers'],
            timeout=config['timeout'],
            proxies=config['proxy'],
            verify=False,
        ).content.decode(config['encoding'], 'ignore')
        r = get_article(html, url)
        if not config['checkBlank'] or \
           (r['title'] and r['content']):
            break
        if i == config['retry'] - 1:
            raise Exception(f'{url} title or content is empty')

    art.update(r)
    save_article(hash_, art)
    art['content'] = process_img(
        art['content'], imgs,
        page_url=url,
        img_prefix='../Images/',
    )
    print(f'{url} downloaded successfully')
    time.sleep(config['wait'])
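
# Merge the user's config into the global defaults, normalize the proxy into
# an {'http': ..., 'https': ...} dict, set up the image thread pool, and let
# an external module override get_toc()/get_article(). An illustrative
# minimal config (key names taken from this file; all other keys fall back
# to the defaults in .config):
#
#   {
#       "name": "my-book",
#       "url": "https://example.com/toc.html",
#       "link": ".toc a"
#   }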
def update_config(cfg_fname, user_cfg):
    global get_toc
    global get_article
    config.update(user_cfg)

    if not config['title']:
        config['title'] = 'title'
    if config['proxy']:
        proxies = {
            'http': config['proxy'],
            'https': config['proxy'],
        }
        config['proxy'] = proxies
    set_img_pool(ThreadPoolExecutor(config['imgThreads']))

    if config['external']:
        ex_fname = path.join(path.dirname(cfg_fname), config['external'])
        mod = load_module(ex_fname)
        get_toc = getattr(mod, 'get_toc', get_toc)
        get_article = getattr(mod, 'get_article', get_article)

    if not config['timeout']:
        config['timeout'] = (
            config['connTimeout'],
            config['readTimeout'],
        )
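
# When the collected images exceed config['sizeLimit'], emit the book as
# multiple EPUBs ("PT1", "PT2", ...). Image filenames are recovered from the
# rewritten <img src="../Images/<md5>.png"> references, and each part is
# prefixed with a cover page linking back to the source URL.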
def gen_epub_paging(articles, imgs, config):
    limit = size_str_to_int(config['sizeLimit'])
    art_part = []
    img_part = {}
    total = 0
    ipt = 1
    for a in articles:
        art_imgs = re.findall(r'src="\.\./Images/(\w{32}\.png)"', a['content'])
        size = sum(
            len(imgs.get(iname, b''))
            for iname in art_imgs
        )
        if total + size >= limit:
            art_part.insert(0, {
                'title': config['name'] + f' PT{ipt}',
                'content': f"<p>Source: <a href='{config['url']}'>{config['url']}</a></p>",
            })
            gen_epub(art_part, img_part)
            art_part = []
            img_part = {}
            total = 0
            ipt += 1
        art_part.append(a)
        img_part.update({
            iname: imgs.get(iname, b'')
            for iname in art_imgs
        })
        total += size
    if art_part:
        art_part.insert(0, {
            'title': config['name'] + f' PT{ipt}',
            'content': f"<p>Source: <a href='{config['url']}'>{config['url']}</a></p>",
        })
        gen_epub(art_part, img_part)
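
# Entry point: load the config (JSON/YAML, or a TXT file with one URL per
# line), then crawl every TOC entry on a thread pool and write the EPUB,
# paging it when the images exceed the size limit.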
def main():
    cfg_fname = sys.argv[1] \
        if len(sys.argv) > 1 \
        else 'config.json'
    if not path.exists(cfg_fname):
        print('config file not found')
        return

    ext = extname(cfg_fname).lower()
    with open(cfg_fname, encoding='utf-8') as f:
        cont = f.read()
    if ext == 'json':
        user_cfg = json.loads(cont)
    elif ext in ['yaml', 'yml']:
        user_cfg = yaml.safe_load(cont)
    elif ext == 'txt':
        # One URL per line; the filename (sans extension) becomes the book name.
        urls = [l.strip() for l in cont.split('\n')]
        urls = [l for l in urls if l]
        name = re.sub(r'\.\w+$', '', path.basename(cfg_fname))
        user_cfg = {
            'name': name,
            'url': urls[0] if urls else '',
            'list': urls,
        }
    else:
        print('config file must be JSON, YAML, or TXT')
        return
    update_config(cfg_fname, user_cfg)

    if config['selenium']:
        crawl_sele()
        return

    toc = get_toc_from_cfg()
    articles = []
    imgs = {}
    if config['name']:
        # Leading article that records the book title and source URL.
        articles.append({
            'title': config['name'],
            'content': f"<p>Source: <a href='{config['url']}'>{config['url']}</a></p>",
        })

    text_pool = ThreadPoolExecutor(config['textThreads'])
    hdls = []
    for url in toc:
        print(f'page: {url}')
        if not re.search(r'^https?://', url):
            # Non-URL TOC entries become section headings.
            articles.append({'title': url, 'content': ''})
            continue
        art = {}
        articles.append(art)
        hdl = text_pool.submit(tr_download_page_safe, url, art, imgs)
        hdls.append(hdl)
    for h in hdls: h.result()
    # Drop articles whose download failed and left the dict empty.
    articles = [art for art in articles if art]

    total = sum(len(v) for v in imgs.values())
    limit = size_str_to_int(config['sizeLimit'])
    if total <= limit:
        gen_epub(articles, imgs)
    else:
        gen_epub_paging(articles[1:], imgs, config)
    print('done...')

if __name__ == '__main__': main()