#!/usr/bin/env python3.7
# -*- coding: utf-8 -*-

from urllib.parse import urljoin
import sys
import json
from pyquery import PyQuery as pq
import time
from os import path
import re
from concurrent.futures import ThreadPoolExecutor
from GenEpub import gen_epub
from . import *
from .util import *
from .img import *
from .config import config
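
# Note: request_retry, process_img and set_img_pool are not defined in this
# file; they are presumably pulled in by the star imports from .util and .img
# above, and `config` holds the default settings that main() overlays with the
# user's config.json.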

def get_toc_from_cfg():
    # Return the table of contents: either the explicit page list from the
    # config, or the links scraped from the TOC page at config['url'].
    if config['list'] and len(config['list']) > 0:
        return config['list']

    if not config['url']:
        print('URL not specified')
        sys.exit()

    html = request_retry(
        'GET', config['url'],
        retry=config['retry'],
        headers=config['headers'],
        timeout=config['timeout'],
        proxies=config['proxy'],
    ).content.decode(config['encoding'])
    return get_toc(html, config['url'])

def get_toc(html, base):
    # Parse the TOC page and return a list of chapter URLs; link elements
    # without an href are kept as plain section titles.
    root = pq(html)

    if config['remove']:
        root(config['remove']).remove()

    el_links = root(config['link'])
    vis = set()

    res = []
    for i in range(len(el_links)):
        el_link = el_links.eq(i)
        url = el_link.attr('href')
        if not url:
            # No href: keep the link text as a section-title entry.
            text = el_link.text().strip()
            res.append(text)
            continue

        # Strip fragments, resolve relative URLs, and de-duplicate.
        url = re.sub(r'#.*$', '', url)
        if base:
            url = urljoin(base, url)
        if not url.startswith('http'):
            continue
        if url in vis: continue
        vis.add(url)
        res.append(url)

    return res
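
# A minimal sketch of how get_toc behaves, assuming config['link'] is
# 'ul.toc a' and config['remove'] is empty (illustrative selector values,
# not part of this project's defaults):
#
#   html = '''
#   <ul class="toc">
#     <li><a href="/ch1.html">Chapter 1</a></li>
#     <li><a href="/ch1.html#sec2">Chapter 1, Section 2</a></li>
#     <li><a>Part II</a></li>
#   </ul>
#   '''
#   get_toc(html, 'https://example.com/book/')
#   # => ['https://example.com/ch1.html', 'Part II']
#   # the '#sec2' duplicate is dropped and the href-less link becomes a title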

def get_article(html, url):
    # Extract a single chapter: the first element matching config['title'] is
    # the chapter title, and all elements matching config['content'] are
    # concatenated as the chapter body.
    root = pq(html)

    if config['remove']:
        root(config['remove']).remove()

    el_title = root(config['title']).eq(0)
    title = el_title.text().strip()
    el_title.remove()

    el_co = root(config['content'])
    co = '\r\n'.join([
        el_co.eq(i).html()
        for i in range(len(el_co))
    ])

    if config['credit']:
        # Prepend a link back to the original page.
        credit = f"<blockquote>原文:<a href='{url}'>{url}</a></blockquote>"
        co = credit + co

    return {'title': title, 'content': co}
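
# A minimal sketch of get_article, assuming config['title'] is 'h1' and
# config['content'] is '#content' (again purely illustrative selectors):
#
#   html = '<h1>Chapter 1</h1><div id="content"><p>Hello</p></div>'
#   get_article(html, 'https://example.com/ch1.html')
#   # with config['credit'] falsy this returns
#   # {'title': 'Chapter 1', 'content': '<p>Hello</p>'}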

def tr_download_page(url, art, imgs):
    # Worker for the text thread pool: download one page, extract the article,
    # localize its images, then sleep to throttle requests. Results are written
    # into the caller-provided `art` dict so chapter order is preserved.
    try:
        html = request_retry(
            'GET', url,
            retry=config['retry'],
            headers=config['headers'],
            timeout=config['timeout'],
            proxies=config['proxy'],
        ).content.decode(config['encoding'])
        art.update(get_article(html, url))
        art['content'] = process_img(
            art['content'], imgs,
            page_url=url,
            img_prefix='../Images/',
        )
        time.sleep(config['wait'])
    except Exception as ex:
        print(ex)
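
# For reference, main() below reads a config.json covering roughly the keys
# used throughout this file. The real defaults live in .config and are not
# shown here, so this is only an illustrative sketch, not the authoritative
# schema:
#
#   {
#     "name": "Example Book",
#     "url": "https://example.com/book/",
#     "link": "ul.toc a",
#     "title": "h1",
#     "content": "#content",
#     "remove": ".ads",
#     "credit": true,
#     "encoding": "utf-8",
#     "retry": 3,
#     "timeout": 10,
#     "wait": 1,
#     "proxy": "",
#     "headers": {"User-Agent": "Mozilla/5.0"},
#     "textThreads": 5,
#     "imgThreads": 5,
#     "list": []
#   }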

def main():
    # Load the config (path from argv[1], default config.json).
    cfg_fname = sys.argv[1] \
        if len(sys.argv) > 1 \
        else 'config.json'
    if not path.exists(cfg_fname):
        print('please provide config file')
        return

    user_cfg = json.loads(open(cfg_fname, encoding='utf-8').read())
    config.update(user_cfg)
    if config['proxy']:
        # Expand a single proxy URL into the requests-style proxies dict.
        proxies = {
            'http': config['proxy'],
            'https': config['proxy'],
        }
        config['proxy'] = proxies
    set_img_pool(ThreadPoolExecutor(config['imgThreads']))

    toc = get_toc_from_cfg()
    articles = []
    imgs = {}
    if config['name']:
        # The first "article" is a cover page linking back to the source site.
        articles.append({
            'title': config['name'],
            'content': f"<p>来源:<a href='{config['url']}'>{config['url']}</a></p>",
        })

    # Download chapters concurrently; placeholder dicts keep TOC order.
    text_pool = ThreadPoolExecutor(config['textThreads'])
    hdls = []
    for url in toc:
        print(f'page: {url}')
        if url.startswith('http'):
            art = {}
            articles.append(art)
            hdl = text_pool.submit(tr_download_page, url, art, imgs)
            hdls.append(hdl)
        else:
            # Non-URL entries from the TOC become empty section pages.
            articles.append({'title': url, 'content': ''})

    # Wait for all downloads to finish before building the book.
    for h in hdls: h.result()

    gen_epub(articles, imgs)
    print('done...')


if __name__ == '__main__': main()