# EpubCrawler/sele_crawler.py
# Mirror of https://github.com/apachecn/epub-crawler
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from GenEpub import gen_epub
from urllib.parse import urljoin
import sys
import json
import re
import hashlib
import base64
import time
from concurrent.futures import ThreadPoolExecutor
import threading
import traceback
from .util import *
from .img import process_img
from .config import config
from .common import *
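# Each worker thread lazily creates its own Chrome instance and keeps it in
# thread-local storage; `drivers` collects every instance so that crawl_sele()
# can shut them all down once the crawl finishes.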
trlocal = threading.local()
drivers = []
JS_GET_IMG_B64 = '''
function getImageBase64(img_stor) {
    var img = document.querySelector(img_stor)
    if (!img) return ''
    var canvas = document.createElement("canvas");
    canvas.width = img.width;
    canvas.height = img.height;
    var ctx = canvas.getContext("2d");
    ctx.drawImage(img, 0, 0, img.width, img.height);
    var dataURL = canvas.toDataURL("image/png");
    return dataURL;
}
'''
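# JS_GET_IMG_B64 draws an <img> element onto an offscreen <canvas> and reads
# it back via toDataURL(), so images are fetched through the browser session
# itself (same cookies and headers) instead of a separate HTTP client.
# The block below is an earlier selenium-based implementation of process_img(),
# kept commented out for reference; the active version is imported from .img.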
'''
def get_img_src(el_img):
    url = ''
    for prop in config['imgSrc']:
        url = el_img.attr(prop)
        if url: break
    return url

def process_img_data_url(url, el_img, imgs, **kw):
    if not re.search(RE_DATA_URL, url):
        return False
    picname = hashlib.md5(url.encode('utf-8')).hexdigest() + '.png'
    print(f'pic: {url} => {picname}')
    if picname not in imgs:
        enco_data = re.sub(RE_DATA_URL, '', url)
        data = base64.b64decode(enco_data.encode('utf-8'))
        data = opti_img(data, config['optiMode'], config['colors'])
        imgs[picname] = data
    el_img.attr('src', kw['img_prefix'] + picname)
    return True

def process_img(driver, html, imgs, **kw):
    kw.setdefault('img_prefix', 'img/')
    root = pq(html)
    el_imgs = root('img')
    for i in range(len(el_imgs)):
        el_img = el_imgs.eq(i)
        url = get_img_src(el_img)
        if not url: continue
        if process_img_data_url(url, el_img, imgs, **kw):
            continue
        if not url.startswith('http'):
            if kw.get('page_url'):
                url = urljoin(kw.get('page_url'), url)
            else: continue
        picname = hashlib.md5(url.encode('utf-8')).hexdigest() + '.png'
        print(f'pic: {url} => {picname}')
        if picname not in imgs:
            try:
                driver.get(url)
                b64 = driver.execute_script(
                    JS_GET_IMG_B64 + '\nreturn getImageBase64("body>img")')
                print(b64[:100])
                process_img_data_url(b64, el_img, imgs, **kw)
                time.sleep(config['wait'])
            except Exception as ex: print(ex)
    return root.html()
'''
def wait_content_cb(driver):
    # true once both the title and the content selector match something
    return driver.execute_script('''
        var titlePresent = document.querySelector(arguments[0]) != null
        var contPresent = document.querySelector(arguments[1]) != null
        return titlePresent && contPresent
    ''', config['title'], config['content'])
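# download_page() fills `art` in place: it serves the article from the local
# cache when config['cache'] is on, otherwise drives a thread-local browser to
# the page, waits for the configured selectors, extracts the article, and
# caches the result before localizing its images.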
def download_page(url, art, imgs):
    url_hash = hashlib.md5(url.encode('utf-8')).hexdigest()
    cache = load_article(url_hash)
    if cache is not None and config['cache']:
        print(f'{url} already in the local cache')
        art.update(cache)
        art['content'] = process_img(
            art['content'], imgs,
            page_url=url,
            img_prefix='../Images/',
        )
        return
    # lazily create one driver per worker thread
    if not hasattr(trlocal, 'driver'):
        trlocal.driver = create_driver()
        drivers.append(trlocal.driver)
    driver = trlocal.driver
    driver.get(url)
    # explicit wait for the title and content selectors
    if config['waitContent']:
        WebDriverWait(driver, config['waitContent'], 0.5) \
            .until(wait_content_cb, 'could not find the title or content')
    html = driver.find_element(By.CSS_SELECTOR, 'body').get_attribute('outerHTML')
    art.update(get_article(html, url))
    save_article(url_hash, art)
    print(f'{url} downloaded')
    art['content'] = process_img(art['content'], imgs, page_url=url, img_prefix='../Images/')
    time.sleep(config['wait'])
def download_page_safe(url, art, imgs):
    # never let one failed page kill the whole crawl
    try: download_page(url, art, imgs)
    except Exception: traceback.print_exc()
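# Note on cookie injection below: Selenium only accepts add_cookie() for the
# domain of the page currently loaded, so the driver visits config['url'] once
# before injecting the cookies parsed from the configured Cookie header, then
# reloads the page with those cookies applied.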
def create_driver():
    options = Options()
    if not config['debug']:
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--log-level=3')
    driver = webdriver.Chrome(options=options)
    # load the site once so add_cookie() accepts cookies for its domain
    driver.get(config['url'])
    for kv in config.get('headers', {}).get('Cookie', '').split('; '):
        kv = kv.split('=', 1)  # split once: cookie values may contain '='
        if len(kv) < 2: continue
        driver.add_cookie({'name': kv[0], 'value': kv[1]})
    # reload the page with the cookies applied
    driver.get(config['url'])
    return driver
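# crawl_sele() is the entry point: entries of config['list'] that are not
# http(s) URLs become empty placeholder chapters (useful as section titles),
# real URLs are downloaded concurrently on a thread pool, and the collected
# articles and images are bundled into an EPUB by gen_epub().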
def crawl_sele():
    articles = [{
        'title': config['name'],
        'content': f"<p>Source: <a href='{config['url']}'>{config['url']}</a></p>",
    }]
    imgs = {}
    pool = ThreadPoolExecutor(config['textThreads'])
    hdls = []
    for url in config['list']:
        # non-URL entries become empty placeholder chapters
        if not re.search(r'^https?://', url):
            articles.append({'title': url, 'content': ''})
            continue
        art = {}
        articles.append(art)
        h = pool.submit(download_page_safe, url, art, imgs)
        hdls.append(h)
        # serial alternative: download_page_safe(url, art, imgs)
    for h in hdls: h.result()
    # drop articles whose download failed and left the dict empty
    articles = [art for art in articles if art]
    gen_epub(articles, imgs)
    for d in drivers: d.quit()
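# Minimal usage sketch. All values here are hypothetical; the actual schema of
# `config` (and its defaults) is defined in .config and the project README:
#
#   config.update({
#       'name':        'Example Book',
#       'url':         'https://example.com/book',
#       'list':        ['https://example.com/book/ch1'],
#       'title':       'h1',          # CSS selector of the title element
#       'content':     '#content',    # CSS selector of the body element
#       'cache':       True,          # reuse locally cached articles
#       'wait':        1,             # seconds to sleep between pages
#       'waitContent': 10,            # explicit-wait timeout in seconds
#       'debug':       False,         # False runs Chrome headless
#       'textThreads': 5,             # size of the download thread pool
#   })
#   crawl_sele()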