from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from GenEpub import gen_epub
from urllib.parse import urljoin
import sys
import json
import re
import hashlib
import base64
import time
from concurrent.futures import ThreadPoolExecutor
import threading
import traceback
from .util import *
from .img import process_img
from .config import config
from .common import *

# Each worker thread keeps its own WebDriver in thread-local storage;
# `drivers` collects them all so they can be shut down at the end.
trlocal = threading.local()
drivers = []

# Draws an <img> onto a <canvas> and reads it back as a PNG data URL,
# so image bytes can be grabbed without a second HTTP request.
JS_GET_IMG_B64 = '''
function getImageBase64(img_stor) {
    var img = document.querySelector(img_stor)
    if (!img) return ''
    var canvas = document.createElement("canvas");
    canvas.width = img.width;
    canvas.height = img.height;
    var ctx = canvas.getContext("2d");
    ctx.drawImage(img, 0, 0, img.width, img.height);
    var dataURL = canvas.toDataURL("image/png");
    return dataURL;
}
'''

# Older image-processing helpers, kept commented out for reference;
# superseded by process_img imported from .img.
'''
def get_img_src(el_img):
    url = ''
    for prop in config['imgSrc']:
        url = el_img.attr(prop)
        if url: break
    return url

def process_img_data_url(url, el_img, imgs, **kw):
    if not re.search(RE_DATA_URL, url):
        return False
    picname = hashlib.md5(url.encode('utf-8')).hexdigest() + '.png'
    print(f'pic: {url} => {picname}')
    if picname not in imgs:
        enco_data = re.sub(RE_DATA_URL, '', url)
        data = base64.b64decode(enco_data.encode('utf-8'))
        data = opti_img(data, config['optiMode'], config['colors'])
        imgs[picname] = data
    el_img.attr('src', kw['img_prefix'] + picname)
    return True

def process_img(driver, html, imgs, **kw):
    kw.setdefault('img_prefix', 'img/')
    root = pq(html)
    el_imgs = root('img')
    for i in range(len(el_imgs)):
        el_img = el_imgs.eq(i)
        url = get_img_src(el_img)
        if not url: continue
        if process_img_data_url(url, el_img, imgs, **kw):
            continue
        if not url.startswith('http'):
            if kw.get('page_url'):
                url = urljoin(kw.get('page_url'), url)
            else: continue
        picname = hashlib.md5(url.encode('utf-8')).hexdigest() + '.png'
        print(f'pic: {url} => {picname}')
        if picname not in imgs:
            try:
                driver.get(url)
                b64 = driver.execute_script(
                    JS_GET_IMG_B64 + '\nreturn getImageBase64("body>img")')
                print(b64[:100])
                process_img_data_url(b64, el_img, imgs, **kw)
                time.sleep(config['wait'])
            except Exception as ex:
                print(ex)
    return root.html()
'''

def wait_content_cb(driver):
    # True once both the title selector and the content selector match.
    return driver.execute_script('''
        var titlePresent = document.querySelector(arguments[0]) != null
        var contPresent = document.querySelector(arguments[1]) != null
        return titlePresent && contPresent
    ''', config['title'], config['content'])

def download_page(url, art, imgs):
    hash = hashlib.md5(url.encode('utf-8')).hexdigest()
    cache = load_article(hash)
    if cache is not None and config['cache']:
        print(f'{url} already in local cache')
        art.update(cache)
        art['content'] = process_img(
            art['content'], imgs,
            page_url=url, img_prefix='../Images/',
        )
        return
    # one WebDriver per worker thread, created lazily
    if not hasattr(trlocal, 'driver'):
        trlocal.driver = create_driver()
        drivers.append(trlocal.driver)
    driver = trlocal.driver
    driver.get(url)
    # explicit wait until the title and content elements are present
    if config['waitContent']:
        WebDriverWait(driver, config['waitContent'], 0.5) \
            .until(wait_content_cb, 'failed to find title or content')
    html = driver.find_element(By.CSS_SELECTOR, 'body').get_attribute('outerHTML')
    art.update(get_article(html, url))
    save_article(hash, art)
    print(f'{url} downloaded successfully')
    art['content'] = process_img(
        art['content'], imgs,
        page_url=url, img_prefix='../Images/',
    )
    time.sleep(config['wait'])

def download_page_safe(url, art, imgs):
    try:
        download_page(url, art, imgs)
    except Exception:
        traceback.print_exc()
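# For reference, a minimal sketch of the cache helpers called above
# (an assumption for illustration only: load_article and save_article are
# pulled in by the star imports from .util / .common, and the real
# implementations may differ). The sketch assumes articles are cached as
# JSON files named after the URL's MD5 hash, under a hypothetical cache_dir:
#
# import os, json
#
# def load_article(hash):
#     fname = os.path.join(cache_dir, hash + '.json')
#     if not os.path.isfile(fname):
#         return None  # cache miss
#     with open(fname, encoding='utf-8') as f:
#         return json.load(f)
#
# def save_article(hash, art):
#     fname = os.path.join(cache_dir, hash + '.json')
#     with open(fname, 'w', encoding='utf-8') as f:
#         json.dump(art, f, ensure_ascii=False)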
def create_driver():
    options = Options()
    if not config['debug']:
        options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--log-level=3')
    driver = webdriver.Chrome(options=options)
    # visit the target site first: Selenium only accepts cookies
    # for the domain of the page that is currently loaded
    driver.get(config['url'])
    for kv in config.get('headers', {}).get('Cookie', '').split('; '):
        # split only on the first '=' so values containing '=' survive
        kv = kv.split('=', 1)
        if len(kv) < 2:
            continue
        driver.add_cookie({'name': kv[0], 'value': kv[1]})
    # reload so the page is fetched with the cookies applied
    driver.get(config['url'])
    return driver
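# Illustrative config for this module (hypothetical values — the real
# config is loaded by .config; only the keys read in this file are shown):
#
# config = {
#     'url': 'https://example.com/book',   # site root; also the cookie domain
#     'list': ['https://example.com/ch1',  # chapter URLs, in reading order
#              'Part Two'],                # non-URL entries become headings
#     'name': 'My Book',        # EPUB title, used for the cover page
#     'title': 'h1',            # CSS selector for the article title
#     'content': '#content',    # CSS selector for the article body
#     'cache': True,            # reuse locally cached pages
#     'wait': 1,                # seconds to sleep between pages
#     'waitContent': 10,        # explicit-wait timeout in seconds, 0 to disable
#     'textThreads': 4,         # size of the download thread pool
#     'debug': False,           # True runs Chrome with a visible window
#     'headers': {'Cookie': 'k1=v1; k2=v2'},  # cookies injected into the driver
# }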

def crawl_sele():
    # the first "article" is a cover page recording where the book came from
    articles = [{
        'title': config['name'],
        'content': "<p>Source: " + config['url'] + "</p>"
    }]
    imgs = {}
    pool = ThreadPoolExecutor(config['textThreads'])
    hdls = []
    for url in config['list']:
        if not re.search(r'^https?://', url):
            # non-URL entries act as section headings with an empty body
            articles.append({'title': url, 'content': ''})
            continue
        art = {}
        articles.append(art)
        h = pool.submit(download_page_safe, url, art, imgs)
        hdls.append(h)
        # download_page_safe(driver, url, articles, imgs)
    for h in hdls:
        h.result()
    # drop entries left empty by failed downloads
    articles = [art for art in articles if art]
    gen_epub(articles, imgs)
    for d in drivers:
        # quit() ends the ChromeDriver session; close() would only close the window
        d.quit()