2023-03-21 13:40:38

2025-06-03 23:58:27 +00:00 · 2023-03-21 13:40:38 +08:00 · 2023-03-21 13:40:38 +08:00 · 8d76cc195b
commit 8d76cc195b
parent 421f489415
1 changed files with 16 additions and 5 deletions
--- a/EpubCrawler/sele_crawler.py
+++ b/EpubCrawler/sele_crawler.py
@ -97,16 +97,22 @@ def wait_content_cb(driver):
    ''', config['title'], config['content'])

 def download_page(url, art, imgs):
-    print(url)
+    hash = hashlib.md5(url.encode('utf-8')).hexdigest()
+    cache = load_article(hash)
+    if cache is not None and config['cache']:
+        print(f'{url} 已存在于本地缓存中')
+        art.update(cache)
+        art['content'] = process_img(
+            art['content'], imgs,
+            page_url=url,
+            img_prefix='../Images/',
+        )
+        return
    
    if not hasattr(trlocal, 'driver'):
        trlocal.driver = create_driver()
        drivers.append(trlocal.driver)
    driver = trlocal.driver
-    
-    if not re.search(r'^https?://', url):
-        articles.append({'title': url, 'content': ''})
-        return
    driver.get(url)
    # 显式等待
    if config['waitContent']:
@ -114,6 +120,8 @@ def download_page(url, art, imgs):
            .until(wait_content_cb, "无法获取标题或内容")
    html = driver.find_element_by_css_selector('body').get_attribute('outerHTML')
    art.update(get_article(html, url))
+    save_article(hash, art)
+    print(f'{url} 下载成功')
    art['content'] = process_img(art['content'], imgs, page_url=url, img_prefix='../Images/')
    time.sleep(config['wait'])

@ -147,6 +155,9 @@ def crawl_sele():
    pool = ThreadPoolExecutor(config['textThreads'])
    hdls = []
    for url in config['list']:
+        if not re.search(r'^https?://', url):
+            articles.append({'title': url, 'content': ''})
+            continue
        art = {}
        articles.append(art)
        h = pool.submit(download_page_safe, url, art, imgs)