1
0
mirror of https://github.com/apachecn/epub-crawler.git synced 2025-06-03 23:58:27 +00:00

2023-03-21 13:40:38

This commit is contained in:
wizardforcel 2023-03-21 13:40:38 +08:00
parent 421f489415
commit 8d76cc195b

View File

@@ -97,16 +97,22 @@ def wait_content_cb(driver):
''', config['title'], config['content'])
def download_page(url, art, imgs):
print(url)
hash = hashlib.md5(url.encode('utf-8')).hexdigest()
cache = load_article(hash)
if cache is not None and config['cache']:
print(f'{url} 已存在于本地缓存中')
art.update(cache)
art['content'] = process_img(
art['content'], imgs,
page_url=url,
img_prefix='../Images/',
)
return
if not hasattr(trlocal, 'driver'):
trlocal.driver = create_driver()
drivers.append(trlocal.driver)
driver = trlocal.driver
if not re.search(r'^https?://', url):
articles.append({'title': url, 'content': ''})
return
driver.get(url)
# 显式等待
if config['waitContent']:
@@ -114,6 +120,8 @@ def download_page(url, art, imgs):
.until(wait_content_cb, "无法获取标题或内容")
html = driver.find_element_by_css_selector('body').get_attribute('outerHTML')
art.update(get_article(html, url))
save_article(hash, art)
print(f'{url} 下载成功')
art['content'] = process_img(art['content'], imgs, page_url=url, img_prefix='../Images/')
time.sleep(config['wait'])
@@ -147,6 +155,9 @@ def crawl_sele():
pool = ThreadPoolExecutor(config['textThreads'])
hdls = []
for url in config['list']:
if not re.search(r'^https?://', url):
articles.append({'title': url, 'content': ''})
continue
art = {}
articles.append(art)
h = pool.submit(download_page_safe, url, art, imgs)