Mirror of https://github.com/apachecn/epub-crawler.git

commit 8d76cc195b (2023-03-21 13:40:38)
parent 421f489415
@@ -97,16 +97,22 @@ def wait_content_cb(driver):
     ''', config['title'], config['content'])
 
 
 def download_page(url, art, imgs):
+    print(url)
+    hash = hashlib.md5(url.encode('utf-8')).hexdigest()
+    cache = load_article(hash)
+    if cache is not None and config['cache']:
+        print(f'{url} 已存在于本地缓存中')
+        art.update(cache)
+        art['content'] = process_img(
+            art['content'], imgs,
+            page_url=url,
+            img_prefix='../Images/',
+        )
+        return
     if not hasattr(trlocal, 'driver'):
         trlocal.driver = create_driver()
         drivers.append(trlocal.driver)
     driver = trlocal.driver
 
-    if not re.search(r'^https?://', url):
-        articles.append({'title': url, 'content': ''})
-        return
     driver.get(url)
     # 显式等待
     if config['waitContent']:
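This hunk adds a cache fast path to download_page: the URL's MD5 hex digest keys a locally saved article, and when config['cache'] is enabled and the entry exists (the message '已存在于本地缓存中' means "already exists in the local cache"), the cached article is reused and only its image references are rewritten via process_img. The load_article/save_article helpers are not part of this diff; below is a minimal sketch of the contract they appear to satisfy, assuming a JSON-file-per-article layout under a hypothetical cache/ directory (the path and file format are assumptions, not the project's confirmed layout):

import hashlib
import json
from pathlib import Path

CACHE_DIR = Path('cache')  # assumed location; the real path is defined elsewhere in the repo

def load_article(hash):
    # Return the cached article dict for this URL hash, or None on a miss.
    f = CACHE_DIR / f'{hash}.json'
    if not f.exists():
        return None
    return json.loads(f.read_text(encoding='utf-8'))

def save_article(hash, art):
    # Persist the article dict so the next run can skip the download.
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    (CACHE_DIR / f'{hash}.json').write_text(
        json.dumps(art, ensure_ascii=False), encoding='utf-8')

# Key derivation, exactly as in the diff:
# hash = hashlib.md5(url.encode('utf-8')).hexdigest()

Keying on the MD5 of the URL keeps cache filenames filesystem-safe no matter what characters the URL contains.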
@@ -114,6 +120,8 @@ def download_page(url, art, imgs):
         .until(wait_content_cb, "无法获取标题或内容")
     html = driver.find_element_by_css_selector('body').get_attribute('outerHTML')
     art.update(get_article(html, url))
+    save_article(hash, art)
+    print(f'{url} 下载成功')
     art['content'] = process_img(art['content'], imgs, page_url=url, img_prefix='../Images/')
     time.sleep(config['wait'])
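After a successful fetch, the page is now written back to the cache (save_article) and '下载成功' ("downloaded successfully") is logged. The explicit-wait branch above it ('# 显式等待' means "explicit wait") blocks until wait_content_cb reports that the page has rendered, failing with '无法获取标题或内容' ("could not get the title or content") on timeout. The first hunk shows wait_content_cb ending by passing config['title'] and config['content'] into a call, plausibly driver.execute_script with a probe script; here is a behaviorally equivalent Python-side sketch, with the 30-second timeout and the element probing being assumptions:

from selenium.webdriver.support.ui import WebDriverWait

def wait_content_cb(driver):
    # Truthy once both the title and the content selectors match something;
    # WebDriverWait keeps polling while the callback returns a falsy value.
    title = driver.find_elements_by_css_selector(config['title'])
    content = driver.find_elements_by_css_selector(config['content'])
    return bool(title and content)

WebDriverWait(driver, 30).until(  # timeout value assumed
    wait_content_cb, '无法获取标题或内容')  # used as the TimeoutException message

The old find_element(s)_by_css_selector API matches the Selenium 3 style used elsewhere in this diff; Selenium 4 replaced it with find_elements(By.CSS_SELECTOR, ...).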
@@ -147,6 +155,9 @@ def crawl_sele():
     pool = ThreadPoolExecutor(config['textThreads'])
     hdls = []
     for url in config['list']:
+        if not re.search(r'^https?://', url):
+            articles.append({'title': url, 'content': ''})
+            continue
         art = {}
         articles.append(art)
         h = pool.submit(download_page_safe, url, art, imgs)
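crawl_sele now filters list entries up front: anything that does not start with http:// or https:// is treated as a bare section title and becomes an empty placeholder article, using continue where the same check inside download_page (removed in the first hunk) used return. Each pool worker then gets its own Selenium driver through the thread-local trlocal, created lazily on first use and recorded in drivers for eventual cleanup. A condensed sketch of that pattern, assuming trlocal = threading.local() and the repo's create_driver() factory:

import threading
from concurrent.futures import ThreadPoolExecutor

trlocal = threading.local()
drivers = []  # every driver ever created, so the crawler can quit() them all at shutdown

def get_thread_driver():
    # One driver per worker thread: created on first use, then reused
    # by every task scheduled onto that thread.
    if not hasattr(trlocal, 'driver'):
        trlocal.driver = create_driver()  # factory from the repo, signature assumed
        drivers.append(trlocal.driver)    # list.append is atomic under CPython's GIL
    return trlocal.driver

pool = ThreadPoolExecutor(config['textThreads'])  # worker count from config, as in the diff

Reusing one driver per thread avoids launching a fresh browser for every page while keeping the Selenium sessions isolated between threads.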