# Mirror of https://github.com/apachecn/epub-crawler.git
# Synced 2025-06-06 09:08:42 +00:00
# 33 lines, 927 B, Python
import re
from html import escape

from pyquery import PyQuery as pq
from readability import Document

from .config import config


def get_article(html, url):
    """Extract the title and main content of an article from raw HTML.

    Behaviour is driven by the module-level ``config`` mapping:

    * ``remove``  -- selector of nodes deleted from the tree first (if truthy).
    * ``title``   -- selector of the title node; only the first match is used.
    * ``content`` -- selector of the content nodes. When falsy, the content is
      taken from readability's ``Document.summary()`` instead.
    * ``credit``  -- when truthy, a blockquote linking back to ``url`` is
      prepended to the content.

    Args:
        html: Page markup as a string.
        url: Address of the page; only used to build the credit block.

    Returns:
        A dict ``{'title': str, 'content': str}`` where ``content`` is an
        HTML fragment (empty string when nothing could be extracted).
    """
    # Strip the XML declaration and the default namespace before parsing
    # (presumably so that plain selectors match XHTML input -- confirm).
    html = re.sub(r'<\?xml[^>]*\?>', '', html)
    html = re.sub(r'xmlns=".+?"', '', html)
    root = pq(html)

    if config['remove']:
        root(config['remove']).remove()

    el_title = root(config['title']).eq(0)
    # text() can yield None for an empty selection on some pyquery versions;
    # treat that as an empty title instead of crashing on .strip().
    title = (el_title.text() or '').strip()
    # The title node is dropped from the tree before the content is read.
    el_title.remove()

    if config['content']:
        # One HTML fragment per matched node; html() may be None, which
        # str.join() would reject, so fall back to an empty string.
        co = '\r\n'.join(
            el.html() or '' for el in root(config['content']).items()
        )
    else:
        summary = Document(str(root)).summary()
        # No <body> in the summary means html() is None; keep co a string so
        # the credit concatenation below cannot fail.
        co = pq(summary).find('body').html() or ''

    if config['credit']:
        # Escape the URL: a raw '&' or quote would break the single-quoted
        # href attribute and produce ill-formed markup.
        safe_url = escape(url, quote=True)
        credit = f"<blockquote>原文:<a href='{safe_url}'>{safe_url}</a></blockquote>"
        co = credit + co

    return {'title': title, 'content': co}