1
0
mirror of https://github.com/apachecn/epub-crawler.git synced 2025-06-06 09:08:42 +00:00
2022-08-20 21:25:43 +08:00

33 lines
927 B
Python

import re
from pyquery import PyQuery as pq
from .config import config
from readability import Document
def get_article(html, url):
    """Extract the title and main content of an article from a page.

    Behaviour is driven by the module-level ``config`` mapping:

    * ``config['remove']``  -- selector of elements to delete before
      extraction (skipped when falsy).
    * ``config['title']``   -- selector of the title; only the first match
      is used, and it is removed from the tree afterwards.
    * ``config['content']`` -- selector of the content element(s). When
      falsy, readability's ``Document.summary()`` is used instead.
    * ``config['credit']``  -- when truthy, a blockquote linking back to
      ``url`` is prepended to the content.

    Args:
        html: Page source as a string.
        url: Address of the page; used only for the credit link.

    Returns:
        A dict ``{'title': <str>, 'content': <str>}``. ``title`` is the
        stripped text of the title element and ``content`` is an HTML
        fragment (an empty string when nothing could be extracted).
    """
    # Imported under an alias inside the function: the ``html`` parameter
    # shadows the stdlib module name within this scope.
    from html import escape as escape_html

    # Strip the XML declaration and the default namespace before parsing.
    html = re.sub(r'<\?xml[^>]*\?>', '', html)
    html = re.sub(r'xmlns=".+?"', '', html)
    root = pq(html)

    if config['remove']:
        root(config['remove']).remove()

    el_title = root(config['title']).eq(0)
    title = el_title.text().strip()
    # Detach the title element so it no longer appears in the tree that the
    # content is taken from.
    el_title.remove()

    if config['content']:
        # ``.html()`` may return None (e.g. for an element with no text and
        # no children on some pyquery versions); fall back to '' so the
        # join cannot fail with a TypeError.
        co = '\r\n'.join(
            el.html() or ''
            for el in root(config['content']).items()
        )
    else:
        summary = Document(str(root)).summary()
        # ``find('body')`` can match nothing, in which case ``.html()``
        # returns None; normalise to '' so later concatenation is safe.
        co = pq(summary).find('body').html() or ''

    if config['credit']:
        # Escape the URL so quotes, '&' or '<' in it cannot break the
        # single-quoted attribute or the surrounding markup.
        safe_url = escape_html(url, quote=True)
        credit = f"<blockquote>原文:<a href='{safe_url}'>{safe_url}</a></blockquote>"
        co = credit + co

    return {'title': title, 'content': co}