wizardforcel 2021-08-20 23:04:45 +08:00
commit a739d679f3
13 changed files with 663 additions and 0 deletions

.gitignore vendored Normal file (133 lines)

@@ -0,0 +1,133 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# custom
cookies.json
history.json

.travis.yml Normal file (15 lines)

@@ -0,0 +1,15 @@
language: python
python: 3.6
install:
- 'pip install -r requirements.txt'
script:
- ":"
deploy:
- provider: pypi
user: __token__
password: $PYPI_TOKEN
distributions: 'sdist bdist_wheel'
skip_existing: true

EpubCrawler/__init__.py Normal file (10 lines)

@@ -0,0 +1,10 @@
#!/usr/bin/env python3.7
# -*- coding: utf-8 -*-
"""EpubCrawler
https://github.com/apachecn/epub-crawler"""
__author__ = "ApacheCN"
__email__ = "apachecn@163.com"
__license__ = "SATA"
__version__ = "2021.8.20.0"

EpubCrawler/__main__.py Normal file (151 lines)

@@ -0,0 +1,151 @@
#!/usr/bin/env python3.7
# -*- coding: utf-8 -*-
from urllib.parse import urljoin
import sys
import json
from pyquery import PyQuery as pq
import time
from os import path
import re
from concurrent.futures import ThreadPoolExecutor
from GenEpub import gen_epub
from . import *
from .util import *
from .img import *
from .config import config
def get_toc_from_cfg():
    """Return the chapter list from the config, or fetch config['url'] and parse the TOC page."""
if config['list'] and len(config['list']) > 0:
return config['list']
if not config['url']:
print('URL not specified')
sys.exit()
html = request_retry(
'GET', config['url'],
retry=config['retry'],
headers=config['headers'],
timeout=config['timeout'],
proxies=config['proxy'],
).content.decode(config['encoding'])
return get_toc(html, config['url'])
def get_toc(html, base):
    """Extract chapter URLs (or plain-text section titles) from the TOC page HTML."""
root = pq(html)
if config['remove']:
root(config['remove']).remove()
el_links = root(config['link'])
vis = set()
res = []
for i in range(len(el_links)):
el_link = el_links.eq(i)
url = el_link.attr('href')
if not url:
text = el_link.text().strip()
            res.append(text)
continue
url = re.sub(r'#.*$', '', url)
if base:
url = urljoin(base, url)
if not url.startswith('http'):
continue
if url in vis: continue
vis.add(url)
res.append(url)
return res
def get_article(html, url):
    """Extract the title and content of an article page, honoring the 'remove' and 'credit' settings."""
root = pq(html)
if config['remove']:
root(config['remove']).remove()
el_title = root(config['title']).eq(0)
title = el_title.text().strip()
el_title.remove()
el_co = root(config['content'])
co = '\r\n'.join([
el_co.eq(i).html()
for i in range(len(el_co))
])
if config['credit']:
credit = f"<blockquote>原文:<a href='{url}'>{url}</a></blockquote>"
co = credit + co
return {'title': title, 'content': co}
def tr_download_page(url, art, imgs):
    """Thread worker: download one article page, extract it, and fetch its images."""
try:
html = request_retry(
'GET', url,
retry=config['retry'],
headers=config['headers'],
timeout=config['timeout'],
proxies=config['proxy'],
).content.decode(config['encoding'])
art.update(get_article(html, url))
art['content'] = process_img(
art['content'], imgs,
page_url=url,
img_prefix='../Images/',
)
time.sleep(config['wait'])
except Exception as ex:
print(ex)
def main():
cfg_fname = sys.argv[1] \
if len(sys.argv) > 1 \
else 'config.json'
if not path.exists(cfg_fname):
print('please provide config file')
return
user_cfg = json.loads(open(cfg_fname, encoding='utf-8').read())
config.update(user_cfg)
if config['proxy']:
proxies = {
'http': config['proxy'],
'https': config['proxy'],
}
config['proxy'] = proxies
set_img_pool(ThreadPoolExecutor(config['imgThreads']))
toc = get_toc_from_cfg()
articles = []
imgs = {}
if config['name']:
articles.append({
'title': config['name'],
'content': f"<p>来源:<a href='{config['url']}'>{config['url']}</a></p>",
})
text_pool = ThreadPoolExecutor(config['textThreads'])
hdls = []
for url in toc:
print(f'page: {url}')
if url.startswith('http'):
art = {}
articles.append(art)
hdl = text_pool.submit(tr_download_page, url, art, imgs)
hdls.append(hdl)
else:
articles.append({'title': url, 'content': ''})
for h in hdls: h.result()
gen_epub(articles, imgs)
print('done...')
if __name__ == '__main__': main()

EpubCrawler/config.py Normal file (25 lines)

@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
config = {
'name': '',
'url': '',
'link': '',
'title': '',
'content': '',
'remove': '',
'retry': 10,
'wait': 0,
'encoding': 'utf-8',
'credit': True,
'headers': {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
},
'list': [],
'optiMode': 'quant',
'colors': 8,
'timeout': 8,
'imgSrc': ['data-src', 'data-original-src', 'src'],
'proxy': '',
'textThreads': 5,
'imgThreads': 5,
}

EpubCrawler/img.py Normal file (66 lines)

@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, quote_plus
import hashlib
from pyquery import PyQuery as pq
import time
from .util import *
from .config import config
img_pool = ThreadPoolExecutor(5)
def set_img_pool(pool):
global img_pool
img_pool = pool
def get_img_src(el_img):
url = ''
for prop in config['imgSrc']:
url = el_img.attr(prop)
if url: break
return url
def tr_download_img(url, imgs, picname):
    """Thread worker: download one image, optimize it, and store the bytes under picname."""
try:
data = request_retry(
'GET', url,
headers=config['headers'],
retry=config['retry'],
timeout=config['timeout'],
proxies=config['proxy'],
).content
data = opti_img(data, config['optiMode'], config['colors'])
imgs[picname] = data
time.sleep(config['wait'])
except Exception as ex:
print(ex)
def process_img(html, imgs, **kw):
    """Rewrite <img> tags to local hashed filenames and schedule the image downloads."""
kw.setdefault('img_prefix', 'img/')
root = pq(html)
el_imgs = root('img')
hdls = []
for i in range(len(el_imgs)):
el_img = el_imgs.eq(i)
url = get_img_src(el_img)
if not url: continue
if not url.startswith('http'):
if kw.get('page_url'):
url = urljoin(kw.get('page_url'), url)
else: continue
picname = hashlib.md5(url.encode('utf-8')).hexdigest() + '.png'
print(f'pic: {url} => {picname}')
if picname not in imgs:
hdl = img_pool.submit(tr_download_img, url, imgs, picname)
hdls.append(hdl)
el_img.attr('src', kw['img_prefix'] + picname)
for h in hdls: h.result()
return root.html()

EpubCrawler/util.py Normal file (28 lines)

@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
import requests
from imgyaso import pngquant_bts, \
adathres_bts, grid_bts, noise_bts, trunc_bts
def request_retry(method, url, retry=10, **kw):
    """requests.request() with a simple retry loop; re-raises the last exception."""
kw.setdefault('timeout', 10)
for i in range(retry):
try:
return requests.request(method, url, **kw)
except KeyboardInterrupt as e:
raise e
except Exception as e:
print(f'{url} retry {i}')
if i == retry - 1: raise e
def opti_img(img, mode, colors):
    """Compress image bytes with the selected imgyaso mode; unknown modes return the input unchanged."""
if mode == 'quant':
return pngquant_bts(img, colors)
elif mode == 'grid':
return grid_bts(img)
elif mode == 'trunc':
return trunc_bts(img, colors)
elif mode == 'thres':
return adathres_bts(img)
else:
return img

LICENSE Normal file (37 lines)

@@ -0,0 +1,37 @@
The Star And Thank Author License (SATA)
Copyright © 2021 ApacheCN(apachecn@163.com)
Project Url: https://github.com/apachecn/epub-crawler
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
And wait, the most important, you shall star/+1/like the project(s) in project url
section above first, and then thank the author(s) in Copyright section.
Here are some suggested ways:
- Email the authors a thank-you letter, and make friends with him/her/them.
- Report bugs or issues.
- Tell friends what a wonderful project this is.
- And, sure, you can just express thanks in your mind without telling the world.
Contributors of this project by forking have the option to add his/her name and
forked project url at copyright and project url sections, but shall not delete
or modify anything else in these two sections.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

README.md Normal file (132 lines)

@@ -0,0 +1,132 @@
# epub-crawler
A small tool for crawling web content and packaging it into an EPUB.
## Installation
Via pip (recommended):
```
pip install epub-crawler
```
From source:
```
pip install git+https://github.com/apachecn/epub-crawler
```
## Usage
```
crawl-epub [CONFIG]
CONFIG: a config file in JSON format, defaulting to config.json in the current working directory
```
The config file supports the following properties:
+ `name: String`
  The book title used in the metadata, and also the name of the file saved in the current working directory
+ `url: String` (optional)
  URL of the table-of-contents page
+ `link: String` (optional)
  Selector for the `<a>` links in the table of contents
+ `list: [String]` (optional)
  List of pages to crawl; if non-empty, this list is crawled and `url` and `link` are ignored
+ `title: String`
  Selector for the title on an article page
+ `content: String`
  Selector for the content on an article page
+ `remove: String` (optional)
  Selector for elements to remove from an article page
+ `credit: Boolean` (optional)
  Whether to include a link to the original article
+ `headers: {String: String}` (optional)
  HTTP request headers, defaulting to `{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"}`
+ `retry: Integer` (optional)
  Number of retries for HTTP requests, defaulting to 10
+ `wait: Float` (optional)
  Interval between requests in seconds, defaulting to 0
+ `timeout: Integer` (optional)
  HTTP request timeout in seconds, defaulting to 8
+ `encoding: String` (optional)
  Page encoding, defaulting to UTF-8
+ `optiMode: String` (optional)
  Image-processing mode; `'none'` means no processing, other values are the modes supported by imgyaso; defaults to `'quant'`
+ `colors: Integer` (optional)
  The `colors` parameter passed to imgyaso, defaulting to 8
+ `imgSrc: [String]` (optional)
  Attributes to read image sources from, defaulting to `["data-src", "data-original-src", "src"]`
+ `proxy: String` (optional)
  Proxy to use, in the form `<protocol>://<host>:<port>`
+ `textThreads: Integer` (optional)
  Number of threads for fetching text, defaulting to 5
+ `imgThreads: Integer` (optional)
  Number of threads for fetching images, defaulting to 5
An example config for crawling our PyTorch 1.4 documentation:
```json
{
    "name": "PyTorch 1.4 中文文档 & 教程",
    "url": "https://gitee.com/apachecn/pytorch-doc-zh/blob/master/docs/1.4/SUMMARY.md",
    "link": ".markdown-body li a",
    "title": ".markdown-body>h1",
    "content": ".markdown-body",
    "remove": "a.anchor"
}
```
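With this config saved as `config.json` in the working directory (the default filename the tool looks for), a run is a single command; the sketch below also shows passing an explicitly named file, where `my-book.json` is only a placeholder:
```
# reads ./config.json by default and saves an EPUB named after the "name" field
crawl-epub
# or pass a config file explicitly (my-book.json is a placeholder name)
crawl-epub my-book.json
```
Progress is printed as `page:` and `pic:` lines, and `done...` marks the finished EPUB.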
## License
This project is released under the SATA license.
You are obliged to star this open-source project, and are encouraged to reward the authors appropriately on top of that.
## Sponsor Us
![](https://home.apachecn.org/img/about/donate.jpg)
## See Also
+ [ApacheCN Learning Resources](https://docs.apachecn.org/)
+ [Computer E-books](http://it-ebooks.flygon.net)
+ [布客新知](http://flygon.net/ixinzhi/)

history.md Normal file (5 lines)

@@ -0,0 +1,5 @@
# Revision History
v2021.8.20
+ Rewritten in Python

requirements.txt Normal file (4 lines)

@@ -0,0 +1,4 @@
requests
pyquery
GenEpub
imgyaso

setup.py Normal file (54 lines)

@@ -0,0 +1,54 @@
#!/usr/bin/env python3.7
# -*- coding: utf-8 -*-
import setuptools
import EpubCrawler
with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read()
with open("requirements.txt", "r", encoding="utf-8") as fh:
install_requires = fh.read().splitlines()
setuptools.setup(
name="EpubCrawler",
version=EpubCrawler.__version__,
url="https://github.com/apachecn/epub-crawler",
author=EpubCrawler.__author__,
author_email=EpubCrawler.__email__,
classifiers=[
"Development Status :: 4 - Beta",
"Environment :: Console",
"Intended Audience :: Developers",
"Intended Audience :: End Users/Desktop",
"License :: Other/Proprietary License",
"Natural Language :: Chinese (Simplified)",
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Topic :: Internet :: WWW/HTTP",
"Topic :: Text Processing :: Markup :: HTML",
"Topic :: Utilities",
],
    description="EpubCrawler: a small tool for crawling web content and packaging it into an EPUB",
long_description=long_description,
long_description_content_type="text/markdown",
keywords=[
"ebook",
"epub",
"crawler",
"爬虫",
"电子书",
],
install_requires=install_requires,
python_requires=">=3.6",
entry_points={
'console_scripts': [
"crawl-epub=EpubCrawler.__main__:main",
],
},
packages=setuptools.find_packages(),
)

update.sh Normal file (3 lines)

@@ -0,0 +1,3 @@
git add -A
git commit -am "$(date "+%Y-%m-%d %H:%M:%S")"
git push