commit a739d679f3365d9977aa3cbcbebb335acdbeacac
Author: wizardforcel <562826179@qq.com>
Date:   Fri Aug 20 23:04:45 2021 +0800

    init

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b6c878f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,133 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# custom
+cookies.json
+history.json
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..b7d7727
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,15 @@
+language: python
+python: 3.6
+
+install:
+  - 'pip install -r requirements.txt'
+
+script:
+  - ":"
+
+deploy:
+  - provider: pypi
+    user: __token__
+    password: $PYPI_TOKEN
+    distributions: 'sdist bdist_wheel'
+    skip_existing: true
diff --git a/EpubCrawler/__init__.py b/EpubCrawler/__init__.py
new file mode 100644
index 0000000..09a5166
--- /dev/null
+++ b/EpubCrawler/__init__.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python3.7
+# -*- coding: utf-8 -*-
+
+"""EpubCrawler
+https://github.com/apachecn/epub-crawler"""
+
+__author__ = "ApacheCN"
+__email__ = "apachecn@163.com"
+__license__ = "SATA"
+__version__ = "2021.8.20.0"
\ No newline at end of file
diff --git a/EpubCrawler/__main__.py b/EpubCrawler/__main__.py
new file mode 100644
index 0000000..46ee5fa
--- /dev/null
+++ b/EpubCrawler/__main__.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3.7
+# -*- coding: utf-8 -*-
+
+from urllib.parse import urljoin
+import sys
+import json
+from pyquery import PyQuery as pq
+import time
+from os import path
+import re
+from concurrent.futures import ThreadPoolExecutor
+from GenEpub import gen_epub
+from . import *
+from .util import *
+from .img import *
+from .config import config
+
+def get_toc_from_cfg():
+    if config['list'] and len(config['list']) > 0:
+        return config['list']
+
+    if not config['url']:
+        print('URL not specified')
+        sys.exit()
+
+    html = request_retry(
+        'GET', config['url'],
+        retry=config['retry'],
+        headers=config['headers'],
+        timeout=config['timeout'],
+        proxies=config['proxy'],
+    ).content.decode(config['encoding'])
+    return get_toc(html, config['url'])
+
+def get_toc(html, base):
+    root = pq(html)
+
+    if config['remove']:
+        root(config['remove']).remove()
+
+    el_links = root(config['link'])
+    vis = set()
+
+    res = []
+    for i in range(len(el_links)):
+        el_link = el_links.eq(i)
+        url = el_link.attr('href')
+        if not url:
+            # entries without an href become section headings in the book
+            text = el_link.text().strip()
+            res.append(text)
+            continue
+
+        url = re.sub(r'#.*$', '', url)
+        if base:
+            url = urljoin(base, url)
+        if not url.startswith('http'):
+            continue
+        if url in vis: continue
+        vis.add(url)
+        res.append(url)
+
+    return res
+
+def get_article(html, url):
+    root = pq(html)
+
+    if config['remove']:
+        root(config['remove']).remove()
+
+    el_title = root(config['title']).eq(0)
+    title = el_title.text().strip()
+    el_title.remove()
+
+    el_co = root(config['content'])
+    co = '\r\n'.join([
+        el_co.eq(i).html()
+        for i in range(len(el_co))
+    ])
+
+    if config['credit']:
+        credit = f"<p>原文:<a href='{url}'>{url}</a></p>"
+        co = credit + co
+
+    return {'title': title, 'content': co}
+
+def tr_download_page(url, art, imgs):
+    try:
+        html = request_retry(
+            'GET', url,
+            retry=config['retry'],
+            headers=config['headers'],
+            timeout=config['timeout'],
+            proxies=config['proxy'],
+        ).content.decode(config['encoding'])
+        art.update(get_article(html, url))
+        art['content'] = process_img(
+            art['content'], imgs,
+            page_url=url,
+            img_prefix='../Images/',
+        )
+        time.sleep(config['wait'])
+    except Exception as ex:
+        print(ex)
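# --- Editor's illustrative sketch (not part of the committed file) ---
# A hedged example of how the pieces above compose: get_toc() pulls the
# article URLs out of the TOC page via the `link` selector, and
# get_article() extracts one page's title/content via the `title` and
# `content` selectors. The URL and selectors below are hypothetical
# placeholders, not values from this repository.
#
#     config.update({
#         'url': 'https://example.com/book/',   # hypothetical TOC page
#         'link': '.toc li a',                  # hypothetical selectors
#         'title': 'h1',
#         'content': '.article-body',
#     })
#     html = request_retry('GET', config['url']).content.decode(config['encoding'])
#     for page_url in get_toc(html, config['url']):
#         page = request_retry('GET', page_url).content.decode(config['encoding'])
#         art = get_article(page, page_url)     # {'title': ..., 'content': ...}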
+def main():
+    cfg_fname = sys.argv[1] \
+        if len(sys.argv) > 1 \
+        else 'config.json'
+    if not path.exists(cfg_fname):
+        print(f'config file not found: {cfg_fname}')
+        return
+
+    user_cfg = json.loads(open(cfg_fname, encoding='utf-8').read())
+    config.update(user_cfg)
+    if config['proxy']:
+        proxies = {
+            'http': config['proxy'],
+            'https': config['proxy'],
+        }
+        config['proxy'] = proxies
+    set_img_pool(ThreadPoolExecutor(config['imgThreads']))
+
+    toc = get_toc_from_cfg()
+    articles = []
+    imgs = {}
+    if config['name']:
+        articles.append({
+            'title': config['name'],
+            'content': f"<p>来源:<a href='{config['url']}'>{config['url']}</a></p>",
+        })
+
+    text_pool = ThreadPoolExecutor(config['textThreads'])
+    hdls = []
+    for url in toc:
+        print(f'page: {url}')
+        if url.startswith('http'):
+            art = {}
+            articles.append(art)
+            hdl = text_pool.submit(tr_download_page, url, art, imgs)
+            hdls.append(hdl)
+        else:
+            articles.append({'title': url, 'content': ''})
+
+    for h in hdls: h.result()
+
+    gen_epub(articles, imgs)
+    print('done...')
+
+if __name__ == '__main__': main()
\ No newline at end of file
diff --git a/EpubCrawler/config.py b/EpubCrawler/config.py
new file mode 100644
index 0000000..033c3d4
--- /dev/null
+++ b/EpubCrawler/config.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+config = {
+    'name': '',
+    'url': '',
+    'link': '',
+    'title': '',
+    'content': '',
+    'remove': '',
+    'retry': 10,
+    'wait': 0,
+    'encoding': 'utf-8',
+    'credit': True,
+    'headers': {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
+    },
+    'list': [],
+    'optiMode': 'quant',
+    'colors': 8,
+    'timeout': 8,
+    'imgSrc': ['data-src', 'data-original-src', 'src'],
+    'proxy': '',
+    'textThreads': 5,
+    'imgThreads': 5,
+}
\ No newline at end of file
diff --git a/EpubCrawler/img.py b/EpubCrawler/img.py
new file mode 100644
index 0000000..2936309
--- /dev/null
+++ b/EpubCrawler/img.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+
+from concurrent.futures import ThreadPoolExecutor
+from urllib.parse import urljoin, quote_plus
+import hashlib
+from pyquery import PyQuery as pq
+import time
+from .util import *
+from .config import config
+
+img_pool = ThreadPoolExecutor(5)
+
+def set_img_pool(pool):
+    global img_pool
+    img_pool = pool
+
+def get_img_src(el_img):
+    url = ''
+    for prop in config['imgSrc']:
+        url = el_img.attr(prop)
+        if url: break
+    return url
+
+def tr_download_img(url, imgs, picname):
+    try:
+        data = request_retry(
+            'GET', url,
+            headers=config['headers'],
+            retry=config['retry'],
+            timeout=config['timeout'],
+            proxies=config['proxy'],
+        ).content
+        data = opti_img(data, config['optiMode'], config['colors'])
+        imgs[picname] = data
+        time.sleep(config['wait'])
+    except Exception as ex:
+        print(ex)
+
+def process_img(html, imgs, **kw):
+    kw.setdefault('img_prefix', 'img/')
+
+    root = pq(html)
+    el_imgs = root('img')
+    hdls = []
+
+    for i in range(len(el_imgs)):
+        el_img = el_imgs.eq(i)
+        url = get_img_src(el_img)
+        if not url: continue
+        if not url.startswith('http'):
+            if kw.get('page_url'):
+                url = urljoin(kw.get('page_url'), url)
+            else: continue
+
+        picname = hashlib.md5(url.encode('utf-8')).hexdigest() + '.png'
+        print(f'pic: {url} => {picname}')
+        if picname not in imgs:
+            hdl = img_pool.submit(tr_download_img, url, imgs, picname)
+            hdls.append(hdl)
+
+        el_img.attr('src', kw['img_prefix'] + picname)
+
+    for h in hdls: h.result()
+    return root.html()
\ No newline at end of file
diff --git a/EpubCrawler/util.py b/EpubCrawler/util.py
new file mode 100644
index 0000000..b41f813
--- /dev/null
+++ b/EpubCrawler/util.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+
+import requests
+from imgyaso import pngquant_bts, \
+    adathres_bts, grid_bts, noise_bts, trunc_bts
+
+def request_retry(method, url, retry=10, **kw):
+    kw.setdefault('timeout', 10)
+    for i in range(retry):
+        try:
+            return requests.request(method, url, **kw)
+        except KeyboardInterrupt as e:
+            raise e
+        except Exception as e:
+            print(f'{url} retry {i}')
+            if i == retry - 1: raise e
+
+def opti_img(img, mode, colors):
+    if mode == 'quant':
+        return pngquant_bts(img, colors)
+    elif mode == 'grid':
+        return grid_bts(img)
+    elif mode == 'trunc':
+        return trunc_bts(img, colors)
+    elif mode == 'thres':
+        return adathres_bts(img)
+    else:
+        return img
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..c38b3ce
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,37 @@
+The Star And Thank Author License (SATA)
+
+Copyright © 2021 ApacheCN(apachecn@163.com)
+
+Project Url: https://github.com/apachecn/epub-crawler
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+And wait, the most important, you shall star/+1/like the project(s) in project url
+section above first, and then thank the author(s) in Copyright section.
+
+Here are some suggested ways:
+
+ - Email the authors a thank-you letter, and make friends with him/her/them.
+ - Report bugs or issues.
+ - Tell friends what a wonderful project this is.
+ - And, sure, you can just express thanks in your mind without telling the world.
+
+Contributors of this project by forking have the option to add his/her name and
+forked project url at copyright and project url sections, but shall not delete
+or modify anything else in these two sections.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
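A hedged usage sketch for the helpers defined in `EpubCrawler/util.py` above: `request_retry` wraps `requests.request` in a retry loop, and `opti_img` routes image bytes through the selected imgyaso codec. The URL and output file name are placeholder assumptions.

```python
from EpubCrawler.util import request_retry, opti_img

# Download with up to 3 attempts and a 5-second timeout per attempt.
resp = request_retry('GET', 'https://example.com/cover.png', retry=3, timeout=5)

# 'quant' palette-quantizes the PNG down to 8 colors via imgyaso's pngquant_bts.
data = opti_img(resp.content, 'quant', 8)

with open('cover.png', 'wb') as f:
    f.write(data)
```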
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e91d317
--- /dev/null
+++ b/README.md
@@ -0,0 +1,132 @@
+# epub-crawler
+
+A small tool for crawling web pages and packaging them into an EPUB.
+
+## Installation
+
+Via pip (recommended):
+
+```
+pip install epub-crawler
+```
+
+From source:
+
+```
+pip install git+https://github.com/apachecn/epub-crawler
+```
+
+## Usage
+
+```
+crawl-epub [CONFIG]
+
+CONFIG: a JSON config file, defaulting to config.json in the current working directory
+```
+
+The config file supports the following properties:
+
++   `name: String`
+
+    Book name in the EPUB metadata, also used as the output file name in the current working directory
+
++   `url: String` (optional)
+
+    URL of the table-of-contents page
+
++   `link: String` (optional)
+
+    Selector for the TOC links (`<a>` elements)
+
++   `list: [String]` (optional)
+
+    Explicit list of pages to crawl; when non-empty, this list is used and `url` and `link` are ignored
+
++   `title: String`
+
+    Selector for the title on an article page
+
++   `content: String`
+
+    Selector for the content on an article page
+
++   `remove: String` (optional)
+
+    Selector for elements to remove from an article page
+
++   `credit: Boolean` (optional)
+
+    Whether to show a link back to the original article
+
++   `headers: {String: String}` (optional)
+
+    HTTP request headers, defaulting to `{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"}`
+
++   `retry: Integer` (optional)
+
+    Retry count for HTTP requests, defaulting to 10
+
++   `wait: Float` (optional)
+
+    Interval between two requests in seconds, defaulting to 0
+
++   `timeout: Integer` (optional)
+
+    HTTP request timeout in seconds, defaulting to 8
+
++   `encoding: String` (optional)
+
+    Page encoding, defaulting to UTF-8
+
++   `optiMode: String` (optional)
+
+    Image optimization mode; `'none'` means no processing, and the other values are the modes supported by imgyaso; defaults to `'quant'`
+
++   `colors: Integer` (optional)
+
+    The `colors` argument passed to imgyaso, defaulting to 8
+
++   `imgSrc: [String]` (optional)
+
+    Attributes probed for the image source, defaulting to `["data-src", "data-original-src", "src"]`
+
++   `proxy: String` (optional)
+
+    Proxy to use, in the form `<protocol>://<host>:<port>`
+
++   `textThreads: Integer` (optional)
+
+    Number of threads for crawling text, defaulting to 5
+
++   `imgThreads: Integer` (optional)
+
+    Number of threads for crawling images, defaulting to 5
+
+An example config used to crawl our PyTorch 1.4 docs:
+
+```json
+{
+    "name": "PyTorch 1.4 中文文档 & 教程",
+    "url": "https://gitee.com/apachecn/pytorch-doc-zh/blob/master/docs/1.4/SUMMARY.md",
+    "link": ".markdown-body li a",
+    "title": ".markdown-body>h1",
+    "content": ".markdown-body",
+    "remove": "a.anchor"
+}
+```
+
+A fuller illustrative config sketch is shown after the last file in this commit.
+
+## License
+
+This project is released under the SATA license.
+
+You are obliged to star this open-source project, and are encouraged to give the author an extra reward as you see fit.
+
+## Sponsor Us
+
+![](https://home.apachecn.org/img/about/donate.jpg)
+
+## See Also
+
++   [ApacheCN Learning Resources](https://docs.apachecn.org/)
++   [Computer E-books](http://it-ebooks.flygon.net)
++   [布客新知](http://flygon.net/ixinzhi/)
diff --git a/history.md b/history.md
new file mode 100644
index 0000000..638f6f7
--- /dev/null
+++ b/history.md
@@ -0,0 +1,5 @@
+# Revision History
+
+v2021.8.20
+
++   Rewritten in Python
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..4a9545c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+requests
+pyquery
+GenEpub
+imgyaso
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..8348338
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3.7
+# -*- coding: utf-8 -*-
+
+import setuptools
+import EpubCrawler
+
+with open("README.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+
+with open("requirements.txt", "r", encoding="utf-8") as fh:
+    install_requires = fh.read().splitlines()
+
+setuptools.setup(
+    name="EpubCrawler",
+    version=EpubCrawler.__version__,
+    url="https://github.com/apachecn/epub-crawler",
+    author=EpubCrawler.__author__,
+    author_email=EpubCrawler.__email__,
+    classifiers=[
+        "Development Status :: 4 - Beta",
+        "Environment :: Console",
+        "Intended Audience :: Developers",
+        "Intended Audience :: End Users/Desktop",
+        "License :: Other/Proprietary License",
+        "Natural Language :: Chinese (Simplified)",
+        "Natural Language :: English",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3 :: Only",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Topic :: Internet :: WWW/HTTP",
+        "Topic :: Text Processing :: Markup :: HTML",
+        "Topic :: Utilities",
+    ],
+    description="EpubCrawler, a small tool for crawling web pages and packaging them into an EPUB",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    keywords=[
+        "ebook",
+        "epub",
+        "crawler",
+        "爬虫",
+        "电子书",
+    ],
+    install_requires=install_requires,
+    python_requires=">=3.6",
+    entry_points={
+        'console_scripts': [
+            "crawl-epub=EpubCrawler.__main__:main",
+        ],
+    },
+    packages=setuptools.find_packages(),
+)
diff --git a/update.sh b/update.sh
new file mode 100644
index 0000000..f76166c
--- /dev/null
+++ b/update.sh
@@ -0,0 +1,3 @@
+git add -A
+git commit -am "$(date "+%Y-%m-%d %H:%M:%S")"
+git push
\ No newline at end of file
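To complement the configuration reference in README.md above, here is a hedged, more complete `config.json` sketch. Every value — the site URL, the selectors, and the proxy address — is an illustrative assumption, not a tested configuration; only the keys come from `EpubCrawler/config.py`:

```json
{
    "name": "Example Book",
    "url": "https://example.com/book/index.html",
    "link": ".toc li a",
    "title": "article h1",
    "content": "article .body",
    "remove": ".ads, script",
    "credit": true,
    "proxy": "http://127.0.0.1:1080",
    "wait": 0.5,
    "optiMode": "quant",
    "imgThreads": 8
}
```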