wizardforcel 2021-08-20 23:04:45 +08:00
commit a739d679f3
13 changed files with 663 additions and 0 deletions

.gitignore vendored Normal file (133 lines)

@@ -0,0 +1,133 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# custom
cookies.json
history.json

.travis.yml Normal file (15 lines)

@@ -0,0 +1,15 @@
language: python
python: 3.6
install:
- 'pip install -r requirements.txt'
script:
- ":"
deploy:
- provider: pypi
user: __token__
password: $PYPI_TOKEN
distributions: 'sdist bdist_wheel'
skip_existing: true

EpubCrawler/__init__.py Normal file (10 lines)

@@ -0,0 +1,10 @@
#!/usr/bin/env python3.7
# -*- coding: utf-8 -*-
"""EpubCrawler
https://github.com/apachecn/epub-crawler"""
__author__ = "ApacheCN"
__email__ = "apachecn@163.com"
__license__ = "SATA"
__version__ = "2021.8.20.0"

EpubCrawler/__main__.py Normal file (151 lines)

@@ -0,0 +1,151 @@
#!/usr/bin/env python3.7
# -*- coding: utf-8 -*-
from urllib.parse import urljoin
import sys
import json
from pyquery import PyQuery as pq
import time
from os import path
import re
from concurrent.futures import ThreadPoolExecutor
from GenEpub import gen_epub
from . import *
from .util import *
from .img import *
from .config import config
def get_toc_from_cfg():
    """Return the chapter list from the config, or fetch config['url'] and parse the TOC page."""
if config['list'] and len(config['list']) > 0:
return config['list']
if not config['url']:
print('URL not specified')
sys.exit()
html = request_retry(
'GET', config['url'],
retry=config['retry'],
headers=config['headers'],
timeout=config['timeout'],
proxies=config['proxy'],
).content.decode(config['encoding'])
return get_toc(html, config['url'])
def get_toc(html, base):
    """Extract chapter URLs (or plain-text section titles) from the TOC page HTML."""
root = pq(html)
if config['remove']:
root(config['remove']).remove()
el_links = root(config['link'])
vis = set()
res = []
for i in range(len(el_links)):
el_link = el_links.eq(i)
url = el_link.attr('href')
if not url:
text = el_link.text().strip()
            res.append(text)
continue
url = re.sub(r'#.*$', '', url)
if base:
url = urljoin(base, url)
if not url.startswith('http'):
continue
if url in vis: continue
vis.add(url)
res.append(url)
return res
def get_article(html, url):
    """Extract the title and content of an article page, honoring the 'remove' and 'credit' settings."""
root = pq(html)
if config['remove']:
root(config['remove']).remove()
el_title = root(config['title']).eq(0)
title = el_title.text().strip()
el_title.remove()
el_co = root(config['content'])
co = '\r\n'.join([
el_co.eq(i).html()
for i in range(len(el_co))
])
if config['credit']:
credit = f"<blockquote>原文:<a href='{url}'>{url}</a></blockquote>"
co = credit + co
return {'title': title, 'content': co}
def tr_download_page(url, art, imgs):
    """Thread worker: download one article page, extract it, and fetch its images."""
try:
html = request_retry(
'GET', url,
retry=config['retry'],
headers=config['headers'],
timeout=config['timeout'],
proxies=config['proxy'],
).content.decode(config['encoding'])
art.update(get_article(html, url))
art['content'] = process_img(
art['content'], imgs,
page_url=url,
img_prefix='../Images/',
)
time.sleep(config['wait'])
except Exception as ex:
print(ex)
def main():
cfg_fname = sys.argv[1] \
if len(sys.argv) > 1 \
else 'config.json'
if not path.exists(cfg_fname):
print('please provide config file')
return
user_cfg = json.loads(open(cfg_fname, encoding='utf-8').read())
config.update(user_cfg)
if config['proxy']:
proxies = {
'http': config['proxy'],
'https': config['proxy'],
}
config['proxy'] = proxies
set_img_pool(ThreadPoolExecutor(config['imgThreads']))
toc = get_toc_from_cfg()
articles = []
imgs = {}
if config['name']:
articles.append({
'title': config['name'],
'content': f"<p>来源:<a href='{config['url']}'>{config['url']}</a></p>",
})
text_pool = ThreadPoolExecutor(config['textThreads'])
hdls = []
for url in toc:
print(f'page: {url}')
if url.startswith('http'):
art = {}
articles.append(art)
hdl = text_pool.submit(tr_download_page, url, art, imgs)
hdls.append(hdl)
else:
articles.append({'title': url, 'content': ''})
for h in hdls: h.result()
gen_epub(articles, imgs)
print('done...')
if __name__ == '__main__': main()

EpubCrawler/config.py Normal file (25 lines)

@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
config = {
'name': '',
'url': '',
'link': '',
'title': '',
'content': '',
'remove': '',
'retry': 10,
'wait': 0,
'encoding': 'utf-8',
'credit': True,
'headers': {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
},
'list': [],
'optiMode': 'quant',
'colors': 8,
'timeout': 8,
'imgSrc': ['data-src', 'data-original-src', 'src'],
'proxy': '',
'textThreads': 5,
'imgThreads': 5,
}

EpubCrawler/img.py Normal file (66 lines)

@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, quote_plus
import hashlib
from pyquery import PyQuery as pq
import time
from .util import *
from .config import config
img_pool = ThreadPoolExecutor(5)
def set_img_pool(pool):
global img_pool
img_pool = pool
def get_img_src(el_img):
url = ''
for prop in config['imgSrc']:
url = el_img.attr(prop)
if url: break
return url
def tr_download_img(url, imgs, picname):
    """Thread worker: download one image, optimize it, and store the bytes under picname."""
try:
data = request_retry(
'GET', url,
headers=config['headers'],
retry=config['retry'],
timeout=config['timeout'],
proxies=config['proxy'],
).content
data = opti_img(data, config['optiMode'], config['colors'])
imgs[picname] = data
time.sleep(config['wait'])
except Exception as ex:
print(ex)
def process_img(html, imgs, **kw):
    """Rewrite <img> tags to local hashed filenames and schedule the image downloads."""
kw.setdefault('img_prefix', 'img/')
root = pq(html)
el_imgs = root('img')
hdls = []
for i in range(len(el_imgs)):
el_img = el_imgs.eq(i)
url = get_img_src(el_img)
if not url: continue
if not url.startswith('http'):
if kw.get('page_url'):
url = urljoin(kw.get('page_url'), url)
else: continue
picname = hashlib.md5(url.encode('utf-8')).hexdigest() + '.png'
print(f'pic: {url} => {picname}')
if picname not in imgs:
hdl = img_pool.submit(tr_download_img, url, imgs, picname)
hdls.append(hdl)
el_img.attr('src', kw['img_prefix'] + picname)
for h in hdls: h.result()
return root.html()

EpubCrawler/util.py Normal file (28 lines)

@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
import requests
from imgyaso import pngquant_bts, \
adathres_bts, grid_bts, noise_bts, trunc_bts
def request_retry(method, url, retry=10, **kw):
    """requests.request() with a simple retry loop; re-raises the last exception."""
kw.setdefault('timeout', 10)
for i in range(retry):
try:
return requests.request(method, url, **kw)
except KeyboardInterrupt as e:
raise e
except Exception as e:
print(f'{url} retry {i}')
if i == retry - 1: raise e
def opti_img(img, mode, colors):
    """Compress image bytes with the selected imgyaso mode; unknown modes return the input unchanged."""
if mode == 'quant':
return pngquant_bts(img, colors)
elif mode == 'grid':
return grid_bts(img)
elif mode == 'trunc':
return trunc_bts(img, colors)
elif mode == 'thres':
return adathres_bts(img)
else:
return img

LICENSE Normal file (37 lines)

@@ -0,0 +1,37 @@
The Star And Thank Author License (SATA)
Copyright © 2021 ApacheCN(apachecn@163.com)
Project Url: https://github.com/apachecn/epub-crawler
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
And wait, the most important, you shall star/+1/like the project(s) in project url
section above first, and then thank the author(s) in Copyright section.
Here are some suggested ways:
- Email the authors a thank-you letter, and make friends with him/her/them.
- Report bugs or issues.
- Tell friends what a wonderful project this is.
- And, sure, you can just express thanks in your mind without telling the world.
Contributors of this project by forking have the option to add his/her name and
forked project url at copyright and project url sections, but shall not delete
or modify anything else in these two sections.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

README.md Normal file (132 lines)

@@ -0,0 +1,132 @@
# epub-crawler
A small tool for crawling web content and packaging it into an EPUB.
## Installation
Via pip (recommended):
```
pip install epub-crawler
```
From source:
```
pip install git+https://github.com/apachecn/epub-crawler
```
## Usage
```
crawl-epub [CONFIG]
CONFIG: a config file in JSON format, defaulting to config.json in the current working directory
```
The config file supports the following properties:
+ `name: String`
  The book title used in the metadata, and also the name of the file saved in the current working directory
+ `url: String` (optional)
  URL of the table-of-contents page
+ `link: String` (optional)
  Selector for the `<a>` links in the table of contents
+ `list: [String]` (optional)
  List of pages to crawl; if non-empty, this list is crawled and `url` and `link` are ignored
+ `title: String`
  Selector for the title on an article page
+ `content: String`
  Selector for the content on an article page
+ `remove: String` (optional)
  Selector for elements to remove from an article page
+ `credit: Boolean` (optional)
  Whether to include a link to the original article
+ `headers: {String: String}` (optional)
  HTTP request headers, defaulting to `{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"}`
+ `retry: Integer` (optional)
  Number of retries for HTTP requests, defaulting to 10
+ `wait: Float` (optional)
  Interval between requests in seconds, defaulting to 0
+ `timeout: Integer` (optional)
  HTTP request timeout in seconds, defaulting to 8
+ `encoding: String` (optional)
  Page encoding, defaulting to UTF-8
+ `optiMode: String` (optional)
  Image-processing mode; `'none'` means no processing, other values are the modes supported by imgyaso; defaults to `'quant'`
+ `colors: Integer` (optional)
  The `colors` parameter passed to imgyaso, defaulting to 8
+ `imgSrc: [String]` (optional)
  Attributes to read image sources from, defaulting to `["data-src", "data-original-src", "src"]`
+ `proxy: String` (optional)
  Proxy to use, in the form `<protocol>://<host>:<port>`
+ `textThreads: Integer` (optional)
  Number of threads for fetching text, defaulting to 5
+ `imgThreads: Integer` (optional)
  Number of threads for fetching images, defaulting to 5
An example config for crawling our PyTorch 1.4 documentation:
```json
{
    "name": "PyTorch 1.4 中文文档 & 教程",
    "url": "https://gitee.com/apachecn/pytorch-doc-zh/blob/master/docs/1.4/SUMMARY.md",
    "link": ".markdown-body li a",
    "title": ".markdown-body>h1",
    "content": ".markdown-body",
    "remove": "a.anchor"
}
```
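With this config saved as `config.json` in the working directory (the default filename the tool looks for), a run is a single command; the sketch below also shows passing an explicitly named file, where `my-book.json` is only a placeholder:
```
# reads ./config.json by default and saves an EPUB named after the "name" field
crawl-epub
# or pass a config file explicitly (my-book.json is a placeholder name)
crawl-epub my-book.json
```
Progress is printed as `page:` and `pic:` lines, and `done...` marks the finished EPUB.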
## License
This project is released under the SATA license.
You are obliged to star this open-source project, and are encouraged to reward the authors appropriately on top of that.
## Sponsor Us
![](https://home.apachecn.org/img/about/donate.jpg)
## See Also
+ [ApacheCN Learning Resources](https://docs.apachecn.org/)
+ [Computer E-books](http://it-ebooks.flygon.net)
+ [布客新知](http://flygon.net/ixinzhi/)

history.md Normal file (5 lines)

@@ -0,0 +1,5 @@
# Revision History
v2021.8.20
+ Rewritten in Python

requirements.txt Normal file (4 lines)

@@ -0,0 +1,4 @@
requests
pyquery
GenEpub
imgyaso

setup.py Normal file (54 lines)

@@ -0,0 +1,54 @@
#!/usr/bin/env python3.7
# -*- coding: utf-8 -*-
import setuptools
import EpubCrawler
with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read()
with open("requirements.txt", "r", encoding="utf-8") as fh:
install_requires = fh.read().splitlines()
setuptools.setup(
name="EpubCrawler",
version=EpubCrawler.__version__,
url="https://github.com/apachecn/epub-crawler",
author=EpubCrawler.__author__,
author_email=EpubCrawler.__email__,
classifiers=[
"Development Status :: 4 - Beta",
"Environment :: Console",
"Intended Audience :: Developers",
"Intended Audience :: End Users/Desktop",
"License :: Other/Proprietary License",
"Natural Language :: Chinese (Simplified)",
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Topic :: Internet :: WWW/HTTP",
"Topic :: Text Processing :: Markup :: HTML",
"Topic :: Utilities",
],
    description="EpubCrawler: a small tool for crawling web content and packaging it into an EPUB",
long_description=long_description,
long_description_content_type="text/markdown",
keywords=[
"ebook",
"epub",
"crawler",
"爬虫",
"电子书",
],
install_requires=install_requires,
python_requires=">=3.6",
entry_points={
'console_scripts': [
"crawl-epub=EpubCrawler.__main__:main",
],
},
packages=setuptools.find_packages(),
)

update.sh Normal file (3 lines)

@@ -0,0 +1,3 @@
git add -A
git commit -am "$(date "+%Y-%m-%d %H:%M:%S")"
git push