Mirror of https://github.com/apachecn/epub-crawler.git
Synced 2025-06-03 23:58:27 +00:00

Commit a739d679f3: init
.gitignore (vendored, new file, 133 lines)
@@ -0,0 +1,133 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# custom
cookies.json
history.json
.travis.yml (new file, 15 lines)
@@ -0,0 +1,15 @@
language: python
python: 3.6

install:
  - 'pip install -r requirements.txt'

script:
  - ":"

deploy:
  - provider: pypi
    user: __token__
    password: $PYPI_TOKEN
    distributions: 'sdist bdist_wheel'
    skip_existing: true
EpubCrawler/__init__.py (new file, 10 lines)
@@ -0,0 +1,10 @@
#!/usr/bin/env python3.7
# -*- coding: utf-8 -*-

"""EpubCrawler
https://github.com/apachecn/epub-crawler"""

__author__ = "ApacheCN"
__email__ = "apachecn@163.com"
__license__ = "SATA"
__version__ = "2021.8.20.0"
EpubCrawler/__main__.py (new file, 151 lines)
@@ -0,0 +1,151 @@
#!/usr/bin/env python3.7
# -*- coding: utf-8 -*-

from urllib.parse import urljoin
import sys
import json
from pyquery import PyQuery as pq
import time
from os import path
import re
from concurrent.futures import ThreadPoolExecutor
from GenEpub import gen_epub
from . import *
from .util import *
from .img import *
from .config import config

def get_toc_from_cfg():
    # an explicit page list overrides crawling the TOC page
    if config['list'] and len(config['list']) > 0:
        return config['list']

    if not config['url']:
        print('URL not specified')
        sys.exit()

    html = request_retry(
        'GET', config['url'],
        retry=config['retry'],
        headers=config['headers'],
        timeout=config['timeout'],
        proxies=config['proxy'],
    ).content.decode(config['encoding'])
    return get_toc(html, config['url'])

def get_toc(html, base):
    root = pq(html)

    if config['remove']:
        root(config['remove']).remove()

    el_links = root(config['link'])
    vis = set()

    res = []
    for i in range(len(el_links)):
        el_link = el_links.eq(i)
        url = el_link.attr('href')
        if not url:
            # links without an href become plain section titles
            text = el_link.text().strip()
            res.append(text)
            continue

        url = re.sub(r'#.*$', '', url)
        if base:
            url = urljoin(base, url)
        if not url.startswith('http'):
            continue
        if url in vis: continue
        vis.add(url)
        res.append(url)

    return res

def get_article(html, url):
    root = pq(html)

    if config['remove']:
        root(config['remove']).remove()

    el_title = root(config['title']).eq(0)
    title = el_title.text().strip()
    el_title.remove()

    el_co = root(config['content'])
    co = '\r\n'.join([
        el_co.eq(i).html()
        for i in range(len(el_co))
    ])

    if config['credit']:
        credit = f"<blockquote>原文:<a href='{url}'>{url}</a></blockquote>"
        co = credit + co

    return {'title': title, 'content': co}

def tr_download_page(url, art, imgs):
    try:
        html = request_retry(
            'GET', url,
            retry=config['retry'],
            headers=config['headers'],
            timeout=config['timeout'],
            proxies=config['proxy'],
        ).content.decode(config['encoding'])
        art.update(get_article(html, url))
        art['content'] = process_img(
            art['content'], imgs,
            page_url=url,
            img_prefix='../Images/',
        )
        time.sleep(config['wait'])
    except Exception as ex:
        print(ex)

def main():
    cfg_fname = sys.argv[1] \
        if len(sys.argv) > 1 \
        else 'config.json'
    if not path.exists(cfg_fname):
        print('please provide config file')
        return

    user_cfg = json.loads(open(cfg_fname, encoding='utf-8').read())
    config.update(user_cfg)
    if config['proxy']:
        # wrap the single proxy URL into the dict that requests expects
        proxies = {
            'http': config['proxy'],
            'https': config['proxy'],
        }
        config['proxy'] = proxies
    set_img_pool(ThreadPoolExecutor(config['imgThreads']))

    toc = get_toc_from_cfg()
    articles = []
    imgs = {}
    if config['name']:
        articles.append({
            'title': config['name'],
            'content': f"<p>来源:<a href='{config['url']}'>{config['url']}</a></p>",
        })

    text_pool = ThreadPoolExecutor(config['textThreads'])
    hdls = []
    for url in toc:
        print(f'page: {url}')
        if url.startswith('http'):
            art = {}
            articles.append(art)
            hdl = text_pool.submit(tr_download_page, url, art, imgs)
            hdls.append(hdl)
        else:
            # non-URL TOC entries become empty section pages
            articles.append({'title': url, 'content': ''})

    for h in hdls: h.result()

    gen_epub(articles, imgs)
    print('done...')

if __name__ == '__main__': main()
EpubCrawler/config.py (new file, 25 lines)
@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-

config = {
    'name': '',     # book title in the metadata, also the output file name
    'url': '',      # URL of the table-of-contents page
    'link': '',     # selector for the <a> links on the TOC page
    'title': '',    # selector for the article title
    'content': '',  # selector for the article content
    'remove': '',   # selector for elements to strip from article pages
    'retry': 10,    # retries per HTTP request
    'wait': 0,      # delay between requests, in seconds
    'encoding': 'utf-8',
    'credit': True,  # prepend a link back to the original page
    'headers': {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
    },
    'list': [],      # explicit page list; overrides 'url' and 'link'
    'optiMode': 'quant',  # imgyaso image-optimization mode
    'colors': 8,          # 'colors' argument passed to imgyaso
    'timeout': 8,         # HTTP timeout, in seconds
    'imgSrc': ['data-src', 'data-original-src', 'src'],  # attributes probed for image URLs
    'proxy': '',          # proxy in the form <protocol>://<host>:<port>
    'textThreads': 5,     # worker threads for pages
    'imgThreads': 5,      # worker threads for images
}
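Note that `main` in `__main__.py` above merges the user's JSON over these defaults with a plain `dict.update`, so top-level keys such as `headers` are replaced wholesale rather than deep-merged. A small standalone sketch of that behavior (trimmed-down dicts, not the real defaults):

```python
# dict.update replaces top-level keys wholesale; nested dicts are not merged.
config = {'retry': 10, 'headers': {'User-Agent': 'UA'}}
user_cfg = {'retry': 3, 'headers': {'Referer': 'https://example.com'}}
config.update(user_cfg)
assert config['retry'] == 3
assert 'User-Agent' not in config['headers']
```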
EpubCrawler/img.py (new file, 66 lines)
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-

from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, quote_plus
import hashlib
from pyquery import PyQuery as pq
import time
from .util import *
from .config import config

img_pool = ThreadPoolExecutor(5)

def set_img_pool(pool):
    global img_pool
    img_pool = pool

def get_img_src(el_img):
    # probe the configured attributes ('data-src', ..., 'src') in order
    url = ''
    for prop in config['imgSrc']:
        url = el_img.attr(prop)
        if url: break
    return url

def tr_download_img(url, imgs, picname):
    try:
        data = request_retry(
            'GET', url,
            headers=config['headers'],
            retry=config['retry'],
            timeout=config['timeout'],
            proxies=config['proxy'],
        ).content
        data = opti_img(data, config['optiMode'], config['colors'])
        imgs[picname] = data
        time.sleep(config['wait'])
    except Exception as ex:
        print(ex)

def process_img(html, imgs, **kw):
    kw.setdefault('img_prefix', 'img/')

    root = pq(html)
    el_imgs = root('img')
    hdls = []

    for i in range(len(el_imgs)):
        el_img = el_imgs.eq(i)
        url = get_img_src(el_img)
        if not url: continue
        if not url.startswith('http'):
            if kw.get('page_url'):
                url = urljoin(kw.get('page_url'), url)
            else: continue

        # name the image by the MD5 of its URL, so repeats are fetched once
        picname = hashlib.md5(url.encode('utf-8')).hexdigest() + '.png'
        print(f'pic: {url} => {picname}')
        if picname not in imgs:
            hdl = img_pool.submit(tr_download_img, url, imgs, picname)
            hdls.append(hdl)

        el_img.attr('src', kw['img_prefix'] + picname)

    for h in hdls: h.result()
    return root.html()
EpubCrawler/util.py (new file, 28 lines)
@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-

import requests
from imgyaso import pngquant_bts, \
    adathres_bts, grid_bts, noise_bts, trunc_bts

def request_retry(method, url, retry=10, **kw):
    kw.setdefault('timeout', 10)
    for i in range(retry):
        try:
            return requests.request(method, url, **kw)
        except KeyboardInterrupt as e:
            raise e
        except Exception as e:
            print(f'{url} retry {i}')
            # re-raise only once the last attempt has failed
            if i == retry - 1: raise e

def opti_img(img, mode, colors):
    # dispatch to the imgyaso codec selected by 'optiMode'
    if mode == 'quant':
        return pngquant_bts(img, colors)
    elif mode == 'grid':
        return grid_bts(img)
    elif mode == 'trunc':
        return trunc_bts(img, colors)
    elif mode == 'thres':
        return adathres_bts(img)
    else:
        return img
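For reference, `request_retry` is a thin wrapper over `requests.request`, so any keyword argument `requests` accepts (`headers`, `timeout`, `proxies`, ...) passes straight through. A usage sketch with a hypothetical URL:

```python
# Fetch a TOC page the way __main__ does, retrying transient failures,
# then decode with the configured encoding.
from EpubCrawler.util import request_retry

html = request_retry(
    'GET', 'https://example.com/toc.html',  # hypothetical URL
    retry=3,
    headers={'User-Agent': 'Mozilla/5.0'},
    timeout=8,
).content.decode('utf-8')
```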
LICENSE (new file, 37 lines)
@@ -0,0 +1,37 @@
The Star And Thank Author License (SATA)

Copyright © 2021 ApacheCN (apachecn@163.com)

Project Url: https://github.com/apachecn/epub-crawler

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

And wait, the most important, you shall star/+1/like the project(s) in project url
section above first, and then thank the author(s) in Copyright section.

Here are some suggested ways:

- Email the authors a thank-you letter, and make friends with him/her/them.
- Report bugs or issues.
- Tell friends what a wonderful project this is.
- And, sure, you can just express thanks in your mind without telling the world.

Contributors of this project by forking have the option to add his/her name and
forked project url at copyright and project url sections, but shall not delete
or modify anything else in these two sections.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
README.md (new file, 132 lines)
@@ -0,0 +1,132 @@
# epub-crawler

A small tool for crawling web content and packaging it as an EPUB.

## Installation

Via pip (recommended):

```
pip install epub-crawler
```

From source:

```
pip install git+https://github.com/apachecn/epub-crawler
```

## Usage

```
crawl-epub [CONFIG]

CONFIG: a config file in JSON format, defaulting to config.json in the current working directory
```

The config file supports the following properties:

+ `name: String`

    Book title for the metadata, and also the name of the file saved in the current working directory.

+ `url: String` (optional)

    URL of the table-of-contents page.

+ `link: String` (optional)

    Selector for the `<a>` links on the TOC page.

+ `list: [String]` (optional)

    Explicit list of pages to crawl. If this list is non-empty, it is used and `url` and `link` are ignored.

+ `title: String`

    Selector for the title on an article page.

+ `content: String`

    Selector for the content on an article page.

+ `remove: String` (optional)

    Selector for elements to remove from article pages.

+ `credit: Boolean` (optional)

    Whether to include a link back to the original page.

+ `headers: {String: String}` (optional)

    HTTP request headers, defaulting to `{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"}`.

+ `retry: Integer` (optional)

    Number of retries per HTTP request, defaulting to 10.

+ `wait: Float` (optional)

    Interval between two requests in seconds, defaulting to 0.

+ `timeout: Integer` (optional)

    HTTP request timeout in seconds, defaulting to 8.

+ `encoding: String` (optional)

    Page encoding, defaulting to UTF-8.

+ `optiMode: String` (optional)

    Image-processing mode: `'none'` means no processing; for other values see the modes supported by imgyaso. Defaults to `'quant'`.

+ `colors: Integer` (optional)

    The `colors` argument passed to imgyaso, defaulting to 8.

+ `imgSrc: [String]` (optional)

    Attributes checked for image sources, defaulting to `["data-src", "data-original-src", "src"]`.

+ `proxy: String` (optional)

    Proxy to use, in the format `<protocol>://<host>:<port>`.

+ `textThreads: Integer` (optional)

    Number of threads for crawling text, defaulting to 5.

+ `imgThreads: Integer` (optional)

    Number of threads for crawling images, defaulting to 5.

An example for crawling our PyTorch 1.4 docs:

```json
{
    "name": "PyTorch 1.4 中文文档 & 教程",
    "url": "https://gitee.com/apachecn/pytorch-doc-zh/blob/master/docs/1.4/SUMMARY.md",
    "link": ".markdown-body li a",
    "title": ".markdown-body>h1",
    "content": ".markdown-body",
    "remove": "a.anchor"
}
```
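With this saved as `config.json` in the working directory, the crawl is started with the console script that setup.py installs (the output `.epub` takes its name from the `name` field):

```
crawl-epub config.json
```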
## License

This project is released under the SATA license.

You are obliged to star this open-source project, and please consider giving the author an appropriate reward on top of that.

## Sponsor us

![Donate](https://home.apachecn.org/img/about/donate.jpg)

## See also

+ [ApacheCN learning resources](https://docs.apachecn.org/)
+ [Computer e-books](http://it-ebooks.flygon.net)
+ [布客新知](http://flygon.net/ixinzhi/)
history.md (new file, 5 lines)
@@ -0,0 +1,5 @@
# History

v2021.8.20

+ Rewritten in Python
requirements.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
requests
pyquery
GenEpub
imgyaso
setup.py (new file, 54 lines)
@@ -0,0 +1,54 @@
#!/usr/bin/env python3.7
# -*- coding: utf-8 -*-

import setuptools
import EpubCrawler

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

with open("requirements.txt", "r", encoding="utf-8") as fh:
    install_requires = fh.read().splitlines()

setuptools.setup(
    name="EpubCrawler",
    version=EpubCrawler.__version__,
    url="https://github.com/apachecn/epub-crawler",
    author=EpubCrawler.__author__,
    author_email=EpubCrawler.__email__,
    classifiers=[
        "Development Status :: 4 - Beta",
        "Environment :: Console",
        "Intended Audience :: Developers",
        "Intended Audience :: End Users/Desktop",
        "License :: Other/Proprietary License",
        "Natural Language :: Chinese (Simplified)",
        "Natural Language :: English",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3 :: Only",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Topic :: Internet :: WWW/HTTP",
        "Topic :: Text Processing :: Markup :: HTML",
        "Topic :: Utilities",
    ],
    description="EpubCrawler,用于抓取网页内容并制作 EPUB 的小工具",
    long_description=long_description,
    long_description_content_type="text/markdown",
    keywords=[
        "ebook",
        "epub",
        "crawler",
        "爬虫",
        "电子书",
    ],
    install_requires=install_requires,
    python_requires=">=3.6",
    entry_points={
        'console_scripts': [
            "crawl-epub=EpubCrawler.__main__:main",
        ],
    },
    packages=setuptools.find_packages(),
)