From 9fe8c24344718b2d31c35c728dae1d319a0a04c7 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Tue, 27 Feb 2018 08:27:27 -0500 Subject: [PATCH] Docs Signed-off-by: Kenneth Reitz --- Pipfile | 1 + Pipfile.lock | 118 +++++++++++++++++++- docs/Makefile | 20 ++++ docs/make.bat | 36 ++++++ docs/source/conf.py | 197 +++++++++++++++++++++++++++++++++ docs/source/index.rst | 251 ++++++++++++++++++++++++++++++++++++++++++ requests_html.py | 33 ++++-- 7 files changed, 640 insertions(+), 16 deletions(-) create mode 100644 docs/Makefile create mode 100644 docs/make.bat create mode 100644 docs/source/conf.py create mode 100644 docs/source/index.rst diff --git a/Pipfile b/Pipfile index 2aca609..f88338d 100644 --- a/Pipfile +++ b/Pipfile @@ -22,6 +22,7 @@ twine = "*" requests-file = "*" pytest = "*" "e1839a8" = {path = ".", editable = true} +sphinx = "*" [scripts] diff --git a/Pipfile.lock b/Pipfile.lock index d150e86..0ee7d2b 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "c88fa0d21e44545a4b9ce13ff463f2da284a846029b315dbb8757057b70166a4" + "sha256": "f6a1a62089049c03d073d0255f1547245bfb4277c62c8df273afc3fcc4f5e127" }, "host-environment-markers": { "implementation_name": "cpython", @@ -177,6 +177,13 @@ } }, "develop": { + "alabaster": { + "hashes": [ + "sha256:2eef172f44e8d301d25aff8068fddd65f767a3f04b5f15b0f4922f113aa1c732", + "sha256:37cdcb9e9954ed60912ebc1ca12a9d12178c26637abdf124e3cde2341c257fe0" + ], + "version": "==0.7.10" + }, "attrs": { "hashes": [ "sha256:a17a9573a6f475c99b551c0e0a812707ddda1ec9653bed04c13841404ed6f450", @@ -184,6 +191,13 @@ ], "version": "==17.4.0" }, + "babel": { + "hashes": [ + "sha256:ad209a68d7162c4cff4b29cdebe3dec4cef75492df501b0049a9433c96ce6f80", + "sha256:8ce4cb6fdd4393edd323227cba3a077bceb2a6ce5201c902c65e730046f41f14" + ], + "version": "==2.5.3" + }, "beautifulsoup4": { "hashes": [ "sha256:7015e76bf32f1f574636c4288399a6de66ce08fb7b2457f628a8d70c0fbabb11", @@ -219,6 +233,14 @@ ], "version": 
"==1.0.3" }, + "docutils": { + "hashes": [ + "sha256:7a4bd47eaf6596e1295ecb11361139febe29b084a87bf005bf899f9a42edc3c6", + "sha256:02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6", + "sha256:51e64ef2ebfb29cae1faa133b3710143496eca21c530f3f71424d77687764274" + ], + "version": "==0.14" + }, "e1839a8": { "editable": true, "path": "." @@ -236,6 +258,20 @@ ], "version": "==2.6" }, + "imagesize": { + "hashes": [ + "sha256:3620cc0cadba3f7475f9940d22431fc4d407269f1be59ec9b8edcca26440cf18", + "sha256:5b326e4678b6925158ccc66a9fa3122b6106d7c876ee32d7de6ce59385b96315" + ], + "version": "==1.0.0" + }, + "jinja2": { + "hashes": [ + "sha256:74c935a1b8bb9a3947c50a54766a969d4846290e1e788ea44c1392163723c3bd", + "sha256:f84be1bb0040caca4cea721fcbbbbd61f9be9464ca236387158b0feea01914a4" + ], + "version": "==2.10" + }, "lxml": { "hashes": [ "sha256:41f59cbdab232f11680d5d4dec9f2e6782fd24d78e37ee833447702e34e675f4", @@ -269,6 +305,19 @@ ], "version": "==4.1.1" }, + "markupsafe": { + "hashes": [ + "sha256:a6be69091dac236ea9c6bc7d012beab42010fa914c459791d627dad4910eb665" + ], + "version": "==1.0" + }, + "packaging": { + "hashes": [ + "sha256:99276dc6e3a7851f32027a68f1095cd3f77c148091b092ea867a351811cfe388", + "sha256:5d50835fdf0a7edf0b55e311b7c887786504efea1177abd7e69329a8e5ea619e" + ], + "version": "==16.8" + }, "parse": { "hashes": [ "sha256:8048dde3f5ca07ad7ac7350460952d83b63eaacecdac1b37f45fd74870d849d2" @@ -295,6 +344,25 @@ ], "version": "==1.5.2" }, + "pygments": { + "hashes": [ + "sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d", + "sha256:dbae1046def0efb574852fab9e90209b23f556367b5a320c0bcb871c77c3e8cc" + ], + "version": "==2.2.0" + }, + "pyparsing": { + "hashes": [ + "sha256:fee43f17a9c4087e7ed1605bd6df994c6173c1e977d7ade7b651292fab2bd010", + "sha256:0832bcf47acd283788593e7a0f542407bd9550a55a8a8435214a1960e04bcb04", + "sha256:9e8143a3e15c13713506886badd96ca4b579a87fbdf49e550dbfc057d6cb218e", + 
"sha256:281683241b25fe9b80ec9d66017485f6deff1af5cde372469134b56ca8447a07", + "sha256:b8b3117ed9bdf45e14dcc89345ce638ec7e0e29b2b579fa1ecf32ce45ebac8a5", + "sha256:8f1e18d3fd36c6795bb7e02a39fd05c611ffc2596c1e0d995d34d67630426c18", + "sha256:e4d45427c6e20a59bf4f88c639dcc03ce30d193112047f94012102f235853a58" + ], + "version": "==2.2.0" + }, "pyquery": { "hashes": [ "sha256:07987c2ed2aed5cba29ff18af95e56e9eb04a2249f42ce47bddfb37f487229a3", @@ -309,6 +377,20 @@ ], "version": "==3.4.1" }, + "pytz": { + "hashes": [ + "sha256:ed6509d9af298b7995d69a440e2822288f2eca1681b8cce37673dbb10091e5fe", + "sha256:f93ddcdd6342f94cea379c73cddb5724e0d6d0a1c91c9bdef364dc0368ba4fda", + "sha256:61242a9abc626379574a166dc0e96a66cd7c3b27fc10868003fa210be4bff1c9", + "sha256:ba18e6a243b3625513d85239b3e49055a2f0318466e0b8a92b8fb8ca7ccdf55f", + "sha256:07edfc3d4d2705a20a6e99d97f0c4b61c800b8232dc1c04d87e8554f130148dd", + "sha256:3a47ff71597f821cd84a162e71593004286e5be07a340fd462f0d33a760782b5", + "sha256:5bd55c744e6feaa4d599a6cbd8228b4f8f9ba96de2c38d56f08e534b3c9edf0d", + "sha256:887ab5e5b32e4d0c86efddd3d055c1f363cbaa583beb8da5e22d2fa2f64d51ef", + "sha256:410bcd1d6409026fbaa65d9ed33bf6dd8b1e94a499e32168acfc7b332e4095c0" + ], + "version": "==2018.3" + }, "requests": { "hashes": [ "sha256:6a1b267aa90cac58ac3a765d067950e7dbbf75b1da07e895d1f594193a40a38b", @@ -337,12 +419,33 @@ ], "version": "==1.11.0" }, + "snowballstemmer": { + "hashes": [ + "sha256:9f3bcd3c401c3e862ec0ebe6d2c069ebc012ce142cce209c098ccb5b09136e89", + "sha256:919f26a68b2c17a7634da993d91339e288964f93c274f1343e3bbbe2096e1128" + ], + "version": "==1.2.1" + }, + "sphinx": { + "hashes": [ + "sha256:41ae26acc6130ccf6ed47e5cca73742b80d55a134f0ab897c479bba8d3640b8e", + "sha256:da987de5fcca21a4acc7f67a86a363039e67ac3e8827161e61b91deb131c0ee8" + ], + "version": "==1.7.1" + }, + "sphinxcontrib-websupport": { + "hashes": [ + "sha256:f4932e95869599b89bf4f80fc3989132d83c9faa5bf633e7b5e0c25dffb75da2", + 
"sha256:7a85961326aa3a400cd4ad3c816d70ed6f7c740acd7ce5d78cd0a67825072eb9" + ], + "version": "==1.0.1" + }, "tqdm": { "hashes": [ - "sha256:4c041f8019f7be65b8028ddde9a836f7ccc51c4637f1ff2ba9b5813d38d19d5a", - "sha256:df32e6f127dc0ccbc675eadb33f749abbcb8f174c5cb9ec49c0cdb73aa737377" + "sha256:f66468c14ccd011a627734c9b3fd72f20ce16f8faecc47384eb2507af5924fb9", + "sha256:5ec0d4442358e55cdb4a0471d04c6c831518fd8837f259db5537d90feab380df" ], - "version": "==4.19.5" + "version": "==4.19.6" }, "twine": { "hashes": [ @@ -357,6 +460,13 @@ "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f" ], "version": "==1.22" + }, + "w3lib": { + "hashes": [ + "sha256:aaf7362464532b1036ab0092e2eee78e8fd7b56787baa9ed4967457b083d011b", + "sha256:55994787e93b411c2d659068b51b9998d9d0c05e0df188e6daf8f45836e1ea38" + ], + "version": "==1.19.0" } } } diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..2a2f5ff --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = requests-html +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..b2c56ed --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,36 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build +set SPHINXPROJ=requests-html + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..cde5294 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,197 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/stable/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- +import requests_html + +project = 'requests-html' +copyright = '2018, Kenneth Reitz' +author = 'Kenneth Reitz' + +# The short X.Y version +version = '' +# The full version, including alpha/beta/rc tags +release = 'v0.3.4' + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.doctest', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.viewcode', + 'sphinx.ext.githubpages', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path . +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. 
See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'requests-htmldoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'requests-html.tex', 'requests-html Documentation', + 'Kenneth Reitz', 'manual'), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). 
+man_pages = [ + (master_doc, 'requests-html', 'requests-html Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'requests-html', 'requests-html Documentation', + author, 'requests-html', 'One line description of project.', + 'Miscellaneous'), +] + + +# -- Options for Epub output ------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project +epub_author = author +epub_publisher = author +epub_copyright = copyright + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ['search.html'] + + +# -- Extension configuration ------------------------------------------------- + +# -- Options for intersphinx extension --------------------------------------- + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = {'https://docs.python.org/': None} + +# -- Options for todo extension ---------------------------------------------- + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = True \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..acaa208 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,251 @@ +.. requests-html documentation master file, created by + sphinx-quickstart on Tue Feb 27 08:03:45 2018. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Requests-HTML: HTML Parsing for Humans! 
+======================================== + + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + + +.. image:: https://travis-ci.org/kennethreitz/requests-html.svg?branch=master + :target: https://travis-ci.org/kennethreitz/requests-html + +This library intends to make parsing HTML (e.g. scraping the web) as +simple and intuitive as possible. + +When using this library you automatically get: + +- Optional JavaScript support! +- CSS Selectors (a.k.a jQuery-style, thanks to PyQuery). +- XPath Selectors, for the faint at heart. +- Mocked user-agent (like a real web browser). +- Automatic following of redirects. +- Connection–pooling and cookie persistence. +- The Requests experience you know and love, with magical parsing abilities. + +.. Other nice features include: + + - Markdown export of pages and elements. + + +Installation +============ + +.. code-block:: shell + + $ pipenv install requests-html + ✨🍰✨ + +Or, if you want access to ``BrowserHTMLSession``: + +.. code-block:: shell + + $ pipenv install requests-html[browser] + ✨🍰✨ + +Only Python 3 is supported. + + +Usage +===== + +Make a GET request to 'python.org', using Requests: + +.. code-block:: pycon + + >>> from requests_html import HTMLSession + >>> session = HTMLSession() + + >>> r = session.get('https://python.org/') + +Grab a list of all links on the page, as–is (anchors excluded): + +.. 
code-block:: pycon + + >>> r.html.links + {'//docs.python.org/3/tutorial/', '/about/apps/', 'https://github.com/python/pythondotorg/issues', '/accounts/login/', '/dev/peps/', '/about/legal/', '//docs.python.org/3/tutorial/introduction.html#lists', '/download/alternatives', 'http://feedproxy.google.com/~r/PythonInsider/~3/kihd2DW98YY/python-370a4-is-available-for-testing.html', '/download/other/', '/downloads/windows/', 'https://mail.python.org/mailman/listinfo/python-dev', '/doc/av', 'https://devguide.python.org/', '/about/success/#engineering', 'https://wiki.python.org/moin/PythonEventsCalendar#Submitting_an_Event', 'https://www.openstack.org', '/about/gettingstarted/', 'http://feedproxy.google.com/~r/PythonInsider/~3/AMoBel8b8Mc/python-3.html', '/success-stories/industrial-light-magic-runs-python/', 'http://docs.python.org/3/tutorial/introduction.html#using-python-as-a-calculator', '/', 'http://pyfound.blogspot.com/', '/events/python-events/past/', '/downloads/release/python-2714/', 'https://wiki.python.org/moin/PythonBooks', 'http://plus.google.com/+Python', 'https://wiki.python.org/moin/', 'https://status.python.org/', '/community/workshops/', '/community/lists/', 'http://buildbot.net/', '/community/awards', 'http://twitter.com/ThePSF', 'https://docs.python.org/3/license.html', '/psf/donations/', 'http://wiki.python.org/moin/Languages', '/dev/', '/events/python-user-group/', 'https://wiki.qt.io/PySide', '/community/sigs/', 'https://wiki.gnome.org/Projects/PyGObject', 'http://www.ansible.com', 'http://www.saltstack.com', 'http://planetpython.org/', '/events/python-events', '/about/help/', '/events/python-user-group/past/', '/about/success/', '/psf-landing/', '/about/apps', '/about/', 'http://www.wxpython.org/', '/events/python-user-group/665/', 'https://www.python.org/psf/codeofconduct/', '/dev/peps/peps.rss', '/downloads/source/', '/psf/sponsorship/sponsors/', 'http://bottlepy.org', 'http://roundup.sourceforge.net/', 'http://pandas.pydata.org/', 
'http://brochure.getpython.info/', 'https://bugs.python.org/', '/community/merchandise/', 'http://tornadoweb.org', '/events/python-user-group/650/', 'http://flask.pocoo.org/', '/downloads/release/python-364/', '/events/python-user-group/660/', '/events/python-user-group/638/', '/psf/', '/doc/', 'http://blog.python.org', '/events/python-events/604/', '/about/success/#government', 'http://python.org/dev/peps/', 'https://docs.python.org', 'http://feedproxy.google.com/~r/PythonInsider/~3/zVC80sq9s00/python-364-is-now-available.html', '/users/membership/', '/about/success/#arts', 'https://wiki.python.org/moin/Python2orPython3', '/downloads/', '/jobs/', 'http://trac.edgewall.org/', 'http://feedproxy.google.com/~r/PythonInsider/~3/wh73_1A-N7Q/python-355rc1-and-python-348rc1-are-now.html', '/privacy/', 'https://pypi.python.org/', 'http://www.riverbankcomputing.co.uk/software/pyqt/intro', 'http://www.scipy.org', '/community/forums/', '/about/success/#scientific', '/about/success/#software-development', '/shell/', '/accounts/signup/', 'http://www.facebook.com/pythonlang?fref=ts', '/community/', 'https://kivy.org/', '/about/quotes/', 'http://www.web2py.com/', '/community/logos/', '/community/diversity/', '/events/calendars/', 'https://wiki.python.org/moin/BeginnersGuide', '/success-stories/', '/doc/essays/', '/dev/core-mentorship/', 'http://ipython.org', '/events/', '//docs.python.org/3/tutorial/controlflow.html', '/about/success/#education', '/blogs/', '/community/irc/', 'http://pycon.blogspot.com/', '//jobs.python.org', 'http://www.pylonsproject.org/', 'http://www.djangoproject.com/', '/downloads/mac-osx/', '/about/success/#business', 'http://feedproxy.google.com/~r/PythonInsider/~3/x_c9D0S-4C4/python-370b1-is-now-available-for.html', 'http://wiki.python.org/moin/TkInter', 'https://docs.python.org/faq/', '//docs.python.org/3/tutorial/controlflow.html#defining-functions'} + +Grab a list of all links on the page, in absolute form (anchors excluded): + +.. 
code-block:: pycon + + >>> r.html.absolute_links + {'https://github.com/python/pythondotorg/issues', 'https://docs.python.org/3/tutorial/', 'https://www.python.org/about/success/', 'http://feedproxy.google.com/~r/PythonInsider/~3/kihd2DW98YY/python-370a4-is-available-for-testing.html', 'https://www.python.org/dev/peps/', 'https://mail.python.org/mailman/listinfo/python-dev', 'https://www.python.org/doc/', 'https://www.python.org/', 'https://www.python.org/about/', 'https://www.python.org/events/python-events/past/', 'https://devguide.python.org/', 'https://wiki.python.org/moin/PythonEventsCalendar#Submitting_an_Event', 'https://www.openstack.org', 'http://feedproxy.google.com/~r/PythonInsider/~3/AMoBel8b8Mc/python-3.html', 'https://docs.python.org/3/tutorial/introduction.html#lists', 'http://docs.python.org/3/tutorial/introduction.html#using-python-as-a-calculator', 'http://pyfound.blogspot.com/', 'https://wiki.python.org/moin/PythonBooks', 'http://plus.google.com/+Python', 'https://wiki.python.org/moin/', 'https://www.python.org/events/python-events', 'https://status.python.org/', 'https://www.python.org/about/apps', 'https://www.python.org/downloads/release/python-2714/', 'https://www.python.org/psf/donations/', 'http://buildbot.net/', 'http://twitter.com/ThePSF', 'https://docs.python.org/3/license.html', 'http://wiki.python.org/moin/Languages', 'https://docs.python.org/faq/', 'https://jobs.python.org', 'https://www.python.org/about/success/#software-development', 'https://www.python.org/about/success/#education', 'https://www.python.org/community/logos/', 'https://www.python.org/doc/av', 'https://wiki.qt.io/PySide', 'https://www.python.org/events/python-user-group/660/', 'https://wiki.gnome.org/Projects/PyGObject', 'http://www.ansible.com', 'http://www.saltstack.com', 'https://www.python.org/dev/peps/peps.rss', 'http://planetpython.org/', 'https://www.python.org/events/python-user-group/past/', 
'https://docs.python.org/3/tutorial/controlflow.html#defining-functions', 'https://www.python.org/community/diversity/', 'https://docs.python.org/3/tutorial/controlflow.html', 'https://www.python.org/community/awards', 'https://www.python.org/events/python-user-group/638/', 'https://www.python.org/about/legal/', 'https://www.python.org/dev/', 'https://www.python.org/download/alternatives', 'https://www.python.org/downloads/', 'https://www.python.org/community/lists/', 'http://www.wxpython.org/', 'https://www.python.org/about/success/#government', 'https://www.python.org/psf/', 'https://www.python.org/psf/codeofconduct/', 'http://bottlepy.org', 'http://roundup.sourceforge.net/', 'http://pandas.pydata.org/', 'http://brochure.getpython.info/', 'https://www.python.org/downloads/source/', 'https://bugs.python.org/', 'https://www.python.org/downloads/mac-osx/', 'https://www.python.org/about/help/', 'http://tornadoweb.org', 'http://flask.pocoo.org/', 'https://www.python.org/users/membership/', 'http://blog.python.org', 'https://www.python.org/privacy/', 'https://www.python.org/about/gettingstarted/', 'http://python.org/dev/peps/', 'https://www.python.org/about/apps/', 'https://docs.python.org', 'https://www.python.org/success-stories/', 'https://www.python.org/community/forums/', 'http://feedproxy.google.com/~r/PythonInsider/~3/zVC80sq9s00/python-364-is-now-available.html', 'https://www.python.org/community/merchandise/', 'https://www.python.org/about/success/#arts', 'https://wiki.python.org/moin/Python2orPython3', 'http://trac.edgewall.org/', 'http://feedproxy.google.com/~r/PythonInsider/~3/wh73_1A-N7Q/python-355rc1-and-python-348rc1-are-now.html', 'https://pypi.python.org/', 'https://www.python.org/events/python-user-group/650/', 'http://www.riverbankcomputing.co.uk/software/pyqt/intro', 'https://www.python.org/about/quotes/', 'https://www.python.org/downloads/windows/', 'https://www.python.org/events/calendars/', 'http://www.scipy.org', 
'https://www.python.org/community/workshops/', 'https://www.python.org/blogs/', 'https://www.python.org/accounts/signup/', 'https://www.python.org/events/', 'https://kivy.org/', 'http://www.facebook.com/pythonlang?fref=ts', 'http://www.web2py.com/', 'https://www.python.org/psf/sponsorship/sponsors/', 'https://www.python.org/community/', 'https://www.python.org/download/other/', 'https://www.python.org/psf-landing/', 'https://www.python.org/events/python-user-group/665/', 'https://wiki.python.org/moin/BeginnersGuide', 'https://www.python.org/accounts/login/', 'https://www.python.org/downloads/release/python-364/', 'https://www.python.org/dev/core-mentorship/', 'https://www.python.org/about/success/#business', 'https://www.python.org/community/sigs/', 'https://www.python.org/events/python-user-group/', 'http://ipython.org', 'https://www.python.org/shell/', 'https://www.python.org/community/irc/', 'https://www.python.org/about/success/#engineering', 'http://www.pylonsproject.org/', 'http://pycon.blogspot.com/', 'https://www.python.org/about/success/#scientific', 'https://www.python.org/doc/essays/', 'http://www.djangoproject.com/', 'https://www.python.org/success-stories/industrial-light-magic-runs-python/', 'http://feedproxy.google.com/~r/PythonInsider/~3/x_c9D0S-4C4/python-370b1-is-now-available-for.html', 'http://wiki.python.org/moin/TkInter', 'https://www.python.org/jobs/', 'https://www.python.org/events/python-events/604/'} + +Select an element with a CSS Selector: + +.. code-block:: pycon + + >>> about = r.html.find('#about', first=True) + +Grab an element's text contents: + +.. code-block:: pycon + + >>> print(about.text) + About + Applications + Quotes + Getting Started + Help + Python Brochure + +Introspect an Element's attributes: + +.. code-block:: pycon + + >>> about.attrs + {'id': 'about', 'class': ('tier-1', 'element-1'), 'aria-haspopup': 'true'} + +Render out an Element's HTML: + +.. code-block:: pycon + + >>> about.html + '
  • \nAbout\n\n
  • ' + + + +Select Elements within Elements: + +.. code-block:: pycon + + >>> about.find('a') + [, , , , , ] + +Search for links within an element: + +.. code-block:: pycon + + >>> about.absolute_links + {'http://brochure.getpython.info/', 'https://www.python.org/about/gettingstarted/', 'https://www.python.org/about/', 'https://www.python.org/about/quotes/', 'https://www.python.org/about/help/', 'https://www.python.org/about/apps/'} + + +Search for text on the page: + +.. code-block:: pycon + + >>> r.html.search('Python is a {} language')[0] + programming + +More complex CSS Selector example (copied from Chrome dev tools): + +.. code-block:: pycon + + >>> r = session.get('https://github.com/') + >>> sel = 'body > div.application-main > div.jumbotron.jumbotron-codelines > div > div > div.col-md-7.text-center.text-md-left > p' + + >>> print(r.html.find(sel, first=True).text) + GitHub is a development platform inspired by the way you work. From open source to business, you can host and review code, manage projects, and build software alongside millions of other developers. + +XPath is also supported: + +.. code-block:: pycon + + >>> r.html.xpath('a') + [] + +JavaScript Support +================== + +Let's grab some text that's rendered by JavaScript: + +.. code-block:: shell + + $ pipenv install requests-html[browser] + +.. code-block:: pycon + + >>> from requests_html import BrowserHTMLSession + >>> session = BrowserHTMLSession() + + >>> r = session.get('http://python-requests.org') + >>> r.html.search('Python 2 will retire in only {months} months!')['months'] + '' + + +Using without Requests +====================== + +You can also use this library without Requests: + +.. 
code-block:: pycon + + >>> from requests_html import HTML + >>> doc = """<a href='https://httpbin.org'>""" + + >>> html = HTML(html=doc, url='fakeurl', default_encoding='utf-8') + >>> html.links + {'https://httpbin.org'} + + +Other Fun (with Markdown) +========================= + +If you'd like to take an element and convert it to Markdown, for example, use `html2text`, by Aaron Swartz: + +.. code-block:: shell + + $ pipenv install html2text + +.. code-block:: pycon + + >>> from html2text import HTML2Text + >>> h = HTML2Text() + >>> print(h.handle(about.html)) + * [About](/about/) + + * [Applications](/about/apps/) + * [Quotes](/about/quotes/) + * [Getting Started](/about/gettingstarted/) + * [Help](/about/help/) + * [Python Brochure](http://brochure.getpython.info/) + + +API Documentation +================= + +Main Classes +------------ + +.. module:: requests_html + +These classes are the main interface to ``requests-html``: + + +.. autoclass:: HTML + :inherited-members: + +.. autoclass:: Element + :inherited-members: + +Utility Functions +----------------- + +.. autofunction:: user_agent + +HTML Sessions +------------- + +These sessions are for making HTTP requests: + +.. autoclass:: HTMLSession + :inherited-members: + +.. autoclass:: BrowserHTMLSession + :inherited-members: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/requests_html.py b/requests_html.py index 445d517..56d20af 100644 --- a/requests_html.py +++ b/requests_html.py @@ -22,7 +22,10 @@ DEFAULT_ENCODING = 'utf-8' useragent = UserAgent() class HTMLResponse(requests.Response): - """docstring for Response""" + """An HTML-enabled :class:`Response <Response>` object. + Same as Requests :class:`Response <Response>` object, but with an + intelligent ``.html`` property added. 
+ """ def __init__(self, *args, **kwargs): super(HTMLResponse, self).__init__(*args, **kwargs) @@ -57,6 +60,7 @@ class BaseParser: @property def html(self): + """Unicode representation of the HTML content.""" if self._html: return self._html else: @@ -64,10 +68,14 @@ class BaseParser: @html.setter def set_html(self, html): + """Property setter for self.html.""" self._html = html @property def encoding(self): + """The encoding string to be used, extracted from the HTML and + :class:`HTMLResponse <HTMLResponse>` headers. + """ if self._encoding: return self._encoding @@ -79,7 +87,7 @@ class BaseParser: @property def pq(self): - """PyQuery representation of the element.""" + """PyQuery representation of the :class:`Element <Element>` or :class:`HTML <HTML>`.""" return PyQuery(self.element) @property @@ -88,16 +96,16 @@ class BaseParser: @property def text(self): - """The text content of the element.""" + """The text content of the :class:`Element <Element>` or :class:`HTML <HTML>`.""" return self.pq.text() @property def full_text(self): - """The full text content (including links) of the element.""" + """The full text content (including links) of the :class:`Element <Element>` or :class:`HTML <HTML>`.""" return self.lxml.text_content() def find(self, selector, first=False, _encoding=None): - """Given a jQuery selector, returns a list of element objects.""" + """Given a jQuery selector, returns a list of :class:`Element <Element>` objects.""" def gen(): for found in self.pq(selector): yield Element(element=found, url=self.url, default_encoding=_encoding or self.encoding) @@ -113,7 +121,7 @@ class BaseParser: return c def xpath(self, selector, first=False, _encoding=None): - """Given an XPath selector, returns a list of element objects.""" + """Given an XPath selector, returns a list of :class:`Element <Element>` objects.""" c = [Element(element=e, url=self.url, default_encoding=_encoding or self.encoding) for e in self.lxml.xpath(selector)] if first: try: @@ -124,11 
+132,11 @@ class BaseParser: return c def search(self, template): - 
"""Searches the element for the given parse template.""" + """Searches the :class:`Element <Element>` for the given parse template.""" return parse_search(template, self.html) def search_all(self, template): - """Searches the element (multiple times) for the given parse + """Searches the :class:`Element <Element>` (multiple times) for the given parse template. """ return [r for r in findall(template, self.html)] @@ -172,7 +180,7 @@ class BaseParser: @property def base_url(self): - """The base URL for the page.""" + """The base URL for the page. Supports the ``<base>`` tag.""" # Support for <base> tag. base = self.find('base', first=True) @@ -203,7 +211,7 @@ class Element(BaseParser): @property def attrs(self): - """Returns a dictionary of the attributes of the element.""" + """Returns a dictionary of the attributes of the :class:`Element <Element>`.""" attrs = {k: self.pq.attr[k].strip() for k in self.element.keys()} # Split class up, as there are ussually many of them: @@ -214,7 +222,7 @@ class Element(BaseParser): class HTML(BaseParser): - """An HTML document.""" + """An HTML document, ready for parsing.""" def __init__(self, *, url, html, default_encoding=DEFAULT_ENCODING): super(HTML, self).__init__( @@ -272,7 +280,8 @@ class HTMLSession(requests.Session): class BrowserHTMLSession(HTMLSession): - """A web-browser interpreted session (for JavaScript).""" + """A web-browser interpreted session (for JavaScript), powered by + PyQt5's QWebEngineView.""" def __init__(self, *args, **kwargs): super(BrowserHTMLSession, self).__init__(*args, **kwargs)