Merge pull request #222 from jcushman/require-url

Require url inputs to be explicit
pull/226/head
Gael Pasgrimaud 2021-08-06 08:52:32 +02:00 committed by GitHub
commit 0e9b337cb8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 16 additions and 17 deletions

View File

@ -1,3 +1,10 @@
2.0 (unreleased)
----------------
- Breaking change: inputs starting with ``"http://"`` or ``"https://"`` like
``PyQuery("http://example.com")`` will no longer fetch the contents of the URL.
Users desiring the old behavior should switch to ``PyQuery(url="http://example.com")``.
1.4.4 (unreleased)
------------------

View File

@ -7,17 +7,17 @@ Scraping
PyQuery is able to load an HTML document from a URL::
>>> pq(your_url)
>>> pq(url=your_url)
[<html>]
By default it uses python's urllib.
If `requests`_ is installed then it will use it. This allows you to use most of `requests`_ parameters::
>>> pq(your_url, headers={'user-agent': 'pyquery'})
>>> pq(url=your_url, headers={'user-agent': 'pyquery'})
[<html>]
>>> pq(your_url, {'q': 'foo'}, method='post', verify=True)
>>> pq(url=your_url, data={'q': 'foo'}, method='post', verify=True)
[<html>]

View File

@ -150,14 +150,6 @@ class PyQuery(list):
self._base_url = None
self.parser = kwargs.pop('parser', None)
if (len(args) >= 1 and
isinstance(args[0], str) and
args[0].split('://', 1)[0] in ('http', 'https')):
kwargs['url'] = args[0]
if len(args) >= 2:
kwargs['data'] = args[1]
args = []
if 'parent' in kwargs:
self._parent = kwargs.pop('parent')
else:

View File

@ -902,14 +902,14 @@ class TestWebScrapping(TestCase):
self.application_url = self.s.application_url.rstrip('/')
def test_get(self):
d = pq(self.application_url, {'q': 'foo'},
d = pq(url=self.application_url, data={'q': 'foo'},
method='get')
print(d)
self.assertIn('REQUEST_METHOD: GET', d('p').text())
self.assertIn('q=foo', d('p').text())
def test_post(self):
d = pq(self.application_url, {'q': 'foo'},
d = pq(url=self.application_url, data={'q': 'foo'},
method='post')
self.assertIn('REQUEST_METHOD: POST', d('p').text())
self.assertIn('q=foo', d('p').text())
@ -919,7 +919,7 @@ class TestWebScrapping(TestCase):
import requests
session = requests.Session()
session.headers.update({'X-FOO': 'bar'})
d = pq(self.application_url, {'q': 'foo'},
d = pq(url=self.application_url, data={'q': 'foo'},
method='get', session=session)
self.assertIn('HTTP_X_FOO: bar', d('p').text())
else:
@ -932,7 +932,7 @@ class TestWebScrapping(TestCase):
class TestWebScrappingEncoding(TestCase):
def test_get(self):
d = pq(u'http://ru.wikipedia.org/wiki/Заглавная_страница',
d = pq(url=u'http://ru.wikipedia.org/wiki/Заглавная_страница',
method='get')
print(d)
self.assertEqual(d('#pt-login').text(), u'Войти')
@ -950,9 +950,9 @@ class TestWebScrappingTimeouts(TestCase):
self.application_url = self.s.application_url.rstrip('/')
def test_get(self):
pq(self.application_url)
pq(url=self.application_url)
with self.assertRaises(Exception):
pq(self.application_url, timeout=1)
pq(url=self.application_url, timeout=1)
def tearDown(self):
self.s.shutdown()