scrapy/tests/test_utils_gz.py

import unittest
from os.path import join

from w3lib.encoding import html_to_unicode

from scrapy.utils.gz import gunzip, is_gzipped
from scrapy.http import Response, Headers
from tests import tests_datadir

# Fixture directory: the 'compressed' folder under the tests data dir, from
# which the tests in this module open their sample files.
SAMPLEDIR = join(tests_datadir, 'compressed')


class GunzipTest(unittest.TestCase):
    """Tests for ``gunzip`` and ``is_gzipped`` from ``scrapy.utils.gz``.

    The ``gunzip`` tests feed it sample files read from ``SAMPLEDIR``; the
    ``is_gzipped`` tests build ``Response`` objects with various
    ``Content-Type`` headers.
    """

    def test_gunzip_basic(self):
        """A regular gzip sample decompresses to data of the expected length."""
        with open(join(SAMPLEDIR, 'feed-sample1.xml.gz'), 'rb') as f:
            text = gunzip(f.read())
        self.assertEqual(len(text), 9950)

    def test_gunzip_truncated(self):
        """The 'truncated-crc-error' sample still yields data from gunzip."""
        with open(join(SAMPLEDIR, 'truncated-crc-error.gz'), 'rb') as f:
            text = gunzip(f.read())
        # The expected tail is '</html' without the closing '>' (unlike the
        # "short" variant below) -- presumably where this fixture's data ends.
        # assertTrue instead of a bare assert so the check survives `python -O`
        # and matches the other tests in this class.
        self.assertTrue(text.endswith(b'</html'))

    def test_gunzip_no_gzip_file_raises(self):
        """Passing the plain (non-.gz) XML sample to gunzip raises IOError."""
        with open(join(SAMPLEDIR, 'feed-sample1.xml'), 'rb') as f:
            data = f.read()
        self.assertRaises(IOError, gunzip, data)

    def test_gunzip_truncated_short(self):
        """The 'truncated-crc-error-short' sample decompresses up to '</html>'."""
        with open(join(SAMPLEDIR, 'truncated-crc-error-short.gz'), 'rb') as f:
            text = gunzip(f.read())
        # assertTrue instead of a bare assert so the check survives `python -O`.
        self.assertTrue(text.endswith(b'</html>'))

    def test_is_x_gzipped_right(self):
        """``application/x-gzip`` is reported as gzipped."""
        hdrs = Headers({"Content-Type": "application/x-gzip"})
        r1 = Response("http://www.example.com", headers=hdrs)
        self.assertTrue(is_gzipped(r1))

    def test_is_gzipped_right(self):
        """``application/gzip`` is reported as gzipped."""
        hdrs = Headers({"Content-Type": "application/gzip"})
        r1 = Response("http://www.example.com", headers=hdrs)
        self.assertTrue(is_gzipped(r1))

    def test_is_gzipped_not_quite(self):
        """A content type that merely starts with the gzip one is rejected."""
        hdrs = Headers({"Content-Type": "application/gzippppp"})
        r1 = Response("http://www.example.com", headers=hdrs)
        self.assertFalse(is_gzipped(r1))

    def test_is_gzipped_case_insensitive(self):
        """Mixed-case content types are still reported as gzipped."""
        hdrs = Headers({"Content-Type": "Application/X-Gzip"})
        r1 = Response("http://www.example.com", headers=hdrs)
        self.assertTrue(is_gzipped(r1))

        # Same check with upper-case subtype, spaces around ';' and a
        # charset parameter.
        hdrs = Headers({"Content-Type": "application/X-GZIP ; charset=utf-8"})
        r1 = Response("http://www.example.com", headers=hdrs)
        self.assertTrue(is_gzipped(r1))

    def test_is_gzipped_empty(self):
        """A response built without any headers is not reported as gzipped."""
        r1 = Response("http://www.example.com")
        self.assertFalse(is_gzipped(r1))

    def test_is_gzipped_wrong(self):
        """An unrelated content type is not reported as gzipped."""
        hdrs = Headers({"Content-Type": "application/javascript"})
        r1 = Response("http://www.example.com", headers=hdrs)
        self.assertFalse(is_gzipped(r1))

    def test_is_gzipped_with_charset(self):
        """A trailing ``;charset=...`` parameter does not defeat detection."""
        hdrs = Headers({"Content-Type": "application/x-gzip;charset=utf-8"})
        r1 = Response("http://www.example.com", headers=hdrs)
        self.assertTrue(is_gzipped(r1))

    def test_gunzip_illegal_eof(self):
        """The 'unexpected-eof' sample decompresses to the stored expected text."""
        with open(join(SAMPLEDIR, 'unexpected-eof.gz'), 'rb') as f:
            # Element [1] of html_to_unicode's result is compared against the
            # decoded expected text below (per w3lib docs it is the body
            # converted to unicode).
            text = html_to_unicode('charset=cp1252', gunzip(f.read()))[1]
        with open(join(SAMPLEDIR, 'unexpected-eof-output.txt'), 'rb') as o:
            expected_text = o.read().decode("utf-8")
        # Length is compared first, then the full content.
        self.assertEqual(len(text), len(expected_text))
        self.assertEqual(text, expected_text)
fixed bug handling truncated gzipped responses. closes #319 2011-06-06 18:25:14 -03:00			`import unittest`
			`from os.path import join`

Modifying existing gzip read failure recovery mechanism to patch read for broken archives 2016-04-06 08:47:06 +05:30			`from w3lib.encoding import html_to_unicode`

Test to show bug with is_gzipped and Content-Type: application/gzip;charset. 2016-06-12 01:38:01 +02:00			`from scrapy.utils.gz import gunzip, is_gzipped`
			`from scrapy.http import Response, Headers`
PY3: port scrapy/utils/gz.py 2014-08-01 02:25:33 -03:00			`from tests import tests_datadir`
fixed bug handling truncated gzipped responses. closes #319 2011-06-06 18:25:14 -03:00
			`SAMPLEDIR = join(tests_datadir, 'compressed')`

PY3: port scrapy/utils/gz.py 2014-08-01 02:25:33 -03:00
PY3 port scrapy.spiders 2015-08-28 02:12:36 +05:00			`class GunzipTest(unittest.TestCase):`
fixed bug handling truncated gzipped responses. closes #319 2011-06-06 18:25:14 -03:00
			`def test_gunzip_basic(self):`
			`with open(join(SAMPLEDIR, 'feed-sample1.xml.gz'), 'rb') as f:`
			`text = gunzip(f.read())`
			`self.assertEqual(len(text), 9950)`

			`def test_gunzip_truncated(self):`
			`with open(join(SAMPLEDIR, 'truncated-crc-error.gz'), 'rb') as f:`
			`text = gunzip(f.read())`
PY3: port scrapy/utils/gz.py 2014-08-01 02:25:33 -03:00			`assert text.endswith(b'</html')`
fixed bug handling truncated gzipped responses. closes #319 2011-06-06 18:25:14 -03:00
			`def test_gunzip_no_gzip_file_raises(self):`
			`with open(join(SAMPLEDIR, 'feed-sample1.xml'), 'rb') as f:`
			`self.assertRaises(IOError, gunzip, f.read())`

httpcompression middleware improvement 2012-10-10 12:21:59 +03:00			`def test_gunzip_truncated_short(self):`
			`with open(join(SAMPLEDIR, 'truncated-crc-error-short.gz'), 'rb') as f:`
			`text = gunzip(f.read())`
PY3: port scrapy/utils/gz.py 2014-08-01 02:25:33 -03:00			`assert text.endswith(b'</html>')`
Test to show bug with is_gzipped and Content-Type: application/gzip;charset. 2016-06-12 01:38:01 +02:00
is_gzipped: Separated tests again. 2016-06-14 21:33:51 +02:00			`def test_is_x_gzipped_right(self):`
Test to show bug with is_gzipped and Content-Type: application/gzip;charset. 2016-06-12 01:38:01 +02:00			`hdrs = Headers({"Content-Type": "application/x-gzip"})`
			`r1 = Response("http://www.example.com", headers=hdrs)`
			`self.assertTrue(is_gzipped(r1))`
is_gzipped: Separated tests again. 2016-06-14 21:33:51 +02:00
			`def test_is_gzipped_right(self):`
Separated tests based on case 2016-06-12 10:49:34 +02:00			`hdrs = Headers({"Content-Type": "application/gzip"})`
			`r1 = Response("http://www.example.com", headers=hdrs)`
			`self.assertTrue(is_gzipped(r1))`

Added new testcases suggested by @redapple. 2016-06-14 14:22:18 +02:00			`def test_is_gzipped_not_quite(self):`
			`hdrs = Headers({"Content-Type": "application/gzippppp"})`
			`r1 = Response("http://www.example.com", headers=hdrs)`
			`self.assertFalse(is_gzipped(r1))`

			`def test_is_gzipped_case_insensitive(self):`
			`hdrs = Headers({"Content-Type": "Application/X-Gzip"})`
			`r1 = Response("http://www.example.com", headers=hdrs)`
			`self.assertTrue(is_gzipped(r1))`

			`hdrs = Headers({"Content-Type": "application/X-GZIP ; charset=utf-8"})`
			`r1 = Response("http://www.example.com", headers=hdrs)`
			`self.assertTrue(is_gzipped(r1))`

Separated tests based on case 2016-06-12 10:49:34 +02:00			`def test_is_gzipped_empty(self):`
			`r1 = Response("http://www.example.com")`
Added new testcases suggested by @redapple. 2016-06-14 14:22:18 +02:00			`self.assertFalse(is_gzipped(r1))`
Separated tests based on case 2016-06-12 10:49:34 +02:00
			`def test_is_gzipped_wrong(self):`
Test to show bug with is_gzipped and Content-Type: application/gzip;charset. 2016-06-12 01:38:01 +02:00			`hdrs = Headers({"Content-Type": "application/javascript"})`
Separated tests based on case 2016-06-12 10:49:34 +02:00			`r1 = Response("http://www.example.com", headers=hdrs)`
Added new testcases suggested by @redapple. 2016-06-14 14:22:18 +02:00			`self.assertFalse(is_gzipped(r1))`
Separated tests based on case 2016-06-12 10:49:34 +02:00
			`def test_is_gzipped_with_charset(self):`
Test to show bug with is_gzipped and Content-Type: application/gzip;charset. 2016-06-12 01:38:01 +02:00			`hdrs = Headers({"Content-Type": "application/x-gzip;charset=utf-8"})`
			`r1 = Response("http://www.example.com", headers=hdrs)`
			`self.assertTrue(is_gzipped(r1))`
Modifying existing gzip read failure recovery mechanism to patch read for broken archives 2016-04-06 08:47:06 +05:30
			`def test_gunzip_illegal_eof(self):`
			`with open(join(SAMPLEDIR, 'unexpected-eof.gz'), 'rb') as f:`
			`text = html_to_unicode('charset=cp1252', gunzip(f.read()))[1]`
			`with open(join(SAMPLEDIR, 'unexpected-eof-output.txt'), 'rb') as o:`
			`expected_text = o.read().decode("utf-8")`
			`self.assertEqual(len(text), len(expected_text))`
			`self.assertEqual(text, expected_text)`