RegexLinkExtractorTestCase

2025-02-23 16:24:18 +00:00 · 2014-01-11 15:12:54 +06:00 · 2014-01-11 15:12:54 +06:00 · a54e31cebc
commit a54e31cebc
parent 968141cd42
1 changed files with 19 additions and 0 deletions
--- a/scrapy/tests/test_contrib_linkextractors.py
+++ b/scrapy/tests/test_contrib_linkextractors.py
@@ -1,5 +1,6 @@
 import re
 import unittest
+from scrapy.contrib.linkextractors.regex import RegexLinkExtractor
 from scrapy.http import HtmlResponse
 from scrapy.link import Link
 from scrapy.contrib.linkextractors.htmlparser import HtmlParserLinkExtractor
@@ -312,5 +313,23 @@ class HtmlParserLinkExtractorTestCase(unittest.TestCase):
        ])


+class RegexLinkExtractorTestCase(unittest.TestCase):
+
+    def setUp(self):
+        body = get_testdata('link_extractor', 'sgml_linkextractor.html')
+        self.response = HtmlResponse(url='http://example.com/index', body=body)
+
+    def test_extraction(self):
+        # Extractor constructed with default arguments only
+        lx = RegexLinkExtractor()
+        # RegexLinkExtractor returns links in arbitrary order, so sort them
+        # by URL to get a deterministic list for the comparison below
+        self.assertEqual(sorted(lx.extract_links(self.response), key=lambda x: x.url), [
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+            Link(url='http://example.com/sample3.html', text=u'sample 3 repetition'),
+            Link(url='http://www.google.com/something', text=u''),
+        ])
+
+
 if __name__ == "__main__":
    unittest.main()