mirror of https://github.com/scrapy/scrapy.git
synced 2025-02-25 12:03:58 +00:00

removed nltk dependency from IBL code

parent 8815de94ff
commit baa523055f
@@ -8,9 +8,5 @@ Based Learning (IBL) algorithm, as described in the following papers:
 Extracting web data using instance based learning
 http://portal.acm.org/citation.cfm?id=1265174
 
-This code has some additional dependencies too:
-
-* numpy
-* nltk
-
+This code requires the numpy library.
 """
@@ -3,13 +3,13 @@ Region Extract
 
 Custom extraction for regions in a document
 """
+import re
 import operator
 import copy
 import pprint
 import cStringIO
 from itertools import groupby
 
-import nltk
 from numpy import array
 
 from scrapy.contrib.ibl.descriptor import FieldDescriptor
@@ -572,7 +572,8 @@ class TemplatePageExtractor(object):
     def __str__(self):
         return str(self.extractor)
 
-_tokenize = nltk.tokenize.WordPunctTokenizer().tokenize
+# Based on nltk's WordPunctTokenizer
+_tokenize = re.compile(r'\w+|[^\w\s]+', re.UNICODE | re.MULTILINE | re.DOTALL).findall
 
 class TextRegionDataExtractor(object):
     """Data Extractor for extracting text fragments from within a larger
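Note: nltk's WordPunctTokenizer is itself a RegexpTokenizer built on the same
pattern, \w+|[^\w\s]+, so the compiled regex above is a drop-in replacement
that does not change tokenization. A quick sanity check (not part of the
commit):

    import re

    # same pattern as the new _tokenize in the Region Extract module above
    _tokenize = re.compile(r'\w+|[^\w\s]+', re.UNICODE | re.MULTILINE | re.DOTALL).findall

    print(_tokenize(u"Price: $29.99 (incl. tax)"))
    # -> ['Price', ':', '$', '29', '.', '99', '(', 'incl', '.', 'tax', ')']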
@@ -11,17 +11,12 @@ from scrapy.contrib.ibl.descriptor import (FieldDescriptor as A,
 from scrapy.contrib.ibl.extractors import (contains_any_numbers,
     image_url)
 
-try:
-    import nltk
-except ImportError:
-    nltk = None
-
 try:
     import numpy
 except ImportError:
     numpy = None
 
-if nltk and numpy:
+if numpy:
     from scrapy.contrib.ibl.extraction import InstanceBasedLearningExtractor
 
 # simple page with all features
@@ -791,8 +786,6 @@ TEST_DATA = [
 class TestExtraction(TestCase):
 
     def setUp(self):
-        if not nltk:
-            raise SkipTest("nltk not available")
         if not numpy:
             raise SkipTest("numpy not available")
 
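Note: the try/except import guard plus a setUp-time SkipTest is the standard
pattern for letting a test suite degrade gracefully when an optional
dependency is missing. A minimal self-contained sketch (the test class and
method names here are illustrative, not part of the commit):

    from unittest import TestCase, SkipTest

    try:
        import numpy
    except ImportError:
        numpy = None  # guarded tests below get skipped instead of erroring

    class NumpyDependentTest(TestCase):

        def setUp(self):
            if not numpy:
                raise SkipTest("numpy not available")

        def test_array_roundtrip(self):
            # only runs when numpy is importable
            self.assertEqual(list(numpy.array([1, 2, 3])), [1, 2, 3])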
@@ -12,17 +12,12 @@ from scrapy.utils.py26 import json
 from scrapy.contrib.ibl.htmlpage import HtmlPage
 from scrapy.tests.test_contrib_ibl import path
 
-try:
-    import nltk
-except ImportError:
-    nltk = None
-
 try:
     import numpy
 except ImportError:
     numpy = None
 
-if nltk and numpy:
+if numpy:
     from scrapy.contrib.ibl.extraction.pageparsing import (
         InstanceLearningParser, TemplatePageParser, ExtractionPageParser)
     from scrapy.contrib.ibl.extraction.pageobjects import TokenDict, TokenType
@@ -174,8 +169,6 @@ def _tags(pp, predicate):
 class TestPageParsing(TestCase):
 
     def setUp(self):
-        if not nltk:
-            raise SkipTest("nltk not available")
         if not numpy:
             raise SkipTest("numpy not available")
 