I've been following the webstruct tutorial and I'm getting a few peculiar errors.
From the tutorial I end up with code along these lines:
from itertools import islice
import pkg_resources
import webstruct

def token_identity(html_token):
    return {'token': html_token.token}

def token_isupper(html_token):
    return {'isupper': html_token.token.isupper()}

def parent_tag(html_token):
    return {'parent_tag': html_token.parent.tag}

def border_at_left(html_token):
    return {'border_at_left': html_token.index == 0}

DATA_DIR = pkg_resources.resource_filename('project', 'data/business_annotated')

def get_training():
    trees = webstruct.load_trees("{}/*.html".format(DATA_DIR), webstruct.WebAnnotatorLoader())
    trees = islice(trees, 0, 10)  # todo
    return trees

def tokenize_training(trees):
    html_tokenizer = webstruct.HtmlTokenizer()
    tokens, labels = html_tokenizer.tokenize(trees)
    return tokens, labels

def main():
    print('creating model...')
    model = webstruct.create_wapiti_pipeline(
        'company.wapiti',
        token_features=[token_identity, token_isupper, parent_tag, border_at_left],
        train_args='--algo l-bfgs --maxiter 50 --compact',
    )
    print('getting training data...')
    tokens, labels = tokenize_training(get_training())
    print('fitting training data...')
    model.fit(tokens, labels)
    print('starting extract...')
    ner = webstruct.NER(model)
    print(ner.extract_from_url('http://scrapinghub.com/contact'))

if __name__ == '__main__':
    main()
The first error I get is a TypeError when trying to extract something with the NER:
Traceback (most recent call last):
  File "/home/dex/projects/project/project/spiders/test.py", line 54, in <module>
    main()
  File "/home/dex/projects/project/project/spiders/test.py", line 51, in main
    print(ner.extract_from_url('http://scrapinghub.com/contact'))
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 58, in extract_from_url
    return self.extract(data)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 46, in extract
    groups = IobEncoder.group(zip(html_tokens, tags))
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/sequence_encoding.py", line 128, in group
    return list(cls.iter_group(data, strict))
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/sequence_encoding.py", line 136, in iter_group
    if iob_tag.startswith('I-') and tag != iob_tag[2:]:
TypeError: startswith first arg must be bytes or a tuple of bytes, not str
It seems like a Python 3 support issue: startswith expects a bytes prefix but gets a string?
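I can reproduce what looks like the same mismatch in isolation on Python 3.6: the IOB tags coming back appear to be bytes, while the prefix passed to startswith is a str (the tag value below is just an illustration, not one of my real tags):

>>> b'I-ORG'.startswith('I-')          # bytes tag, str prefix -> same TypeError as above
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
TypeError: startswith first arg must be bytes or a tuple of bytes, not str
>>> b'I-ORG'.decode('utf-8').startswith('I-')   # decoding the tag first avoids it
True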
The second error occurs when trying to build a NER straight from the model without fitting it first:
def main():
    print('creating model...')
    model = webstruct.create_wapiti_pipeline(
        'company.wapiti',
        token_features=[token_identity, token_isupper, parent_tag, border_at_left],
        train_args='--algo l-bfgs --maxiter 50 --compact',
    )
    # print('getting training data...')
    # tokens, labels = tokenize_training(get_training())
    # print('fitting training data...')
    # model.fit(tokens, labels)
    # print('starting extract...')
    ner = webstruct.NER(model)
    print(ner.extract_from_url('http://scrapinghub.com/contact'))
This results in:
Traceback (most recent call last):
  File "/home/dex/projects/project/project/spiders/test.py", line 53, in <module>
    main()
  File "/home/dex/projects/project/project/spiders/test.py", line 50, in main
    print(ner.extract_from_url('http://scrapinghub.com/contact'))
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 58, in extract_from_url
    return self.extract(data)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 45, in extract
    html_tokens, tags = self.extract_raw(bytes_data)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 67, in extract_raw
    tags = self.model.predict([html_tokens])[0]
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/sklearn/utils/metaestimators.py", line 54, in <lambda>
    out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/sklearn/pipeline.py", line 327, in predict
    return self.steps[-1][-1].predict(Xt)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 211, in predict
    sequences = self._to_wapiti_sequences(X)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 230, in _to_wapiti_sequences
    X = self.feature_encoder.transform(X)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 313, in transform
    return [self.transform_single(feature_dicts) for feature_dicts in X]
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 313, in <listcomp>
    return [self.transform_single(feature_dicts) for feature_dicts in X]
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 308, in transform_single
    line = ' '.join(_tostr(dct.get(key)) for key in self.feature_names_)
TypeError: 'NoneType' object is not iterable
These errors seem very vague and I don't even know where to start debugging them. Am I missing something?
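For context, what I'm ultimately trying to do is fit the pipeline once and then reuse it on later runs without re-fitting. Roughly something like this (just a sketch of my intent; I'm assuming a fitted pipeline can be persisted with joblib, and the file name is made up):

from sklearn.externals import joblib  # plain `import joblib` on newer setups
import webstruct

# First run: train and save the fitted pipeline (hypothetical file name).
# model.fit(tokens, labels)
# joblib.dump(model, 'company_pipeline.joblib')

# Later runs: load the already-fitted pipeline instead of rebuilding an
# unfitted one, so attributes set during fit (e.g. the feature encoder's
# feature_names_ from the traceback above) are already populated.
model = joblib.load('company_pipeline.joblib')
ner = webstruct.NER(model)
print(ner.extract_from_url('http://scrapinghub.com/contact'))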
I'm running:
webstruct - 0.5
scikit-learn - 0.18.2
scipy - 0.19
libwapiti - 0.2.1