-
Notifications
You must be signed in to change notification settings - Fork 576
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
modified: twitterscraper/tweet.py #100
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,14 +2,13 @@ | |
import random | ||
import requests | ||
import datetime as dt | ||
import json | ||
import ujson | ||
from functools import partial | ||
from multiprocessing.pool import Pool | ||
|
||
from fake_useragent import UserAgent | ||
from twitterscraper.tweet import Tweet | ||
|
||
|
||
ua = UserAgent() | ||
HEADERS_LIST = [ua.chrome, ua.google, ua['google chrome'], ua.firefox, ua.ff] | ||
|
||
|
@@ -31,11 +30,12 @@ def query_single_page(url, html_response=True, retry=10): | |
headers = {'User-Agent': random.choice(HEADERS_LIST)} | ||
|
||
try: | ||
response = requests.get(url, headers=headers) | ||
sess = requests.Session() | ||
response = sess.get(url, headers=headers) | ||
if html_response: | ||
html = response.text | ||
else: | ||
json_resp = response.json() | ||
json_resp = ujson.loads(response.text) | ||
html = json_resp['items_html'] | ||
|
||
tweets = list(Tweet.from_html(html)) | ||
|
@@ -56,7 +56,7 @@ def query_single_page(url, html_response=True, retry=10): | |
except requests.exceptions.Timeout as e: | ||
logging.exception('TimeOut {} while requesting "{}"'.format( | ||
e, url)) | ||
except json.decoder.JSONDecodeError as e: | ||
except Exception as e: | ||
logging.exception('Failed to parse JSON "{}" while requesting "{}".'.format( | ||
e, url)) | ||
|
||
|
@@ -145,11 +145,11 @@ def query_tweets(query, limit=None, begindate=dt.date(2017,1,1), enddate=dt.date | |
stepsize = roundup(no_days, poolsize) | ||
dateranges = [begindate + dt.timedelta(days=elem) for elem in range(0,no_days,stepsize)] | ||
dateranges.append(enddate) | ||
|
||
if limit: | ||
limit_per_pool = roundup(limit, poolsize) | ||
else: | ||
limit_per_pool = None | ||
limit_per_pool = limit | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This change will cause twitterscraper to scrape approximately P*limit tweets (where P is the poolsize) instead of the given limit, because every pool worker would receive the full limit rather than its share. Please remove this change. |
||
# if limit: | ||
# limit_per_pool = roundup(limit, poolsize) | ||
# else: | ||
# limit_per_pool = None | ||
|
||
queries = ['{} since:{} until:{}'.format(query, since, until) | ||
for since, until in zip(dateranges[:-1], dateranges[1:])] | ||
|
@@ -170,5 +170,4 @@ def query_tweets(query, limit=None, begindate=dt.date(2017,1,1), enddate=dt.date | |
pool.close() | ||
pool.join() | ||
|
||
return all_tweets | ||
|
||
return all_tweets |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,9 +4,9 @@ | |
from coala_utils.decorators import generate_ordering | ||
|
||
|
||
@generate_ordering('timestamp', 'id', 'text', 'user', 'replies', 'retweets', 'likes') | ||
@generate_ordering('timestamp', 'id', 'text', 'user', 'replies', 'reply_to_id', 'retweets', 'likes') | ||
class Tweet: | ||
def __init__(self, user, fullname, id, url, timestamp, text, replies, retweets, likes, html): | ||
def __init__(self, user, fullname, id, url, timestamp, text, reply_to_id, replies, retweets, likes, html): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Placing a new argument at this location breaks backward compatibility. I suggest you move it to the end of the list of arguments. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The newly implemented 'reply_to_user' is not passed to the Tweet class and hence will not appear in the output. |
||
self.user = user | ||
self.fullname = fullname | ||
self.id = id | ||
|
@@ -17,6 +17,7 @@ def __init__(self, user, fullname, id, url, timestamp, text, replies, retweets, | |
self.retweets = retweets | ||
self.likes = likes | ||
self.html = html | ||
self.reply_to_id = reply_to_id if reply_to_id != id else '0' | ||
|
||
@classmethod | ||
def from_soup(cls, tweet): | ||
|
@@ -38,7 +39,9 @@ def from_soup(cls, tweet): | |
'span', 'ProfileTweet-action--favorite u-hiddenVisually').find( | ||
'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0', | ||
html=str(tweet.find('p', 'tweet-text')) or "", | ||
) | ||
reply_to_id = tweet.find('div', 'tweet')['data-conversation-id'] or '0', | ||
reply_to_user = tweet.find('div', 'tweet')['data-mentions'] or "", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is already implemented in PR #98. Maybe it is best to remove it here. |
||
) | ||
|
||
@classmethod | ||
def from_html(cls, html): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is the difference between json.loads() and ujson.loads()? If there is no clear reason for using ujson instead of json, I prefer using json.