Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

modified: twitterscraper/tweet.py #100

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 11 additions & 12 deletions twitterscraper/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@
import random
import requests
import datetime as dt
import json
import ujson
from functools import partial
from multiprocessing.pool import Pool

from fake_useragent import UserAgent
from twitterscraper.tweet import Tweet


ua = UserAgent()
HEADERS_LIST = [ua.chrome, ua.google, ua['google chrome'], ua.firefox, ua.ff]

Expand All @@ -31,11 +30,12 @@ def query_single_page(url, html_response=True, retry=10):
headers = {'User-Agent': random.choice(HEADERS_LIST)}

try:
response = requests.get(url, headers=headers)
sess = requests.Session()
response = sess.get(url, headers=headers)
if html_response:
html = response.text
else:
json_resp = response.json()
json_resp = ujson.loads(response.text)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the difference between json.loads() and ujson.loads()? If there is no clear reason for using ujson instead of json, I prefer to use json.

html = json_resp['items_html']

tweets = list(Tweet.from_html(html))
Expand All @@ -56,7 +56,7 @@ def query_single_page(url, html_response=True, retry=10):
except requests.exceptions.Timeout as e:
logging.exception('TimeOut {} while requesting "{}"'.format(
e, url))
except json.decoder.JSONDecodeError as e:
except Exception as e:
logging.exception('Failed to parse JSON "{}" while requesting "{}".'.format(
e, url))

Expand Down Expand Up @@ -145,11 +145,11 @@ def query_tweets(query, limit=None, begindate=dt.date(2017,1,1), enddate=dt.date
stepsize = roundup(no_days, poolsize)
dateranges = [begindate + dt.timedelta(days=elem) for elem in range(0,no_days,stepsize)]
dateranges.append(enddate)

if limit:
limit_per_pool = roundup(limit, poolsize)
else:
limit_per_pool = None
limit_per_pool = limit
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change will result in twitterscraper scraping approximately P*limit tweets (where P is the poolsize) instead of the given limit. Please remove this change.

# if limit:
# limit_per_pool = roundup(limit, poolsize)
# else:
# limit_per_pool = None

queries = ['{} since:{} until:{}'.format(query, since, until)
for since, until in zip(dateranges[:-1], dateranges[1:])]
Expand All @@ -170,5 +170,4 @@ def query_tweets(query, limit=None, begindate=dt.date(2017,1,1), enddate=dt.date
pool.close()
pool.join()

return all_tweets

return all_tweets
9 changes: 6 additions & 3 deletions twitterscraper/tweet.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
from coala_utils.decorators import generate_ordering


@generate_ordering('timestamp', 'id', 'text', 'user', 'replies', 'retweets', 'likes')
@generate_ordering('timestamp', 'id', 'text', 'user', 'replies', 'reply_to_id', 'retweets', 'likes')
class Tweet:
def __init__(self, user, fullname, id, url, timestamp, text, replies, retweets, likes, html):
def __init__(self, user, fullname, id, url, timestamp, text, reply_to_id, replies, retweets, likes, html):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Placing a new argument at this location breaks backward compatibility. I suggest you move it to the end of the list of arguments.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The newly implemented 'reply_to_user' is not passed to the Tweet class and hence will not appear in the output.

self.user = user
self.fullname = fullname
self.id = id
Expand All @@ -17,6 +17,7 @@ def __init__(self, user, fullname, id, url, timestamp, text, replies, retweets,
self.retweets = retweets
self.likes = likes
self.html = html
self.reply_to_id = reply_to_id if reply_to_id != id else '0'

@classmethod
def from_soup(cls, tweet):
Expand All @@ -38,7 +39,9 @@ def from_soup(cls, tweet):
'span', 'ProfileTweet-action--favorite u-hiddenVisually').find(
'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0',
html=str(tweet.find('p', 'tweet-text')) or "",
)
reply_to_id = tweet.find('div', 'tweet')['data-conversation-id'] or '0',
reply_to_user = tweet.find('div', 'tweet')['data-mentions'] or "",
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is already implemented in PR #98. It may be best to remove it here.

)

@classmethod
def from_html(cls, html):
Expand Down