From ec4f17cbb8f9c462f84cd3944d214649da458ad3 Mon Sep 17 00:00:00 2001 From: Hengruo Zhang Date: Sun, 4 Mar 2018 01:28:39 -0500 Subject: [PATCH 1/4] modified: twitterscraper/tweet.py --- twitterscraper/tweet.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/twitterscraper/tweet.py b/twitterscraper/tweet.py index cad43fa..2bb6aa1 100644 --- a/twitterscraper/tweet.py +++ b/twitterscraper/tweet.py @@ -4,9 +4,9 @@ from coala_utils.decorators import generate_ordering -@generate_ordering('timestamp', 'id', 'text', 'user', 'replies', 'retweets', 'likes') +@generate_ordering('timestamp', 'id', 'text', 'user', 'replies', 'reply_to_id', 'retweets', 'likes') class Tweet: - def __init__(self, user, fullname, id, url, timestamp, text, replies, retweets, likes, html): + def __init__(self, user, fullname, id, url, timestamp, text, reply_to_id, replies, retweets, likes, html): self.user = user self.fullname = fullname self.id = id @@ -17,6 +17,7 @@ def __init__(self, user, fullname, id, url, timestamp, text, replies, retweets, self.retweets = retweets self.likes = likes self.html = html + self.reply_to_id = reply_to_id @classmethod def from_soup(cls, tweet): @@ -38,7 +39,8 @@ def from_soup(cls, tweet): 'span', 'ProfileTweet-action--favorite u-hiddenVisually').find( 'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0', html=str(tweet.find('p', 'tweet-text')) or "", - ) + reply_to_id = tweet.findChildren()[0]['data-conversation-id'] or '0', + ) @classmethod def from_html(cls, html): From c90b031e105bccdfb05bc976ee5d9ad3241281d3 Mon Sep 17 00:00:00 2001 From: Hengruo Zhang Date: Mon, 5 Mar 2018 17:30:44 -0500 Subject: [PATCH 2/4] modified: twitterscraper/query.py --- twitterscraper/query.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/twitterscraper/query.py b/twitterscraper/query.py index d8c5ffd..76d0d7a 100644 --- a/twitterscraper/query.py +++ b/twitterscraper/query.py @@ -2,14 +2,13 @@ import random import requests import datetime as dt -import json +import ujson from functools import partial from multiprocessing.pool import Pool from fake_useragent import UserAgent from twitterscraper.tweet import Tweet - ua = UserAgent() HEADERS_LIST = [ua.chrome, ua.google, ua['google chrome'], ua.firefox, ua.ff] @@ -31,11 +30,12 @@ def query_single_page(url, html_response=True, retry=10): headers = {'User-Agent': random.choice(HEADERS_LIST)} try: - response = requests.get(url, headers=headers) + sess = requests.Session() + response = sess.get(url, headers=headers) if html_response: html = response.text else: - json_resp = response.json() + json_resp = ujson.loads(response.text) html = json_resp['items_html'] tweets = list(Tweet.from_html(html)) @@ -56,7 +56,7 @@ def query_single_page(url, html_response=True, retry=10): except requests.exceptions.Timeout as e: logging.exception('TimeOut {} while requesting "{}"'.format( e, url)) - except json.decoder.JSONDecodeError as e: + except Exception as e: logging.exception('Failed to parse JSON "{}" while requesting "{}".'.format( e, url)) @@ -170,5 +170,4 @@ def query_tweets(query, limit=None, begindate=dt.date(2017,1,1), enddate=dt.date pool.close() pool.join() - return all_tweets - + return all_tweets \ No newline at end of file From 47390ec2c7e19f62aa9ae74502e647f64e43915d Mon Sep 17 00:00:00 2001 From: hengruo Date: Mon, 5 Mar 2018 17:46:09 -0500 Subject: [PATCH 3/4] modified: twitterscraper/query.py --- twitterscraper/query.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/twitterscraper/query.py b/twitterscraper/query.py index 76d0d7a..bb3a3c6 100644 --- a/twitterscraper/query.py +++ b/twitterscraper/query.py @@ -145,11 +145,11 @@ def query_tweets(query, limit=None, begindate=dt.date(2017,1,1), enddate=dt.date stepsize = roundup(no_days, poolsize) dateranges = [begindate + dt.timedelta(days=elem) for elem in range(0,no_days,stepsize)] dateranges.append(enddate) - - if limit: - limit_per_pool = roundup(limit, poolsize) - else: - limit_per_pool = None + limit_per_pool = limit + # if limit: + # limit_per_pool = roundup(limit, poolsize) + # else: + # limit_per_pool = None queries = ['{} since:{} until:{}'.format(query, since, until) for since, until in zip(dateranges[:-1], dateranges[1:])] From 9d41825843723a6aaf6fed59ae9139d0c5dd05af Mon Sep 17 00:00:00 2001 From: hengruo Date: Thu, 8 Mar 2018 00:50:39 -0500 Subject: [PATCH 4/4] modified: twitterscraper/tweet.py --- twitterscraper/tweet.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/twitterscraper/tweet.py b/twitterscraper/tweet.py index 2bb6aa1..05d09c5 100644 --- a/twitterscraper/tweet.py +++ b/twitterscraper/tweet.py @@ -17,7 +17,7 @@ def __init__(self, user, fullname, id, url, timestamp, text, reply_to_id, replie self.retweets = retweets self.likes = likes self.html = html - self.reply_to_id = reply_to_id + self.reply_to_id = reply_to_id if reply_to_id != id else '0' @classmethod def from_soup(cls, tweet): @@ -39,7 +39,8 @@ def from_soup(cls, tweet): 'span', 'ProfileTweet-action--favorite u-hiddenVisually').find( 'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0', html=str(tweet.find('p', 'tweet-text')) or "", - reply_to_id = tweet.findChildren()[0]['data-conversation-id'] or '0', + reply_to_id = tweet.find('div', 'tweet')['data-conversation-id'] or '0', + reply_to_user = tweet.find('div', 'tweet')['data-mentions'] or "", ) @classmethod