diff --git a/twitterscraper/query.py b/twitterscraper/query.py
index d8c5ffd..bb3a3c6 100644
--- a/twitterscraper/query.py
+++ b/twitterscraper/query.py
@@ -2,14 +2,13 @@
 import random
 import requests
 import datetime as dt
-import json
+import ujson
 
 from functools import partial
 from multiprocessing.pool import Pool
 
 from fake_useragent import UserAgent
 from twitterscraper.tweet import Tweet
 
-
 ua = UserAgent()
 HEADERS_LIST = [ua.chrome, ua.google, ua['google chrome'], ua.firefox, ua.ff]
@@ -31,11 +30,12 @@ def query_single_page(url, html_response=True, retry=10):
     headers = {'User-Agent': random.choice(HEADERS_LIST)}
 
     try:
-        response = requests.get(url, headers=headers)
+        sess = requests.Session()
+        response = sess.get(url, headers=headers)
         if html_response:
             html = response.text
         else:
-            json_resp = response.json()
+            json_resp = ujson.loads(response.text)
             html = json_resp['items_html']
 
         tweets = list(Tweet.from_html(html))
@@ -56,7 +56,7 @@ def query_single_page(url, html_response=True, retry=10):
     except requests.exceptions.Timeout as e:
         logging.exception('TimeOut {} while requesting "{}"'.format(
             e, url))
-    except json.decoder.JSONDecodeError as e:
+    except ValueError as e:
         logging.exception('Failed to parse JSON "{}" while requesting "{}".'.format(
             e, url))
 
@@ -145,11 +145,7 @@ def query_tweets(query, limit=None, begindate=dt.date(2017,1,1), enddate=dt.date
     stepsize = roundup(no_days, poolsize)
     dateranges = [begindate + dt.timedelta(days=elem) for elem in range(0,no_days,stepsize)]
     dateranges.append(enddate)
-
-    if limit:
-        limit_per_pool = roundup(limit, poolsize)
-    else:
-        limit_per_pool = None
+    limit_per_pool = limit
 
     queries = ['{} since:{} until:{}'.format(query, since, until)
                for since, until in zip(dateranges[:-1], dateranges[1:])]
@@ -170,5 +166,4 @@ def query_tweets(query, limit=None, begindate=dt.date(2017,1,1), enddate=dt.date
         pool.close()
         pool.join()
 
     return all_tweets
-
diff --git a/twitterscraper/tweet.py b/twitterscraper/tweet.py
index cad43fa..05d09c5 100644
--- a/twitterscraper/tweet.py
+++ b/twitterscraper/tweet.py
@@ -4,9 +4,9 @@
 from coala_utils.decorators import generate_ordering
 
 
-@generate_ordering('timestamp', 'id', 'text', 'user', 'replies', 'retweets', 'likes')
+@generate_ordering('timestamp', 'id', 'text', 'user', 'replies', 'reply_to_id', 'retweets', 'likes')
 class Tweet:
-    def __init__(self, user, fullname, id, url, timestamp, text, replies, retweets, likes, html):
+    def __init__(self, user, fullname, id, url, timestamp, text, reply_to_id, reply_to_user, replies, retweets, likes, html):
         self.user = user
         self.fullname = fullname
         self.id = id
@@ -17,6 +17,8 @@ def __init__(self, user, fullname, id, url, timestamp, text, replies, retweets,
         self.retweets = retweets
         self.likes = likes
         self.html = html
+        self.reply_to_id = reply_to_id if reply_to_id != id else '0'
+        self.reply_to_user = reply_to_user
 
     @classmethod
     def from_soup(cls, tweet):
@@ -38,7 +40,9 @@ def from_soup(cls, tweet):
             'span', 'ProfileTweet-action--favorite u-hiddenVisually').find(
             'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0',
             html=str(tweet.find('p', 'tweet-text')) or "",
-        )
+            reply_to_id=tweet.find('div', 'tweet').get('data-conversation-id', '0'),
+            reply_to_user=tweet.find('div', 'tweet').get('data-mentions', ''),
+        )
 
     @classmethod
     def from_html(cls, html):
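
Usage sketch (illustrative, not part of the patch): a minimal example of consuming the new reply fields, assuming the patched package is importable as twitterscraper; the query string, limit, and dates are made-up values.

    import datetime as dt
    from twitterscraper.query import query_tweets

    # Note: with limit_per_pool = limit above, each pool worker now receives
    # the full limit, so more than `limit` tweets may be returned in total.
    tweets = query_tweets('python', limit=100,
                          begindate=dt.date(2017, 1, 1),
                          enddate=dt.date(2017, 6, 1))

    for t in tweets:
        # __init__ normalises reply_to_id to '0' when data-conversation-id
        # equals the tweet's own id, i.e. the tweet starts its conversation.
        if t.reply_to_id != '0':
            print('{} is a reply to {} in conversation {}'.format(
                t.id, t.reply_to_user or '(unknown)', t.reply_to_id))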