# test_twarc2.py

import os
import pytz
import twarc
import dotenv
import pytest
import logging
import pathlib
import datetime
import threading

from unittest import TestCase
from twarc.version import version, user_agent

# Load settings via python-dotenv before the credentials are read from the
# environment below (presumably from a local .env file -- confirm against the
# python-dotenv documentation).
dotenv.load_dotenv()
# Each credential is None when the corresponding environment variable is unset.
consumer_key = os.environ.get("CONSUMER_KEY")
consumer_secret = os.environ.get("CONSUMER_SECRET")
bearer_token = os.environ.get("BEARER_TOKEN")
access_token = os.environ.get("ACCESS_TOKEN")
access_token_secret = os.environ.get("ACCESS_TOKEN_SECRET")

# Relative path to the test data directory.
test_data = pathlib.Path("test-data")
# Log records at INFO level and above are written to test.log.
logging.basicConfig(filename="test.log", level=logging.INFO)

# Implicitly test the constructor in application auth mode. This ensures that
# the tests don't depend on test ordering, and allows using the pytest
# functionality to only run a single test at a time.

# Shared client used by most tests in this module.
T = twarc.Twarc2(
    consumer_key=consumer_key,
    consumer_secret=consumer_secret,
)


def test_version():
    """
    The version declared by setup.py must equal twarc's own version, and the
    user agent string must advertise that version.
    """
    # setup is imported inside the test rather than at module level
    import setup

    packaged_version = setup.version
    assert packaged_version == version

    expected_fragment = f"twarc/{version}"
    assert user_agent
    assert expected_fragment in user_agent


def test_auth_types_interaction():
    """
    Check that each supported combination of credentials produces the expected
    auth type and is able to perform a user lookup.
    """

    def lookup_then_close(tw):
        # every page returned by the lookup must carry data; afterwards the
        # underlying client is closed
        for page in tw.user_lookup(range(1, 101)):
            assert page["data"]
        tw.client.close()

    # 1. bearer_token auth -> app auth
    client = twarc.Twarc2(bearer_token=bearer_token)
    assert client.auth_type == "application"
    lookup_then_close(client)

    # 2. consumer_keys
    client = twarc.Twarc2(
        consumer_key=consumer_key,
        consumer_secret=consumer_secret,
    )
    assert client.auth_type == "application"
    lookup_then_close(client)

    # 3. Full user auth
    client = twarc.Twarc2(
        access_token=access_token,
        access_token_secret=access_token_secret,
        consumer_key=consumer_key,
        consumer_secret=consumer_secret,
    )
    assert client.auth_type == "user"
    lookup_then_close(client)

    # the sample stream is expected to reject a user-auth client
    with pytest.raises(twarc.client2.InvalidAuthType):
        client.sample()


def test_sample():
    """
    Read from the sample stream, check each result, and stop the stream via
    the event once more than ten results have been seen.
    """
    # setting this event tells the sample stream to close
    stop = threading.Event()

    for count, result in enumerate(T.sample(event=stop)):
        data = result["data"]
        assert int(data["id"])

        # users are passed by reference and included as includes
        matching_authors = pick_id(data["author_id"], result["includes"]["users"])
        assert len(matching_authors) == 1

        if count > 10:
            # close the sample
            stop.set()

    # the stream should have ended right after the event was set
    assert count == 11


@pytest.mark.parametrize("sort_order", ["recency", "relevancy"])
def test_search_recent(sort_order):
    """
    Read the first two pages of a recent search for each sort order and check
    the combined number of tweets.
    """
    found_tweets = 0

    results = T.search_recent("politics", sort_order=sort_order)
    for page_number, page in enumerate(results, start=1):
        found_tweets += len(page["data"])

        # two pages are enough
        if page_number == 2:
            break

    assert 100 <= found_tweets <= 200


def test_counts_recent():
    """
    Only the first page of daily counts is inspected; it should hold seven or
    eight entries.
    """
    pages = T.counts_recent("twitter is:verified", granularity="day")
    first_page = next(iter(pages), None)

    # no page at all counts as zero entries
    found_counts = 0 if first_page is None else len(first_page["data"])

    assert 7 <= found_counts <= 8


@pytest.mark.skipif(
    os.environ.get("SKIP_ACADEMIC_PRODUCT_TRACK") is not None,
    reason="No Academic Research Product Track access",
)
def test_counts_empty_page():
    """
    Sum the number of daily count entries over every page returned for the
    period 2006-03-21 to 2006-06-01, which spans 72 days.
    """
    pages = T.counts_all(
        "beans",
        start_time=datetime.datetime(2006, 3, 21),
        end_time=datetime.datetime(2006, 6, 1),
        granularity="day",
    )
    found_counts = sum(len(page["data"]) for page in pages)

    assert found_counts == 72


def test_search_times():
    """
    Search a one second window that ended a minute ago and check that every
    returned tweet was created inside that window.
    """
    melbourne = pytz.timezone("Australia/Melbourne")
    # twitter api doesn't resolve microseconds so strip them for comparison
    now = datetime.datetime.now(tz=melbourne).replace(microsecond=0)
    window_end = now - datetime.timedelta(seconds=60)
    window_start = now - datetime.timedelta(seconds=61)

    seen_any = False
    pages = T.search_recent("tweet", start_time=window_start, end_time=window_end)
    for page in pages:
        for tweet in page["data"]:
            seen_any = True
            # created_at carries a trailing Z; parse it and mark it as utc
            naive = datetime.datetime.fromisoformat(tweet["created_at"].strip("Z"))
            created = naive.replace(tzinfo=datetime.timezone.utc)
            assert created >= window_start
            assert created <= window_end

    assert seen_any


def test_user_ids_lookup():
    """
    Look up user ids 1 to 999 and check that every id is accounted for, either
    as a returned profile or as a user-level error.
    """
    users_found = 0
    users_not_found = 0

    for page in T.user_lookup(range(1, 1000)):
        users_found += len(page["data"])

        # Note that errors includes lookup of contained entities within a
        # tweet, so a pinned tweet that doesn't exist anymore results in an
        # additional error entry, even if the profile is present. Only errors
        # about the user itself are counted.
        users_not_found += sum(
            1 for error in page["errors"] if error["resource_type"] == "user"
        )

    assert users_found >= 1
    assert users_found + users_not_found == 999


def test_usernames_lookup():
    """Looking up three usernames should return exactly three profiles."""
    usernames = ["jack", "barackobama", "rihanna"]
    pages = T.user_lookup(usernames, usernames=True)
    users_found = sum(len(page["data"]) for page in pages)
    assert users_found == 3


def test_tweet_lookup():
    """
    Look up tweet ids 1000 to 1999 and check that every id is accounted for,
    either as a returned tweet or as a tweet-level error.
    """
    tweets_found = 0
    tweets_not_found = 0

    for page in T.tweet_lookup(range(1000, 2000)):
        tweets_found += len(page["data"])

        # Errors may also describe entities contained within a tweet, so only
        # the errors about the tweet itself are counted here.
        tweets_not_found += sum(
            1 for error in page["errors"] if error["resource_type"] == "tweet"
        )

    assert tweets_found >= 1
    assert tweets_found + tweets_not_found == 1000


# Alas, fetching the stream under GitHub Actions yields an HTTP 400 error.
# Maybe this will go away, since it used to work fine.


@pytest.mark.skipif(
    os.environ.get("GITHUB_ACTIONS") is not None,
    reason="stream() seems to throw a 400 error under GitHub Actions?!",
)
def test_stream():
    """
    Exercise the filtered stream end to end: clear existing rules, add two
    rules, read results that match them, then delete the rules again.
    """
    # start from a clean slate by removing any active stream rules
    existing = T.get_stream_rules()
    if "data" in existing and len(existing["data"]) > 0:
        T.delete_stream_rule_ids([rule["id"] for rule in existing["data"]])

    # make sure they are empty
    assert "data" not in T.get_stream_rules()

    # add two rules
    added = T.add_stream_rules(
        [{"value": "hey", "tag": "twarc-test"}, {"value": "joe", "tag": "twarc-test"}]
    )
    assert len(added["data"]) == 2

    # make sure they are there
    active_rules = T.get_stream_rules()["data"]
    assert len(active_rules) == 2

    # these properties should be set on both rules
    for rule in active_rules:
        assert rule["id"]
        assert rule["tag"] == "twarc-test"

    # the order of the values is not guaranteed
    values = [rule["value"] for rule in active_rules]
    assert "hey" in values
    assert "joe" in values

    # collect some data, asking the stream to stop once enough has been seen
    stop = threading.Event()
    for count, result in enumerate(T.stream(event=stop)):
        tweet = result["data"]
        assert tweet["id"]
        assert tweet["text"]
        assert len(result["matching_rules"]) > 0
        for matched in result["matching_rules"]:
            assert matched["id"]
            assert matched["tag"] == "twarc-test"
        if count > 25:
            stop.set()
    assert count > 25

    # delete the rules
    T.delete_stream_rule_ids([rule["id"] for rule in active_rules])

    # make sure they are gone
    assert "data" not in T.get_stream_rules()


def test_timeline():
    """
    Test the user timeline endpoints.

    For both the timeline and the mentions of user id 12, up to four pages are
    read and at least 200 tweets are expected in total.
    """
    # get @jack's first pages of tweets and mentions
    for endpoint in (T.timeline, T.mentions):
        found = 0
        for page_index, page in enumerate(endpoint(12)):
            found += len(page["data"])
            # index 3 is the fourth page
            if page_index == 3:
                break
        assert found >= 200


def test_timeline_username():
    """
    Test the user timeline endpoints with username.

    For both the timeline and the mentions of "jack", up to four pages are
    read and at least 200 tweets are expected in total.
    """
    for endpoint in (T.timeline, T.mentions):
        found = 0
        for page_index, page in enumerate(endpoint("jack")):
            found += len(page["data"])
            # index 3 is the fourth page
            if page_index == 3:
                break
        assert found >= 200


def test_missing_timeline():
    """A timeline request for this user id should yield no pages at all."""
    pages = list(T.timeline(1033441111677788160))
    assert len(pages) == 0


def test_follows():
    """
    Test followers and following.

    For user id 12, at most two pages are read from each endpoint and at
    least 1000 users are expected in total from each.
    """
    for endpoint in (T.following, T.followers):
        found = 0
        # Count pages from 1 so the cutoff reads naturally, and never
        # reassign the loop index: the same two page limit applies to both
        # endpoints, which also keeps the number of API requests down.
        for page_number, users in enumerate(endpoint(12), start=1):
            found += len(users["data"])
            if page_number == 2:
                break
        assert found >= 1000


def test_follows_username():
    """
    Test followers and following by username.

    For the username "jack", at most two pages are read from each endpoint
    and at least 1000 users are expected in total from each.
    """
    for endpoint in (T.following, T.followers):
        found = 0
        # Count pages from 1 so the cutoff reads naturally, and never
        # reassign the loop index: the same two page limit applies to both
        # endpoints, which also keeps the number of API requests down.
        for page_number, users in enumerate(endpoint("jack"), start=1):
            found += len(users["data"])
            if page_number == 2:
                break
        assert found >= 1000


def test_flattened():
    """
    This test uses the search API to test response flattening. It will look
    at each tweet to find evidence that all the expansions have worked. Once it
    finds them all it stops. If it has retrieved more than 10000 tweets and
    not found all of the expansions it stops and the assertions at the end
    report which ones are missing. This 10000 cutoff or the query may need to
    be adjusted based on experience.
    """
    found_geo = False
    found_in_reply_to_user = False
    found_attachments_media = False
    found_attachments_polls = False
    found_entities_mentions = False
    found_referenced_tweets = False

    count = 0

    for response in T.search_recent(
        "(vote poll has:hashtags has:mentions -is:retweet) OR (checked into has:images -is:retweet)"
    ):
        # Search api always returns a response of tweets with metadata but flatten
        # will put these in a list
        tweets = twarc.expansions.flatten(response)
        assert len(tweets) > 1

        for tweet in tweets:
            count += 1

            assert "id" in tweet
            logging.info("got search tweet #%s %s", count, tweet["id"])

            # the author is always expanded
            author_id = tweet["author_id"]
            assert "author" in tweet
            assert tweet["author"]["id"] == author_id

            if "in_reply_to_user_id" in tweet:
                assert "in_reply_to_user" in tweet
                found_in_reply_to_user = True

            if "attachments" in tweet:
                if "media_keys" in tweet["attachments"]:
                    assert "media" in tweet["attachments"]
                    assert tweet["attachments"]["media"]
                    assert tweet["attachments"]["media"][0]["width"]
                    found_attachments_media = True
                if "poll_ids" in tweet["attachments"]:
                    assert "poll" in tweet["attachments"]
                    assert tweet["attachments"]["poll"]
                    found_attachments_polls = True

            if "geo" in tweet:
                assert tweet["geo"]["place_id"]
                assert tweet["geo"]["place_id"] == tweet["geo"]["id"]
                found_geo = True

            if "entities" in tweet and "mentions" in tweet["entities"]:
                assert tweet["entities"]["mentions"][0]["username"]
                found_entities_mentions = True

            # need to ensure there are no errors because a referenced tweet
            # might be protected or deleted in which case it would not have been
            # included in the response and would not have been flattened
            if "errors" not in response and "referenced_tweets" in tweet:
                assert tweet["referenced_tweets"][0]["text"]
                found_referenced_tweets = True

        # Stop paging as soon as every expansion has been seen, or once the
        # cutoff is exceeded; otherwise the loop would walk through every
        # page the search returns.
        if (
            found_geo
            and found_in_reply_to_user
            and found_attachments_media
            and found_attachments_polls
            and found_entities_mentions
            and found_referenced_tweets
        ):
            logging.info("found all expansions!")
            break
        elif count > 10000:
            logging.info("didn't find all expansions in 10000 tweets")
            break

    assert found_geo, "found geo"
    assert found_in_reply_to_user, "found in_reply_to_user"
    assert found_attachments_media, "found media"
    assert found_attachments_polls, "found polls"
    assert found_entities_mentions, "found mentions"
    assert found_referenced_tweets, "found referenced tweets"


def test_ensure_flattened():
    """
    Check ensure_flattened against a real search response, an already
    flattened list, a single tweet, hand-built records without includes, and
    an unsupported nested list.
    """
    ensure_flattened = twarc.expansions.ensure_flattened
    page = next(T.search_recent("twitter", max_results=20))

    # flatten a response
    from_response = ensure_flattened(page)
    assert isinstance(from_response, list)
    assert len(from_response) > 1
    assert "author" in from_response[0]

    # flatten the flattened list
    from_flattened = ensure_flattened(from_response)
    assert isinstance(from_flattened, list)
    assert len(from_flattened) == len(from_response)
    assert "author" in from_flattened[0]

    # flatten a tweet object which will force it into a list
    from_tweet = ensure_flattened(from_flattened[0])
    assert isinstance(from_tweet, list)
    assert len(from_tweet) == 1

    # flatten objects without includes, in each of the four possible shapes
    bare_inputs = [
        # List of records, data is a dict:
        [{"data": {"fake": "tweet"}}],
        # 1 record, data is a dict:
        {"data": {"fake": "tweet"}},
        # List of records, data is a list:
        [{"data": [{"fake": "tweet"}]}],
        # 1 record, data is a list:
        {"data": [{"fake": "tweet"}]},
    ]
    bare_results = []
    for bare in bare_inputs:
        flattened = ensure_flattened(bare)
        assert isinstance(flattened, list)
        assert len(flattened) == 1
        bare_results.append(flattened)
    records_dict, single_dict, records_list, single_list = bare_results

    # every shape should flatten to the same tweet
    TestCase().assertDictEqual(records_dict[0], single_dict[0])
    TestCase().assertDictEqual(records_list[0], single_list[0])
    TestCase().assertDictEqual(records_dict[0], single_list[0])

    page.pop("includes")
    without_includes = ensure_flattened(page)
    assert len(without_includes) > 1
    # Flatten worked without includes, wrote empty object:
    assert "author" in without_includes[0]
    TestCase().assertDictEqual(without_includes[0]["author"], {})

    # If there's some other type of data:
    with pytest.raises(ValueError):
        ensure_flattened([[{"data": {"fake": "list_of_lists"}}]])


def test_ensure_flattened_errors():
    """
    An API response that holds nothing but errors must flatten to an empty
    list rather than to any tweets.
    """
    errors_only = {"errors": ["fake error"]}
    flattened = twarc.expansions.ensure_flattened(errors_only)
    assert flattened == []


def test_ensure_user_id():
    """
    Check that _ensure_user_id tells user IDs and screen names apart, using a
    table of inputs and the ID expected for each.
    """
    expectations = [
        # presumably IDs don't change
        ("jack", "12"),
        # should hold for all users, even if the screen name exists
        ("12", "12"),
        # this is a screen name but not an ID
        # would help to find more "stable" example?
        # should 42069 passed as int return ID or screen name?
        ("42069", "17334495"),
        # the same long ID, given as a string and as an int
        ("1033441111677788160", "1033441111677788160"),
        (1033441111677788160, "1033441111677788160"),
    ]

    for given, expected in expectations:
        assert T._ensure_user_id(given) == expected


def test_liking_users():
    """
    Page through the users who liked a tweet until more than 300 have been
    seen, checking that each page holds user objects.
    """
    # NOTE(review): nothing is asserted after the loop, so the test also
    # passes when no pages are returned at all.
    seen = 0

    # This is one of @jack's tweets about the Twitter API
    for page in T.liking_users(1460417326130421765):
        assert "data" in page
        users = page["data"]
        # These should be user objects.
        assert "description" in users[0]
        seen += len(users)
        if seen > 300:
            break


def test_retweeted_by():
    """
    Page through the users who retweeted a tweet until more than 150 have
    been seen, checking that each page holds user objects.
    """
    # NOTE(review): nothing is asserted after the loop, so the test also
    # passes when no pages are returned at all.
    seen = 0

    # This is one of @jack's tweets about the Twitter API
    for page in T.retweeted_by(1460417326130421765):
        assert "data" in page
        users = page["data"]
        # These should be user objects.
        assert "description" in users[0]
        seen += len(users)
        if seen > 150:
            break


def test_liked_tweets():
    """
    Page through the tweets liked by user id 12 until more than 300 have been
    seen, checking that each page holds tweet objects.
    """
    # NOTE(review): nothing is asserted after the loop, so the test also
    # passes when no pages are returned at all.
    seen = 0

    # What has @jack liked?
    for page in T.liked_tweets(12):
        assert "data" in page
        tweets = page["data"]
        # These should be tweet objects.
        assert "text" in tweets[0]
        seen += len(tweets)
        if seen > 300:
            break


def test_list_lookup():
    """Looking up the list by id should return its data, including its name."""
    list_id = 715919216927322112
    response = T.list_lookup(list_id)
    assert "data" in response
    list_name = response["data"]["name"]
    assert list_name == "National-parks"


def test_list_members():
    """The list's members should arrive as a single page of eight users."""
    pages = list(T.list_members(715919216927322112))
    assert len(pages) == 1
    first_page = pages[0]
    members = twarc.expansions.flatten(first_page)
    assert len(members) == 8


def test_list_followers():
    """
    The list's followers should span at least two pages, the first of which
    holds more than 50 users.
    """
    pages = list(T.list_followers(715919216927322112))
    assert len(pages) >= 2
    first_page = pages[0]
    followers = twarc.expansions.flatten(first_page)
    assert len(followers) > 50


def test_list_memberships():
    """
    The memberships of "64flavors" should arrive as a single page holding at
    least nine lists.
    """
    pages = list(T.list_memberships("64flavors"))
    assert len(pages) == 1
    first_page = pages[0]
    memberships = twarc.expansions.flatten(first_page)
    assert len(memberships) >= 9


def test_followed_lists():
    """
    The lists followed by "nasa" should arrive as a single page holding at
    least one list.
    """
    pages = list(T.followed_lists("nasa"))
    assert len(pages) == 1
    first_page = pages[0]
    followed = twarc.expansions.flatten(first_page)
    assert len(followed) >= 1


def test_owned_lists():
    """
    The lists owned by "nasa" should span at least one page, the first of
    which holds at least eleven lists.
    """
    pages = list(T.owned_lists("nasa"))
    assert len(pages) >= 1
    first_page = pages[0]
    owned = twarc.expansions.flatten(first_page)
    assert len(owned) >= 11


def test_list_tweets():
    """The first page of the list's tweets should flatten to at least 90 tweets."""
    pages = T.list_tweets(715919216927322112)
    first_page = next(pages)
    assert "data" in first_page
    flattened = twarc.expansions.flatten(first_page)
    assert len(flattened) >= 90


def test_user_lookup_non_existent():
    """_ensure_user must raise ValueError for a user that does not exist."""
    # This user does not exist
    missing_username = "noasdfasdf"
    with pytest.raises(ValueError):
        T._ensure_user(missing_username)


def test_twarc_metadata():
    """
    Check that responses carry the "__twarc" metadata key while the client's
    metadata flag is on, and do not carry it once the flag is switched off.

    The flag is changed on the shared module-level client, so it is always
    switched back on afterwards, even when an assertion fails, to keep the
    other tests independent of this one.
    """
    # With metadata (default)
    event = threading.Event()
    for i, response in enumerate(T.sample(event=event)):
        assert "__twarc" in response
        if i == 10:
            event.set()

    for response in T.tweet_lookup(range(1000, 2000)):
        assert "__twarc" in response
        assert "__twarc" in twarc.expansions.flatten(response)[0]

    # Without metadata
    T.metadata = False
    try:
        event = threading.Event()
        for i, response in enumerate(T.sample(event=event)):
            assert "__twarc" not in response
            if i == 10:
                event.set()

        for response in T.tweet_lookup(range(1000, 2000)):
            assert "__twarc" not in response
    finally:
        # restore the shared client no matter how the checks above ended
        T.metadata = True


def test_docs_requirements():
    """
    Make sure that the mkdocs requirements has everything that is in the
    twarc requirements so the readthedocs build doesn't fail.

    Both files are read relative to the current working directory and split
    on whitespace; every entry of requirements.txt must also appear in
    requirements-mkdocs.txt.
    """
    # read_text opens and closes each file, so no handle is left open
    twarc_reqs = set(pathlib.Path("requirements.txt").read_text().split())
    mkdocs_reqs = set(pathlib.Path("requirements-mkdocs.txt").read_text().split())

    # name the missing entries so a failure says what to add
    missing = twarc_reqs - mkdocs_reqs
    assert not missing, f"requirements-mkdocs.txt is missing: {sorted(missing)}"


def test_geo():
    """Smoke test: run a geo query and print whatever comes back."""
    places = T.geo(query="Silver Spring")
    print(places)


def pick_id(id, objects):
    """Return a list of every object in objects whose "id" entry equals id."""
    return [candidate for candidate in objects if candidate["id"] == id]