Commit

Issue Resolved: Add better progress tracking waldo-vision#22
swastikom committed May 27, 2023
1 parent f6f7923 commit 2467a37
Showing 1 changed file with 33 additions and 14 deletions.
47 changes: 33 additions & 14 deletions utils/link_retrieval.py
@@ -1,4 +1,5 @@
 import argparse
+from tqdm import tqdm
 import os
 import pandas as pd
 import requests
@@ -7,19 +8,23 @@
 from pathlib import Path
 
 # Set up command line arguments
-parser = argparse.ArgumentParser(description="Get URL's from API and store them locally")
+parser = argparse.ArgumentParser(
+    description="Get URL's from API and store them locally")
 parser.add_argument("-e", "--endpoint", help='Target URL on server', required=False,
                     default="https://waldo.vision/api/analysis/urls", type=str)
 parser.add_argument("-k", "--key", help='API Key', required=False,
                     default=os.environ.get("WALDO_API_KEY"), type=str)
 parser.add_argument("-i", "--id", help='API Key ID', required=False, type=str,
                     default=os.environ.get("WALDO_API_ID"))
-parser.add_argument("-o", "--output", help='Folder to store output', required=True, type=str)
-parser.add_argument('--minreviews', help='Minimum number of reviews', required=True, type=int)
+parser.add_argument(
+    "-o", "--output", help='Folder to store output', required=True, type=str)
+parser.add_argument(
+    '--minreviews', help='Minimum number of reviews', required=True, type=int)
 parser.add_argument('--rating', help='Minimum rating', required=True, type=int)
 
 args = vars(parser.parse_args())
 
+
 def parse_data(data):
     """
     Convert the data to a pandas DataFrame and validate the URLs.
@@ -34,8 +39,10 @@ def parse_data(data):
 
         for obj in response_list:
             obj_dataframe = pd.DataFrame(obj, index=[0])
-            obj_dataframe.rename(columns={"id": "id", "ytUrl": "url", "game": "game"}, inplace=True)
-            response_dataframe = pd.concat([response_dataframe, obj_dataframe], ignore_index=True)
+            obj_dataframe.rename(
+                columns={"id": "id", "ytUrl": "url", "game": "game"}, inplace=True)
+            response_dataframe = pd.concat(
+                [response_dataframe, obj_dataframe], ignore_index=True)
 
         # Validate the URLs
         for row in response_dataframe['url']:
@@ -47,6 +54,7 @@ def parse_data(data):
         print(f"Error while parsing data: {e}")
         return pd.DataFrame(columns=['id', 'url', 'game'])
 
+
 def main():
     """
     Pull URLs from the API that meet the criteria specified in the requirements argument.
@@ -63,18 +71,25 @@ def main():
 
     # Make the API request and retrieve the data
     try:
-        response = requests.get(endpoint, params=params, headers=headers, timeout=10)
+        response = requests.get(endpoint, params=params,
+                                headers=headers, timeout=10)
         data = response.json()
         print(data)
         total_pages = data["totalPages"]
 
         valid_urls = pd.DataFrame(columns=['id', 'url', 'game'])
-        for page in range(0, total_pages + 1):  # Query all pages sequentially
-            params["page"] = page  # Update page number
-            print(f"Requesting page {page}")
-            response = requests.get(endpoint, params=params, headers=headers, timeout=10)
-            data = response.json()
-            valid_urls = pd.concat([valid_urls, parse_data(data)], ignore_index=True)
+
+        with tqdm(total=total_pages + 1, desc="Progress", unit="page") as pbar:
+            for page in range(0, total_pages + 1):  # Query all pages sequentially
+                params["page"] = page  # Update page number
+                pbar.set_postfix(page=page)
+                pbar.update(1)
+
+                response = requests.get(
+                    endpoint, params=params, headers=headers, timeout=10)
+                data = response.json()
+                valid_urls = pd.concat(
+                    [valid_urls, parse_data(data)], ignore_index=True)
 
         # Filter out duplicate links
         valid_urls.drop_duplicates(subset=["url"], inplace=True)
@@ -85,7 +100,9 @@ def main():
 
         # Save the downloaded links to a file
         valid_urls_df = pd.DataFrame(valid_urls)
-        valid_urls_df.to_csv(os.path.join(Path(download_dir), "links.csv"), index=True, columns=["id", "url", "game"])
+        valid_urls_df.to_csv(os.path.join(
+            Path(download_dir), "links.csv"), index=True, columns=["id", "url", "game"])
+
     except requests.exceptions.Timeout as timeout_error:
         print(f"Request timed out: {timeout_error}")
     except requests.exceptions.TooManyRedirects as redirect_error:
@@ -94,5 +111,7 @@ def main():
         print(f"Request failed: {request_error}")
     except Exception as e:
         print(f"An error occurred: {e}")
+
+
 if __name__ == "__main__":
-    main()
+    main()

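For context on the change above: the commit swaps the per-page print() calls for a tqdm progress bar wrapped around the paginated request loop. Below is a minimal standalone sketch of that pattern, not the script itself; the fixed page count and the sleep are placeholders standing in for the "totalPages" value and the per-page requests.get(...) call.

import time
from tqdm import tqdm

TOTAL_PAGES = 5  # placeholder; link_retrieval.py derives this from the API's "totalPages" field

with tqdm(total=TOTAL_PAGES, desc="Progress", unit="page") as pbar:
    for page in range(TOTAL_PAGES):
        time.sleep(0.2)               # stand-in for fetching and parsing one page
        pbar.set_postfix(page=page)   # show the current page number next to the bar
        pbar.update(1)                # advance the bar by one completed page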