107 changes: 64 additions & 43 deletions data/tiingo_data_fetcher.py
@@ -1,8 +1,10 @@
import os
import time

import pandas as pd
import requests
from dotenv import load_dotenv
from tqdm import tqdm

# Load the .env.local file if it exists, otherwise load .env
if os.path.exists(".env.local"):
@@ -76,56 +78,75 @@ def fetch_tiingo_stock_data(self, symbol, start_date, end_date, frequency="daily
return df

def fetch_tiingo_crypto_data(self, symbol, start_date, end_date, frequency="5min"):
"""Fetch historical cryptocurrency data from Tiingo."""
"""Fetch historical cryptocurrency data from Tiingo in 3-day batches."""

filename = self._generate_filename(symbol, start_date, end_date, frequency)

# Check if the CSV file already exists
if os.path.exists(filename):
print(f"Loading stock data from {filename}...")
print(f"Loading crypto data from {filename}...")
return pd.read_csv(filename)

# Define the URL, headers, and parameters for the request
url = f"{BASE_APIURL}/tiingo/crypto/prices"
headers = {
"Content-Type": "application/json",
"Authorization": f"Token {TIINGO_API_KEY}",
}
params = {
"tickers": symbol,
"startDate": start_date,
"endDate": end_date,
# The minimum value is "1min". Units in minutes (min), hours (hour), and days (day) are accepted.
# Format is # + (min/hour/day); e.g. "15min", "4hour" or "1day".
# If no value is provided, defaults to 5min.
"resampleFreq": frequency,
}

# Send request to Tiingo API
try:
response = requests.get(url, headers=headers, params=params, timeout=10)
response.raise_for_status() # Raise an exception for HTTP errors
except requests.exceptions.RequestException as e:
print(f"Request error: {e}")
return pd.DataFrame()

# Parse JSON response
try:
data = response.json()
if not data or "priceData" not in data[0]:
print(f"No crypto data found for {symbol}")
return pd.DataFrame()
except (ValueError, KeyError) as e:
print(f"Error parsing response data: {e}")
return pd.DataFrame()

df = self._normalize_tiingo_data(data[0]["priceData"], symbol)

# Save the fetched data to CSV
print(f"Saving crypto data to {filename}...")
df.to_csv(filename, index=False)

return df
# Convert dates to pandas datetime for easier manipulation
start = pd.to_datetime(start_date)
end = pd.to_datetime(end_date)

# Calculate total number of 3-day periods
total_days = (end - start).days
total_batches = (total_days + 2) // 3 # Round up to nearest batch
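# e.g. a 10-day range gives (10 + 2) // 3 = 4 batches, i.e. ceiling division by 3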

# Initialize empty DataFrame for all results
all_data = pd.DataFrame()

# Process in 3-day chunks with progress bar
with tqdm(total=total_batches, desc=f"Fetching {symbol} data") as pbar:
current_start = start
while current_start < end:
current_end = min(current_start + pd.Timedelta(days=3), end)
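# e.g. starting 2024-01-01, the first batch spans 2024-01-01 to 2024-01-04; the boundary
# timestamp may appear in the next batch as well and is removed by drop_duplicates below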

# Define the URL, headers, and parameters for the request
url = f"{BASE_APIURL}/tiingo/crypto/prices"
headers = {
"Content-Type": "application/json",
"Authorization": f"Token {TIINGO_API_KEY}",
}
params = {
"tickers": symbol,
"startDate": current_start.strftime("%Y-%m-%d"),
"endDate": current_end.strftime("%Y-%m-%d"),
"resampleFreq": frequency,
}

# Send request to Tiingo API
try:
response = requests.get(url, headers=headers, params=params, timeout=10)
response.raise_for_status()

data = response.json()
if data and "priceData" in data[0]:
batch_df = self._normalize_tiingo_data(data[0]["priceData"], symbol)
all_data = pd.concat([all_data, batch_df], ignore_index=True)

# Add a small delay to avoid hitting rate limits
time.sleep(0.5)

except requests.exceptions.RequestException as e:
print(f"\nRequest error for period {current_start.date()} to {current_end.date()}: {e}")
except (ValueError, KeyError) as e:
print(f"\nError parsing response data: {e}")

current_start = current_end
pbar.update(1)

# Remove any duplicate rows that might occur at batch boundaries
all_data = all_data.drop_duplicates()

if not all_data.empty:
# Save the fetched data to CSV
print(f"Saving crypto data to {filename}...")
all_data.to_csv(filename, index=False)

return all_data

def _normalize_tiingo_data(self, data, asset_name):
"""Normalize Tiingo stock data to match the required schema."""
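For context, the batched crypto fetch above can be exercised with a small driver. The following is a minimal usage sketch, not part of the PR: it assumes the class in data/tiingo_data_fetcher.py is named TiingoDataFetcher (the actual class name is not visible in this diff) and that TIINGO_API_KEY is provided via .env or .env.local.

from data.tiingo_data_fetcher import TiingoDataFetcher  # hypothetical class name, not confirmed by this diff

fetcher = TiingoDataFetcher()

# Fetches 5-minute BTC/USD bars in 3-day batches; the result is cached to CSV,
# so a repeat call with the same arguments loads from disk instead of hitting the API.
df = fetcher.fetch_tiingo_crypto_data(
    symbol="btcusd",
    start_date="2024-01-01",
    end_date="2024-01-31",
    frequency="5min",
)
print(df.head())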
27 changes: 27 additions & 0 deletions train.py
@@ -2,6 +2,7 @@
# conda activate modelmaker
# pip install setuptools==72.1.0 Cython==3.0.11 numpy==1.24.3
# pip install -r requirements.txt
import os
import sys
from datetime import datetime, timedelta

@@ -81,6 +82,32 @@ def select_data(fetcher, default_selection=None, file_path=None):

if selection == "3":
print("You selected to load data from a CSV file.")

# Add numbered directory listing functionality
sets_dir = os.path.join(os.path.dirname(__file__), "data", "sets")
if os.path.exists(sets_dir):
print("\nAvailable files in data/sets:")
csv_files = [f for f in os.listdir(sets_dir) if f.endswith('.csv')]

if not csv_files:
print("No CSV files found in data/sets directory.")
else:
for idx, file in enumerate(csv_files, 1):
print(f"{idx}. {file}")
print()

if file_path is None:
file_selection = input("Enter the number of the file or provide a custom path: ").strip()
try:
file_idx = int(file_selection)
if 1 <= file_idx <= len(csv_files):
file_path = os.path.join(sets_dir, csv_files[file_idx - 1])
else:
print_colored("Invalid file number. Using provided path as-is.", "warning")
except ValueError:
# If input is not a number, treat it as a custom path
file_path = file_selection

if file_path is None:
file_path = input("Enter the CSV file path: ").strip()
return CSVLoader.load_csv(file_path)
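The numbered file selection added to select_data combines directory listing, input parsing, and fallback prompting in one block. A standalone sketch of the same idea, using a hypothetical helper name (pick_csv) that is not part of the PR:

import os

def pick_csv(sets_dir, user_input):
    # Resolve a user entry against data/sets: a number selects from the CSV
    # listing, anything else is treated as a custom path.
    csv_files = [f for f in os.listdir(sets_dir) if f.endswith(".csv")]
    try:
        idx = int(user_input)
    except ValueError:
        return user_input  # not a number: treat it as a custom path
    if 1 <= idx <= len(csv_files):
        return os.path.join(sets_dir, csv_files[idx - 1])
    return None  # out of range: caller should prompt again

# Example: pick_csv("data/sets", "2") returns the second CSV in the listing,
# while pick_csv("data/sets", "my/own/file.csv") returns the path unchanged.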