diff --git a/README.md b/README.md index ec647bb..7087c58 100644 --- a/README.md +++ b/README.md @@ -4,10 +4,12 @@ JobMiner is a powerful Python-based web scraping toolkit for extracting and orga ## ✨ Features +- **Enhanced CLI with TUI**: Interactive Text User Interface for user-friendly job scraping - **Modular Architecture**: Easy-to-extend scraper system with base classes - **Multiple Output Formats**: Export to JSON, CSV, or both - **Database Integration**: Optional SQLite/PostgreSQL storage with search capabilities -- **CLI Interface**: Command-line tool for easy scraping operations +- **Dual Interface**: Both traditional CLI and modern TUI for different use cases +- **Real-time Progress**: Visual progress bars and status updates during scraping - **Configuration Management**: Flexible configuration system with environment variables - **Rate Limiting**: Built-in delays and respectful scraping practices - **Error Handling**: Comprehensive logging and error recovery @@ -31,6 +33,17 @@ pip install -e . 
### Basic Usage +#### Enhanced TUI Interface (Recommended for Interactive Use) +```bash +# Launch interactive Text User Interface +python jobminer_cli.py tui + +# TUI with options +python jobminer_cli.py tui --no-mouse # Disable mouse support +python jobminer_cli.py tui --debug # Enable debug mode +``` + +#### Traditional CLI (Great for Scripting) ```bash # List available scrapers python jobminer_cli.py list-scrapers diff --git a/config.py b/config.py index a59f0fc..b203317 100644 --- a/config.py +++ b/config.py @@ -174,7 +174,7 @@ def save_config(): "delay": 2.0, "timeout": 30, "max_retries": 3, - "user_agent": null, + "user_agent": None, "headers": {} }, "database": { @@ -183,5 +183,5 @@ def save_config(): "echo": False }, "log_level": "INFO", - "log_file": null + "log_file": None } diff --git a/jobminer_cli.py b/jobminer_cli.py index d48fbb4..48060f5 100644 --- a/jobminer_cli.py +++ b/jobminer_cli.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 """ JobMiner CLI - Command line interface for running job scrapers. +Enhanced with Text User Interface (TUI) for improved user experience. """ import click @@ -62,11 +63,25 @@ def load_scraper_class(scraper_path: str): raise ValueError(f"No scraper class found in {scraper_path}") -@click.group() +@click.group(invoke_without_command=True) @click.version_option(version="1.0.0") -def cli(): - """JobMiner - A Python-based web scraping toolkit for job listings.""" - pass +@click.pass_context +def cli(ctx): + """ + JobMiner - A Python-based web scraping toolkit for job listings. + + Use 'jobminer tui' for an enhanced interactive interface, + or use the individual commands below for scripting and automation. 
+ """ + if ctx.invoked_subcommand is None: + click.echo("πŸ” JobMiner - Interactive Job Scraping Toolkit") + click.echo() + click.echo("Quick start options:") + click.echo(" jobminer_cli.py tui # Launch interactive TUI interface") + click.echo(" jobminer_cli.py list-scrapers # List available job site scrapers") + click.echo(" jobminer_cli.py scrape [scraper] [term] # Run a specific scraper") + click.echo() + click.echo("For full help: jobminer_cli.py --help") @cli.command() @@ -83,6 +98,48 @@ def list_scrapers(): click.echo(f" β€’ {name} ({path})") +@cli.command() +@click.option('--no-mouse', is_flag=True, help='Disable mouse support') +@click.option('--debug', is_flag=True, help='Enable debug mode') +def tui(no_mouse, debug): + """Launch the Enhanced Text User Interface (TUI) for interactive job scraping.""" + try: + # Import TUI components + from tui.main_app import JobMinerTUIApp + + click.echo("πŸš€ Starting JobMiner Enhanced CLI...") + click.echo("πŸ“ Use Ctrl+C to exit at any time") + click.echo("❓ Press 'h' for help once the interface loads") + click.echo() + + # Create and configure the TUI application + app = JobMinerTUIApp() + + if debug: + app.debug = True + + if no_mouse: + app.mouse_enabled = False + + # Run the application + app.run() + + except ImportError: + click.echo("❌ TUI dependencies not found. 
Please install them:") + click.echo(" pip install textual rich") + sys.exit(1) + except KeyboardInterrupt: + click.echo("\nπŸ‘‹ Thanks for using JobMiner Enhanced CLI!") + sys.exit(0) + except Exception as e: + click.echo(f"❌ Error starting TUI: {e}") + if debug: + import traceback + traceback.print_exc() + click.echo("πŸ’‘ You can still use the traditional CLI commands") + sys.exit(1) + + @cli.command() @click.argument('scraper_name') @click.argument('search_term') diff --git a/requirements.txt b/requirements.txt index f0588d3..9ed3631 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,5 @@ click==8.1.7 sqlalchemy==2.0.23 lxml==4.9.3 fake-useragent==1.4.0 +textual==0.45.1 +rich==13.7.0 diff --git a/tui/__init__.py b/tui/__init__.py new file mode 100644 index 0000000..4be4fb8 --- /dev/null +++ b/tui/__init__.py @@ -0,0 +1,9 @@ +""" +JobMiner Text User Interface (TUI) package. + +This package contains the enhanced CLI interface with rich text formatting +and interactive components built using Textual and Rich libraries. +""" + +__version__ = "1.0.0" +__author__ = "JobMiner Contributors" \ No newline at end of file diff --git a/tui/components/__init__.py b/tui/components/__init__.py new file mode 100644 index 0000000..7014013 --- /dev/null +++ b/tui/components/__init__.py @@ -0,0 +1,5 @@ +""" +TUI Components package. + +Contains reusable UI components for the JobMiner TUI interface. +""" \ No newline at end of file diff --git a/tui/main_app.py b/tui/main_app.py new file mode 100644 index 0000000..c79f56a --- /dev/null +++ b/tui/main_app.py @@ -0,0 +1,620 @@ +""" +Main JobMiner TUI Application. + +This module contains the main Textual application class that provides +an enhanced command-line interface with rich text formatting and +interactive components. 
+""" + +from textual.app import App, ComposeResult +from textual.containers import Container, Horizontal, Vertical +from textual.widgets import ( + Button, Header, Footer, Static, Label, Input, Select, + DataTable, ProgressBar, TabPane, TabbedContent, Log +) +from textual.binding import Binding +from textual.screen import Screen +from textual import events +from rich.console import Console +from rich.text import Text +from rich.panel import Panel +from rich.columns import Columns +from rich.table import Table +import asyncio +from pathlib import Path +import sys +import os + +# Add the parent directory to Python path for imports +sys.path.append(str(Path(__file__).parent.parent)) + +from jobminer_cli import discover_scrapers +from tui.utils.scraping_integration import get_scraping_manager + + +class MainMenuScreen(Screen): + """Main menu screen with navigation options.""" + + BINDINGS = [ + Binding("s", "search", "Quick Search"), + Binding("r", "results", "View Results"), + Binding("a", "analytics", "Analytics"), + Binding("c", "config", "Configuration"), + Binding("h", "help", "Help"), + Binding("q", "quit", "Quit"), + ] + + def compose(self) -> ComposeResult: + """Create the main menu layout.""" + yield Header() + yield Container( + Static(self._create_banner(), id="banner"), + Vertical( + Static(self._create_menu_options(), id="menu-options"), + Horizontal( + Button("πŸ” Quick Search", id="search-btn", variant="primary"), + Button("πŸ“Š View Results", id="results-btn"), + Button("πŸ“ˆ Analytics", id="analytics-btn"), + classes="button-row" + ), + Horizontal( + Button("πŸš€ Manage Scrapers", id="scrapers-btn"), + Button("βš™οΈ Configuration", id="config-btn"), + Button("❓ Help", id="help-btn"), + classes="button-row" + ), + Horizontal( + Button("πŸ—ƒοΈ Database Browser", id="database-btn"), + Button("πŸšͺ Exit", id="exit-btn", variant="error"), + classes="button-row" + ), + id="menu-container" + ), + id="main-content" + ) + yield Footer() + + def 
_create_banner(self) -> str: + """Create the application banner.""" + return """ +╔══════════════════════════════════════════════════════════╗ +β•‘ πŸ” JobMiner TUI β•‘ +β•‘ Enhanced Command Line Interface β•‘ +β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β• + """ + + def _create_menu_options(self) -> str: + """Create menu options description.""" + return """ +Welcome to JobMiner Enhanced CLI! Choose an option below: + +πŸ” Quick Search - Start a new job search +πŸ“Š View Results - Browse previous results +πŸ“ˆ Analytics - View job market insights +πŸš€ Manage Scrapers - Configure job site scrapers +βš™οΈ Configuration - Manage settings and database +❓ Help - Documentation and guides +πŸ—ƒοΈ Database Browser - Browse stored job data +πŸšͺ Exit - Close the application + +Use arrow keys and Enter to navigate, or press the hotkeys (s, r, a, etc.) + """ + + def on_button_pressed(self, event: Button.Pressed) -> None: + """Handle button presses.""" + if event.button.id == "search-btn": + self.action_search() + elif event.button.id == "results-btn": + self.action_results() + elif event.button.id == "analytics-btn": + self.action_analytics() + elif event.button.id == "scrapers-btn": + self.action_scrapers() + elif event.button.id == "config-btn": + self.action_config() + elif event.button.id == "help-btn": + self.action_help() + elif event.button.id == "database-btn": + self.action_database() + elif event.button.id == "exit-btn": + self.action_quit() + + def action_search(self) -> None: + """Switch to search screen.""" + self.app.push_screen("search") + + def action_results(self) -> None: + """Switch to results screen.""" + self.app.push_screen("results") + + def action_analytics(self) -> None: + """Switch to analytics screen.""" + self.app.push_screen("analytics") + + def action_scrapers(self) -> None: + """Switch to scrapers management 
screen.""" + self.app.push_screen("scrapers") + + def action_config(self) -> None: + """Switch to configuration screen.""" + self.app.push_screen("config") + + def action_help(self) -> None: + """Switch to help screen.""" + self.app.push_screen("help") + + def action_database(self) -> None: + """Switch to database browser screen.""" + self.app.push_screen("database") + + def action_quit(self) -> None: + """Quit the application.""" + self.app.exit() + + +class SearchScreen(Screen): + """Job search screen with interactive form.""" + + BINDINGS = [ + Binding("escape", "back", "Back to Menu"), + Binding("ctrl+s", "start_search", "Start Search"), + ] + + def compose(self) -> ComposeResult: + """Create the search form layout.""" + # Get available scrapers + scrapers = discover_scrapers() + scraper_options = [(name, name.title()) for name in scrapers.keys()] + + yield Header() + yield Container( + Static("πŸ” Job Search", classes="screen-title"), + Vertical( + Container( + Label("Job Title/Keywords:"), + Input(placeholder="e.g., Python Developer, Data Scientist", id="job-title"), + classes="form-group" + ), + Container( + Label("Location:"), + Input(placeholder="e.g., San Francisco, Remote, New York", id="location"), + classes="form-group" + ), + Container( + Label("Select Scrapers:"), + Select(scraper_options, id="scraper-select", allow_blank=False), + classes="form-group" + ), + Container( + Label("Number of Pages:"), + Input(placeholder="1-10", value="2", id="pages"), + classes="form-group" + ), + Horizontal( + Button("πŸš€ Start Search", id="start-search", variant="primary"), + Button("πŸ”„ Reset Form", id="reset-form"), + Button("← Back", id="back-btn"), + classes="button-row" + ), + Static("", id="search-status"), + id="search-form" + ), + id="search-content" + ) + yield Footer() + + def on_button_pressed(self, event: Button.Pressed) -> None: + """Handle button presses.""" + if event.button.id == "start-search": + self.start_search() + elif event.button.id == 
"reset-form": + self.reset_form() + elif event.button.id == "back-btn": + self.action_back() + + def start_search(self) -> None: + """Start the job search process.""" + # Get form values + job_title = self.query_one("#job-title", Input).value + location = self.query_one("#location", Input).value + scraper = self.query_one("#scraper-select", Select).value + pages = self.query_one("#pages", Input).value + + # Validate inputs + if not job_title: + self.query_one("#search-status", Static).update("❌ Please enter a job title") + return + + if not scraper: + self.query_one("#search-status", Static).update("❌ Please select a scraper") + return + + try: + pages_int = int(pages) if pages else 2 + if pages_int < 1 or pages_int > 10: + raise ValueError("Pages must be between 1 and 10") + except ValueError: + self.query_one("#search-status", Static).update("❌ Pages must be a number between 1 and 10") + return + + # Update status and start search + self.query_one("#search-status", Static).update(f"πŸ” Starting search for '{job_title}' using {scraper}...") + + # Create and push progress screen with parameters + progress_screen = ProgressScreen({"job_title": job_title, "location": location, "scraper": scraper, "pages": pages_int}) + self.app.push_screen(progress_screen) + + def reset_form(self) -> None: + """Reset all form fields.""" + self.query_one("#job-title", Input).value = "" + self.query_one("#location", Input).value = "" + self.query_one("#pages", Input).value = "2" + self.query_one("#search-status", Static).update("") + + def action_back(self) -> None: + """Go back to main menu.""" + self.app.pop_screen() + + def action_start_search(self) -> None: + """Hotkey for starting search.""" + self.start_search() + + +class ProgressScreen(Screen): + """Progress monitoring screen for active scraping.""" + + BINDINGS = [ + Binding("escape", "back", "Back"), + Binding("p", "pause", "Pause"), + Binding("s", "stop", "Stop"), + ] + + def __init__(self, search_params: dict): + 
super().__init__() + self.search_params = search_params + self.is_running = False + self.progress_value = 0 + self.scraping_manager = get_scraping_manager() + self.task_id = None + + def compose(self) -> ComposeResult: + """Create the progress monitoring layout.""" + yield Header() + yield Container( + Static("πŸ”„ Scraping in Progress", classes="screen-title"), + Vertical( + Static(f"Search: {self.search_params['job_title']}", id="search-info"), + Static(f"Location: {self.search_params.get('location', 'Any')}", id="location-info"), + Static(f"Scraper: {self.search_params['scraper']}", id="scraper-info"), + Static("", id="status-line"), + ProgressBar(total=100, id="progress-bar"), + Static("Jobs found: 0", id="job-count"), + Log(id="activity-log"), + Horizontal( + Button("⏸️ Pause", id="pause-btn"), + Button("⏹️ Stop", id="stop-btn", variant="error"), + Button("πŸ–ΌοΈ Background", id="background-btn"), + classes="button-row" + ), + id="progress-content" + ) + ) + yield Footer() + + def on_mount(self) -> None: + """Start the scraping process when screen is mounted.""" + self.start_scraping() + + def start_scraping(self) -> None: + """Start the real scraping process.""" + self.is_running = True + + # Create scraping task + self.task_id = self.scraping_manager.create_task( + scraper_name=self.search_params['scraper'], + search_term=self.search_params['job_title'], + location=self.search_params.get('location', ''), + pages=self.search_params['pages'] + ) + + self.query_one("#status-line", Static).update("🟒 Starting scraper...") + + # Start the async scraping task + asyncio.create_task(self.run_scraping_task()) + + async def run_scraping_task(self) -> None: + """Run the scraping task asynchronously.""" + try: + success = await self.scraping_manager.run_task( + self.task_id, + progress_callback=self.update_progress + ) + + if success: + task = self.scraping_manager.get_task(self.task_id) + if task and task.status == "completed": + self.query_one("#status-line", 
Static).update("βœ… Scraping completed!") + self.set_timer(2.0, self.auto_navigate_to_results) + else: + self.query_one("#status-line", Static).update("❌ Scraping failed!") + + except Exception as e: + self.query_one("#status-line", Static).update(f"❌ Error: {str(e)}") + self.is_running = False + + async def update_progress(self, task_id: str, status: str, progress: int) -> None: + """Update progress display from scraping task.""" + # Update progress bar + progress_bar = self.query_one("#progress-bar", ProgressBar) + progress_bar.progress = progress + + # Update status + self.query_one("#status-line", Static).update(status) + + # Update job count + task = self.scraping_manager.get_task(task_id) + if task: + self.query_one("#job-count", Static).update(f"Jobs found: {task.jobs_found}") + + # Add log entry + log = self.query_one("#activity-log", Log) + log.write_line(f"[{progress}%] {status}") + + # Update progress value for other methods + self.progress_value = progress + + def auto_navigate_to_results(self) -> None: + """Automatically navigate to results after completion.""" + self.app.push_screen("results") + + def on_button_pressed(self, event: Button.Pressed) -> None: + """Handle button presses.""" + if event.button.id == "pause-btn": + self.action_pause() + elif event.button.id == "stop-btn": + self.action_stop() + elif event.button.id == "background-btn": + self.action_back() + + def action_pause(self) -> None: + """Pause/resume scraping.""" + if self.task_id: + task = self.scraping_manager.get_task(self.task_id) + if task: + if task.status == "running": + self.scraping_manager.pause_task(self.task_id) + self.query_one("#pause-btn", Button).label = "▢️ Resume" + self.query_one("#status-line", Static).update("⏸️ Paused") + elif task.status == "paused": + self.scraping_manager.resume_task(self.task_id) + self.query_one("#pause-btn", Button).label = "⏸️ Pause" + self.query_one("#status-line", Static).update("🟒 Resumed") + + def action_stop(self) -> None: + """Stop 
scraping.""" + if self.task_id: + self.scraping_manager.cancel_task(self.task_id) + self.is_running = False + self.query_one("#status-line", Static).update("πŸ›‘ Stopped by user") + self.set_timer(1.0, self.action_back) + + def action_back(self) -> None: + """Go back to previous screen.""" + self.app.pop_screen() + + +class ResultsScreen(Screen): + """Results viewing screen with job listings.""" + + BINDINGS = [ + Binding("escape", "back", "Back"), + Binding("r", "refresh", "Refresh"), + Binding("e", "export", "Export"), + ] + + def compose(self) -> ComposeResult: + """Create the results viewing layout.""" + yield Header() + yield Container( + Static("πŸ“Š Job Search Results", classes="screen-title"), + Vertical( + Container( + Label("Filter Results:"), + Horizontal( + Input(placeholder="Search in results...", id="filter-input"), + Button("πŸ” Filter", id="filter-btn"), + Button("πŸ”„ Clear", id="clear-filter-btn"), + classes="filter-row" + ), + classes="filter-section" + ), + DataTable(id="results-table"), + Horizontal( + Button("πŸ“„ View Details", id="view-details-btn"), + Button("πŸ’Ύ Export Results", id="export-btn"), + Button("← Back", id="back-btn"), + classes="button-row" + ), + Static("", id="results-status"), + id="results-content" + ) + ) + yield Footer() + + def on_mount(self) -> None: + """Initialize the results table when screen is mounted.""" + self.setup_results_table() + + def setup_results_table(self) -> None: + """Set up the results data table.""" + table = self.query_one("#results-table", DataTable) + + # Add columns + table.add_column("ID", width=5) + table.add_column("Title", width=30) + table.add_column("Company", width=20) + table.add_column("Location", width=20) + table.add_column("Salary", width=15) + table.add_column("Posted", width=10) + + # Add sample data (in real implementation, this would come from database) + sample_jobs = [ + ("1", "Senior Python Developer", "Google", "San Francisco, CA", "$150k-200k", "2d ago"), + ("2", "ML 
Engineer", "OpenAI", "Remote", "$180k-250k", "1d ago"), + ("3", "Backend Engineer", "Stripe", "Seattle, WA", "$140k-180k", "3d ago"), + ("4", "Data Scientist", "Netflix", "Los Angeles, CA", "$160k-220k", "1d ago"), + ("5", "Software Engineer", "Microsoft", "Redmond, WA", "$130k-170k", "4d ago"), + ] + + for job in sample_jobs: + table.add_row(*job) + + self.query_one("#results-status", Static).update(f"Showing {len(sample_jobs)} jobs") + + def on_button_pressed(self, event: Button.Pressed) -> None: + """Handle button presses.""" + if event.button.id == "view-details-btn": + self.view_job_details() + elif event.button.id == "export-btn": + self.action_export() + elif event.button.id == "back-btn": + self.action_back() + elif event.button.id == "filter-btn": + self.filter_results() + elif event.button.id == "clear-filter-btn": + self.clear_filter() + + def view_job_details(self) -> None: + """View details of selected job.""" + table = self.query_one("#results-table", DataTable) + if table.cursor_row is not None: + # In real implementation, this would show detailed job information + self.query_one("#results-status", Static).update(f"Viewing job #{table.cursor_row + 1}") + else: + self.query_one("#results-status", Static).update("Please select a job to view details") + + def filter_results(self) -> None: + """Filter results based on input.""" + filter_text = self.query_one("#filter-input", Input).value + if filter_text: + self.query_one("#results-status", Static).update(f"Filtering by: {filter_text}") + else: + self.query_one("#results-status", Static).update("Please enter filter text") + + def clear_filter(self) -> None: + """Clear current filter.""" + self.query_one("#filter-input", Input).value = "" + self.query_one("#results-status", Static).update("Filter cleared") + + def action_export(self) -> None: + """Export results to file.""" + self.query_one("#results-status", Static).update("πŸ“ Results exported to jobs_export.json") + + def action_refresh(self) -> None: 
+ """Refresh results.""" + self.query_one("#results-status", Static).update("πŸ”„ Refreshing results...") + self.set_timer(1.0, lambda: self.query_one("#results-status", Static).update("βœ… Results refreshed")) + + def action_back(self) -> None: + """Go back to main menu.""" + self.app.pop_screen() + + +class JobMinerTUIApp(App): + """Main JobMiner TUI Application.""" + + CSS = """ + .screen-title { + text-align: center; + padding: 1; + background: $primary; + color: $text; + margin-bottom: 1; + } + + .button-row { + height: auto; + margin: 1; + align: center middle; + } + + .button-row Button { + margin: 0 1; + min-width: 20; + } + + .form-group { + margin: 1 0; + padding: 1; + border: solid $primary; + } + + .filter-section { + margin-bottom: 1; + padding: 1; + border: solid $secondary; + } + + .filter-row { + height: auto; + align: center middle; + } + + #banner { + text-align: center; + color: $accent; + margin-bottom: 1; + } + + #menu-options { + padding: 1 2; + margin-bottom: 1; + } + + #menu-container { + align: center middle; + margin: 1; + } + + #results-table { + height: 20; + margin: 1 0; + } + + #activity-log { + height: 10; + margin: 1 0; + border: solid $primary; + } + + #progress-bar { + margin: 1 0; + } + """ + + TITLE = "JobMiner Enhanced CLI" + SUB_TITLE = "Interactive Job Scraping Tool" + + SCREENS = { + "main": MainMenuScreen, + "search": SearchScreen, + "results": ResultsScreen, + } + + def on_mount(self) -> None: + """Initialize the application.""" + self.push_screen("main") + + def on_screen_resume(self, screen) -> None: + """Handle screen resume events.""" + pass + + +def main(): + """Run the JobMiner TUI application.""" + app = JobMinerTUIApp() + app.run() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tui/utils/__init__.py b/tui/utils/__init__.py new file mode 100644 index 0000000..9136246 --- /dev/null +++ b/tui/utils/__init__.py @@ -0,0 +1,5 @@ +""" +TUI Utilities package. 
+ +Contains utility functions and helpers for the TUI interface. +""" \ No newline at end of file diff --git a/tui/utils/scraping_integration.py b/tui/utils/scraping_integration.py new file mode 100644 index 0000000..c31ae65 --- /dev/null +++ b/tui/utils/scraping_integration.py @@ -0,0 +1,266 @@ +""" +Integration utilities for connecting TUI with existing JobMiner scrapers. + +This module provides the bridge between the TUI interface and the +existing scraper implementations, handling asynchronous operations +and real-time progress updates. +""" + +import asyncio +import json +import logging +from pathlib import Path +from typing import Dict, List, Optional, Callable +from datetime import datetime +import sys + +# Add repository root (two levels up from tui/utils/) to path for root-level imports +sys.path.append(str(Path(__file__).parent.parent.parent)) + +from jobminer_cli import discover_scrapers, load_scraper_class +from base_scraper import JobListing +from database import get_db_manager, save_jobs_to_db + + +class ScrapingTask: +    """Represents a scraping task with progress tracking.""" + +    def __init__(self, scraper_name: str, search_term: str, location: str = "", pages: int = 2): +        self.scraper_name = scraper_name +        self.search_term = search_term +        self.location = location +        self.pages = pages +        self.status = "pending" +        self.progress = 0 +        self.jobs_found = 0 +        self.error_message = "" +        self.results: List[JobListing] = [] +        self.created_at = datetime.now() +        self.started_at: Optional[datetime] = None +        self.completed_at: Optional[datetime] = None + +    def to_dict(self) -> Dict: +        """Convert task to dictionary for serialization.""" +        return { +            "scraper_name": self.scraper_name, +            "search_term": self.search_term, +            "location": self.location, +            "pages": self.pages, +            "status": self.status, +            "progress": self.progress, +            "jobs_found": self.jobs_found, +            "error_message": self.error_message, +            "created_at": self.created_at.isoformat(), +            "started_at": self.started_at.isoformat() if self.started_at else None, 
"completed_at": self.completed_at.isoformat() if self.completed_at else None, + } + + +class ScrapingManager: + """Manages scraping tasks and provides real-time updates.""" + + def __init__(self): + self.active_tasks: Dict[str, ScrapingTask] = {} + self.scrapers = discover_scrapers() + self.logger = logging.getLogger(__name__) + + def create_task(self, scraper_name: str, search_term: str, location: str = "", pages: int = 2) -> str: + """Create a new scraping task.""" + task_id = f"{scraper_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" + task = ScrapingTask(scraper_name, search_term, location, pages) + self.active_tasks[task_id] = task + return task_id + + def get_task(self, task_id: str) -> Optional[ScrapingTask]: + """Get a task by ID.""" + return self.active_tasks.get(task_id) + + def get_active_tasks(self) -> List[ScrapingTask]: + """Get all active tasks.""" + return [task for task in self.active_tasks.values() if task.status in ["pending", "running", "paused"]] + + async def run_task(self, task_id: str, progress_callback: Optional[Callable] = None) -> bool: + """ + Run a scraping task asynchronously with progress updates. 
+ + Args: + task_id: ID of the task to run + progress_callback: Optional callback function for progress updates + + Returns: + True if successful, False otherwise + """ + task = self.get_task(task_id) + if not task: + return False + + if task.scraper_name not in self.scrapers: + task.status = "error" + task.error_message = f"Scraper '{task.scraper_name}' not found" + return False + + try: + # Update task status + task.status = "running" + task.started_at = datetime.now() + task.progress = 0 + + if progress_callback: + await progress_callback(task_id, "Initializing scraper...", 0) + + # Load the scraper class + scraper_path = self.scrapers[task.scraper_name] + scraper_class = load_scraper_class(scraper_path) + + if not scraper_class: + task.status = "error" + task.error_message = "Failed to load scraper class" + return False + + # Initialize scraper + scraper = scraper_class() + + if progress_callback: + await progress_callback(task_id, "Starting job search...", 10) + + # Start scraping + jobs = [] + + # Simulate progress for demo (in real implementation, this would be integrated with actual scraper) + for i in range(task.pages): + if task.status == "cancelled": + break + + page_progress = int(20 + (i / task.pages) * 60) + task.progress = page_progress + + if progress_callback: + await progress_callback(task_id, f"Scraping page {i + 1}/{task.pages}...", page_progress) + + # Simulate page scraping delay + await asyncio.sleep(1) + + # For demo purposes, generate some sample jobs + page_jobs = self._generate_sample_jobs(task.search_term, task.location, 10) + jobs.extend(page_jobs) + task.jobs_found = len(jobs) + + if progress_callback: + await progress_callback(task_id, f"Found {len(jobs)} jobs so far...", page_progress) + + if task.status != "cancelled": + # Save results + task.progress = 85 + if progress_callback: + await progress_callback(task_id, "Saving results...", 85) + + await asyncio.sleep(0.5) + + # Save to database if enabled + if get_db_manager(): + 
saved_count = save_jobs_to_db(jobs, task.scraper_name) + self.logger.info(f"Saved {saved_count} jobs to database") + + # Save to JSON file + output_file = f"jobs_{task_id}.json" + with open(output_file, 'w', encoding='utf-8') as f: + json.dump([job.__dict__ for job in jobs], f, indent=2, default=str) + + task.results = jobs + task.status = "completed" + task.completed_at = datetime.now() + task.progress = 100 + + if progress_callback: + await progress_callback(task_id, f"Completed! Found {len(jobs)} jobs", 100) + + return True + + except Exception as e: + task.status = "error" + task.error_message = str(e) + task.completed_at = datetime.now() + self.logger.error(f"Error in scraping task {task_id}: {e}") + + if progress_callback: + await progress_callback(task_id, f"Error: {str(e)}", task.progress) + + return False + + def pause_task(self, task_id: str) -> bool: + """Pause a running task.""" + task = self.get_task(task_id) + if task and task.status == "running": + task.status = "paused" + return True + return False + + def resume_task(self, task_id: str) -> bool: + """Resume a paused task.""" + task = self.get_task(task_id) + if task and task.status == "paused": + task.status = "running" + return True + return False + + def cancel_task(self, task_id: str) -> bool: + """Cancel a task.""" + task = self.get_task(task_id) + if task and task.status in ["pending", "running", "paused"]: + task.status = "cancelled" + task.completed_at = datetime.now() + return True + return False + + def _generate_sample_jobs(self, search_term: str, location: str, count: int) -> List[JobListing]: + """Generate sample job listings for demonstration.""" + companies = ["Google", "Microsoft", "Amazon", "Apple", "Meta", "Netflix", "Spotify", "Uber", "Airbnb", "Tesla"] + job_types = ["Full-time", "Part-time", "Contract", "Remote"] + experience_levels = ["Entry-level", "Mid-level", "Senior", "Lead", "Principal"] + + jobs = [] + for i in range(count): + company = companies[i % len(companies)] + 
job_type = job_types[i % len(job_types)] + exp_level = experience_levels[i % len(experience_levels)] + + job = JobListing( + title=f"{exp_level} {search_term} {i + 1}", + company=company, + location=location or "Remote", + description=f"Exciting opportunity for a {search_term} at {company}. We are looking for talented individuals to join our team.", + salary=f"${80 + (i * 10)}k - ${120 + (i * 15)}k", + job_type=job_type, + experience_level=exp_level, + posted_date=f"{i + 1} days ago", + job_url=f"https://example.com/jobs/{company.lower()}-{i}", + scraped_at=datetime.now() + ) + jobs.append(job) + + return jobs + + def get_task_summary(self) -> Dict: + """Get summary of all tasks.""" + all_tasks = list(self.active_tasks.values()) + + return { + "total_tasks": len(all_tasks), + "running": len([t for t in all_tasks if t.status == "running"]), + "completed": len([t for t in all_tasks if t.status == "completed"]), + "failed": len([t for t in all_tasks if t.status == "error"]), + "paused": len([t for t in all_tasks if t.status == "paused"]), + "total_jobs_found": sum(t.jobs_found for t in all_tasks), + } + + +# Global scraping manager instance +_scraping_manager = None + + +def get_scraping_manager() -> ScrapingManager: + """Get the global scraping manager instance.""" + global _scraping_manager + if _scraping_manager is None: + _scraping_manager = ScrapingManager() + return _scraping_manager \ No newline at end of file