-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathscraping_manager.py
More file actions
242 lines (201 loc) · 8.71 KB
/
scraping_manager.py
File metadata and controls
242 lines (201 loc) · 8.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
#!/usr/bin/env python3
"""
Scraping Manager - Production Ready
==================================
MCP-compliant scraping manager that coordinates all content extraction operations.
This is the main production interface for scraping strunz.com content.
Status: PRODUCTION
Usage: Called by main.py scrape command
Dependencies: src/scraper/production_scraper.py
Author: Claude Code
Last Updated: 2025-07-11
"""
import logging
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional
from scraper.production_scraper import ProductionStrunzScraper
# Module-level logger, named after this module via __name__; every
# ScrapingManager method below reports progress and failures through it.
logger = logging.getLogger(__name__)
class ScrapingManager:
    """Production-ready scraping manager following MCP standards.

    Builds a ``ProductionStrunzScraper`` for either a test run or a
    production run, writes the results, the scraper statistics and a summary
    report as JSON files into ``output_dir``, and reports the outcome of the
    run as a plain dictionary.
    """

    # Directory used when the caller does not pass ``output_dir``.
    DEFAULT_OUTPUT_DIR = Path("data/scraped")

    def __init__(self,
                 unlimited: bool = False,
                 max_pages: Optional[int] = None,
                 use_selenium: bool = True,
                 forum_only: bool = False,
                 news_only: bool = False,
                 output_dir: Optional[Path] = None):
        """
        Initialize scraping manager.

        Args:
            unlimited: Remove all page limits (``max_pages`` is then ignored).
            max_pages: Maximum pages per category.
            use_selenium: Use Selenium for JavaScript content.
            forum_only: Scrape forums only (production runs).
            news_only: Scrape news only (production runs).
            output_dir: Directory (``Path`` or path string) that receives the
                JSON output files. Defaults to ``data/scraped``. It is
                created if it does not exist.
        """
        self.unlimited = unlimited
        # An unlimited run must not carry a page cap, whatever was passed.
        self.max_pages = None if unlimited else max_pages
        self.use_selenium = use_selenium
        self.forum_only = forum_only
        self.news_only = news_only

        # Setup output directories
        self.output_dir = Path(
            self.DEFAULT_OUTPUT_DIR if output_dir is None else output_dir
        )
        self.output_dir.mkdir(parents=True, exist_ok=True)

        logger.info("ScrapingManager initialized:")
        logger.info(" Unlimited: %s", self.unlimited)
        logger.info(" Max pages: %s", self.max_pages)
        logger.info(" Use Selenium: %s", self.use_selenium)
        logger.info(" Forum only: %s", self.forum_only)
        logger.info(" News only: %s", self.news_only)

        if self.forum_only and self.news_only:
            # The production run checks forum_only first, so news_only would
            # otherwise be ignored without any trace in the logs.
            logger.warning(
                "Both forum_only and news_only are set; forum_only takes precedence"
            )

    def run_test_scraping(self) -> Dict:
        """Run limited test scraping (max 2 pages per category).

        A test run always performs the complete scraping; the ``forum_only``
        and ``news_only`` flags are not consulted.

        Returns:
            A dict with ``status`` (``'success'`` or ``'failed'``), ``type``
            (``'test'``) and ``total_items``; on success also ``statistics``
            and ``output_dir``, on failure also ``error``.
        """
        logger.info("🧪 Running test scraping with limited pages")
        return self._run_scraper(
            "test",
            max_pages=2,  # Always limit for tests
            min_content_score=0.3,
            honor_filters=False,
        )

    def run_production_scraping(self) -> Dict:
        """Run full production scraping.

        Honors ``forum_only`` / ``news_only``; ``forum_only`` wins when both
        are set.

        Returns:
            A dict with ``status`` (``'success'`` or ``'failed'``), ``type``
            (``'production'``) and ``total_items``; on success also
            ``statistics`` and ``output_dir``, on failure also ``error``.
        """
        if self.unlimited:
            logger.info("🚀 Running UNLIMITED production scraping")
        else:
            logger.info(
                "🔄 Running limited production scraping (max %s pages)",
                self.max_pages,
            )
        return self._run_scraper(
            "production",
            max_pages=self.max_pages,
            min_content_score=0.4,
            honor_filters=True,
        )

    def _run_scraper(self,
                     run_type: str,
                     max_pages: Optional[int],
                     min_content_score: float,
                     honor_filters: bool) -> Dict:
        """Build a scraper, run it, save the output and report the outcome.

        Args:
            run_type: ``'test'`` or ``'production'``; used in file names, in
                the returned ``type`` field and in the failure log message.
            max_pages: Value passed as ``max_pages_per_category``.
            min_content_score: Value passed as ``min_content_score``.
            honor_filters: When True, ``forum_only`` / ``news_only`` select a
                partial scrape; when False the complete scraping always runs.

        Returns:
            The status dictionary described on the public ``run_*`` methods.
        """
        scraper = ProductionStrunzScraper(
            use_selenium=self.use_selenium,
            max_pages_per_category=max_pages,
            min_content_score=min_content_score,
        )
        try:
            if honor_filters and self.forum_only:
                results = self._scrape_forums_only(scraper)
            elif honor_filters and self.news_only:
                results = self._scrape_news_only(scraper)
            else:
                results = scraper.run_complete_scraping()
            stats = scraper.get_statistics()

            self._save_scraping_results(results, stats, run_type)

            return {
                'status': 'success',
                'type': run_type,
                'total_items': self._count_total_items(results),
                'statistics': stats,
                'output_dir': str(self.output_dir),
            }
        except Exception as e:
            # Deliberate top-level boundary: callers get a 'failed' status
            # instead of an exception; the traceback is kept in the log.
            logger.exception("%s scraping failed: %s", run_type.capitalize(), e)
            return {
                'status': 'failed',
                'type': run_type,
                'error': str(e),
                'total_items': 0,
            }
        finally:
            # Release the scraper whether the run succeeded or not.
            scraper.close()

    def _scrape_forums_only(self, scraper: "ProductionStrunzScraper") -> Dict:
        """Scrape forums only.

        Each category listed in ``scraper.FORUM_CATEGORIES`` is scraped
        independently; a category that fails is logged and recorded with an
        empty list so the remaining categories are still processed.

        Returns:
            ``{'news': [], 'forums': {category: posts}}``.
        """
        logger.info("📋 Scraping forums only")
        results = {'news': [], 'forums': {}}
        for category in scraper.FORUM_CATEGORIES:
            logger.info("Scraping forum: %s", category)
            try:
                posts = scraper.scrape_forum_complete(category)
                results['forums'][category] = posts
                logger.info("✅ %s: %s posts", category, len(posts))
            except Exception as e:
                # Best effort per category: keep going with the others.
                logger.exception("❌ Failed to scrape %s: %s", category, e)
                results['forums'][category] = []
        return results

    def _scrape_news_only(self, scraper: "ProductionStrunzScraper") -> Dict:
        """Scrape news only.

        Returns:
            ``{'news': articles, 'forums': {}}``; ``news`` is an empty list
            when scraping raised (the failure is logged, not propagated).
        """
        logger.info("📰 Scraping news only")
        try:
            news_articles = scraper.scrape_news_complete()
            logger.info("✅ News: %s articles", len(news_articles))
            return {'news': news_articles, 'forums': {}}
        except Exception as e:
            logger.exception("❌ Failed to scrape news: %s", e)
            return {'news': [], 'forums': {}}

    def _count_total_items(self, results: Dict) -> int:
        """Count total items in results.

        Adds the number of entries under ``'news'`` to the number of posts in
        every category under ``'forums'``. A missing or ``None`` value is
        counted as empty instead of raising ``TypeError``.
        """
        news = results.get('news') or []
        forums = results.get('forums') or {}
        return len(news) + sum(len(posts or []) for posts in forums.values())

    @staticmethod
    def _write_json(path: Path, payload) -> None:
        """Write ``payload`` to ``path`` as indented UTF-8 JSON.

        ``default=str`` stringifies any value ``json`` cannot serialize
        natively, so an unexpected type in the payload does not abort the save.
        """
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False, default=str)

    def _save_scraping_results(self, results: Dict, stats: Dict, run_type: str) -> None:
        """Save scraping results with comprehensive metadata.

        Writes three timestamped files into ``output_dir``:
        ``{run_type}_scraping_*.json`` (raw results),
        ``{run_type}_statistics_*.json`` (scraper statistics) and
        ``{run_type}_summary_*.json`` (configuration, counts, file paths).

        Args:
            results: Mapping with optional ``'news'`` and ``'forums'`` keys.
            stats: Statistics reported by the scraper.
            run_type: Prefix for the file names and the summary's ``run_type``.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save raw results
        results_file = self.output_dir / f"{run_type}_scraping_{timestamp}.json"
        self._write_json(results_file, results)

        # Save statistics
        stats_file = self.output_dir / f"{run_type}_statistics_{timestamp}.json"
        self._write_json(stats_file, stats)

        # Create summary report (None values are treated as empty, matching
        # _count_total_items).
        news = results.get('news') or []
        forums = results.get('forums') or {}
        summary = {
            'run_type': run_type,
            'timestamp': timestamp,
            'configuration': {
                'unlimited': self.unlimited,
                'max_pages': self.max_pages,
                'use_selenium': self.use_selenium,
                'forum_only': self.forum_only,
                'news_only': self.news_only,
            },
            'results_summary': {
                'total_items': self._count_total_items(results),
                'news_articles': len(news),
                # Only categories that actually yielded posts are counted.
                'forum_categories': sum(1 for posts in forums.values() if posts),
                'total_forum_posts': sum(len(posts or []) for posts in forums.values()),
            },
            'files_created': {
                'results': str(results_file),
                'statistics': str(stats_file),
            },
        }
        summary_file = self.output_dir / f"{run_type}_summary_{timestamp}.json"
        self._write_json(summary_file, summary)

        logger.info("📁 Results saved:")
        logger.info(" Results: %s", results_file)
        logger.info(" Statistics: %s", stats_file)
        logger.info(" Summary: %s", summary_file)

    def get_latest_results(self) -> Optional[Dict]:
        """Get the most recent scraping results.

        Picks the ``*_scraping_*.json`` file in ``output_dir`` with the newest
        modification time and returns its parsed content.

        Returns:
            The parsed JSON, or ``None`` when no results file exists or the
            newest one cannot be read or parsed (the failure is logged).
        """
        result_files = list(self.output_dir.glob("*_scraping_*.json"))
        if not result_files:
            return None

        latest_file = None
        try:
            # stat() is inside the try so a file that vanishes between the
            # glob and this call yields None instead of an exception.
            latest_file = max(result_files, key=lambda path: path.stat().st_mtime)
            with open(latest_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            # Best-effort accessor: any read/parse failure is reported as None.
            logger.exception(
                "Failed to load latest results from %s: %s", latest_file, e
            )
            return None