Skip to content

Commit 18ce60d

Browse files
authored
fix(reward): add detail page patterns for arxiv, openlibrary, openmeteo, hackernews (#17)
1 parent 0656cc0 commit 18ce60d

1 file changed

Lines changed: 50 additions & 2 deletions

File tree

liveweb_arena/core/reward.py

Lines changed: 50 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,14 @@ class RewardSignal(Enum):
5050
r"taostats\.io/subnets?/\d+",
5151
# Weather: /City or /City?format=...
5252
r"wttr\.in/[A-Za-z+]+(?:\?|$)",
53+
# ArXiv: /abs/2603.16870
54+
r"arxiv\.org/abs/\d{4}\.\d{4,5}",
55+
# OpenLibrary: /works/OL103123W or /works/OL103123W/Dune
56+
r"openlibrary\.org/works/ol\d+w(?:/|$)",
57+
# Open-Meteo: /en/docs?latitude=35.68&longitude=139.65
58+
r"open-meteo\.com/en/docs\?.*latitude=",
59+
# HackerNews: /item?id=12345
60+
r"news\.ycombinator\.com/item\?id=\d+",
5361
]
5462

5563

@@ -387,16 +395,17 @@ def _normalize_url(self, url: str) -> str:
387395
Normalize URL for duplicate detection.
388396
389397
For most sites: remove query/fragment (path-based routing)
390-
For query-based sites (stooq, wttr.in): keep essential query params
398+
For query-based sites (stooq, wttr.in, hackernews, open-meteo): keep essential query params
391399
"""
392400
try:
401+
from urllib.parse import parse_qs, urlencode
402+
393403
p = urlparse(url)
394404
domain = p.netloc.lower()
395405

396406
# Stooq uses query params for asset identification: /q/?s=aapl.us
397407
if "stooq.com" in domain and p.query:
398408
# Keep the 's' parameter which identifies the asset
399-
from urllib.parse import parse_qs, urlencode
400409
params = parse_qs(p.query)
401410
if 's' in params:
402411
kept_query = urlencode({'s': params['s'][0]})
@@ -407,6 +416,23 @@ def _normalize_url(self, url: str) -> str:
407416
if "wttr.in" in domain:
408417
return urlunparse((p.scheme, p.netloc, p.path, '', '', ''))
409418

419+
# HackerNews uses query params for item identification: /item?id=12345
420+
if "ycombinator.com" in domain and p.query:
421+
params = parse_qs(p.query)
422+
if 'id' in params:
423+
kept_query = urlencode({'id': params['id'][0]})
424+
return urlunparse((p.scheme, p.netloc, p.path, '', kept_query, ''))
425+
426+
# Open-Meteo uses query params for location: ?latitude=X&longitude=Y
427+
if "open-meteo.com" in domain and p.query:
428+
params = parse_qs(p.query)
429+
if 'latitude' in params and 'longitude' in params:
430+
kept_query = urlencode({
431+
'latitude': params['latitude'][0],
432+
'longitude': params['longitude'][0],
433+
})
434+
return urlunparse((p.scheme, p.netloc, p.path, '', kept_query, ''))
435+
410436
# Default: remove query/fragment (path-based sites like CoinGecko)
411437
return urlunparse((p.scheme, p.netloc, p.path, '', '', ''))
412438
except Exception:
@@ -443,4 +469,26 @@ def _extract_asset_from_url(self, url: str) -> Optional[str]:
443469
if match:
444470
return match.group(1).replace("+", " ")
445471

472+
# ArXiv: /abs/2603.16870 -> 2603.16870
473+
match = re.search(r"arxiv\.org/abs/(\d{4}\.\d{4,5})", url_lower)
474+
if match:
475+
return match.group(1)
476+
477+
# OpenLibrary: /works/OL103123W -> ol103123w
478+
match = re.search(r"openlibrary\.org/works/(ol\d+w)", url_lower)
479+
if match:
480+
return match.group(1)
481+
482+
# Open-Meteo: /en/docs?latitude=35.68&longitude=139.65 -> 35.68,139.65
483+
if "open-meteo.com" in url_lower:
484+
lat = re.search(r"latitude=([0-9.-]+)", url_lower)
485+
lon = re.search(r"longitude=([0-9.-]+)", url_lower)
486+
if lat and lon:
487+
return f"{lat.group(1)},{lon.group(1)}"
488+
489+
# HackerNews: /item?id=12345 -> 12345
490+
match = re.search(r"news\.ycombinator\.com/item\?id=(\d+)", url_lower)
491+
if match:
492+
return match.group(1)
493+
446494
return None

0 commit comments

Comments (0)