@@ -50,6 +50,14 @@ class RewardSignal(Enum):
5050 r"taostats\.io/subnets?/\d+" ,
5151 # Weather: /City or /City?format=...
5252 r"wttr\.in/[A-Za-z+]+(?:\?|$)" ,
53+ # ArXiv: /abs/2603.16870
54+ r"arxiv\.org/abs/\d{4}\.\d{4,5}" ,
55+ # OpenLibrary: /works/OL103123W or /works/OL103123W/Dune
56+ r"openlibrary\.org/works/ol\d+w(?:/|$)" ,
57+ # Open-Meteo: /en/docs?latitude=35.68&longitude=139.65
58+ r"open-meteo\.com/en/docs\?.*latitude=" ,
59+ # HackerNews: /item?id=12345
60+ r"news\.ycombinator\.com/item\?id=\d+" ,
5361]
5462
5563
@@ -387,16 +395,17 @@ def _normalize_url(self, url: str) -> str:
387395 Normalize URL for duplicate detection.
388396
389397 For most sites: remove query/fragment (path-based routing)
390- For query-based sites (stooq, wttr.in): keep essential query params
398+ For query-based sites (stooq, hackernews, open-meteo): keep essential query params; wttr.in keeps only its path
391399 """
392400 try :
401+ from urllib .parse import parse_qs , urlencode
402+
393403 p = urlparse (url )
394404 domain = p .netloc .lower ()
395405
396406 # Stooq uses query params for asset identification: /q/?s=aapl.us
397407 if "stooq.com" in domain and p .query :
398408 # Keep the 's' parameter which identifies the asset
399- from urllib .parse import parse_qs , urlencode
400409 params = parse_qs (p .query )
401410 if 's' in params :
402411 kept_query = urlencode ({'s' : params ['s' ][0 ]})
@@ -407,6 +416,23 @@ def _normalize_url(self, url: str) -> str:
407416 if "wttr.in" in domain :
408417 return urlunparse ((p .scheme , p .netloc , p .path , '' , '' , '' ))
409418
419+ # HackerNews uses query params for item identification: /item?id=12345
420+ if "ycombinator.com" in domain and p .query :
421+ params = parse_qs (p .query )
422+ if 'id' in params :
423+ kept_query = urlencode ({'id' : params ['id' ][0 ]})
424+ return urlunparse ((p .scheme , p .netloc , p .path , '' , kept_query , '' ))
425+
426+ # Open-Meteo uses query params for location: ?latitude=X&longitude=Y
427+ if "open-meteo.com" in domain and p .query :
428+ params = parse_qs (p .query )
429+ if 'latitude' in params and 'longitude' in params :
430+ kept_query = urlencode ({
431+ 'latitude' : params ['latitude' ][0 ],
432+ 'longitude' : params ['longitude' ][0 ],
433+ })
434+ return urlunparse ((p .scheme , p .netloc , p .path , '' , kept_query , '' ))
435+
410436 # Default: remove query/fragment (path-based sites like CoinGecko)
411437 return urlunparse ((p .scheme , p .netloc , p .path , '' , '' , '' ))
412438 except Exception :
@@ -443,4 +469,26 @@ def _extract_asset_from_url(self, url: str) -> Optional[str]:
443469 if match :
444470 return match .group (1 ).replace ("+" , " " )
445471
472+ # ArXiv: /abs/2603.16870 -> 2603.16870
473+ match = re .search (r"arxiv\.org/abs/(\d{4}\.\d{4,5})" , url_lower )
474+ if match :
475+ return match .group (1 )
476+
477+ # OpenLibrary: /works/OL103123W -> ol103123w
478+ match = re .search (r"openlibrary\.org/works/(ol\d+w)" , url_lower )
479+ if match :
480+ return match .group (1 )
481+
482+ # Open-Meteo: /en/docs?latitude=35.68&longitude=139.65 -> 35.68,139.65
483+ if "open-meteo.com" in url_lower :
484+ lat = re .search (r"latitude=([0-9.-]+)" , url_lower )
485+ lon = re .search (r"longitude=([0-9.-]+)" , url_lower )
486+ if lat and lon :
487+ return f"{ lat .group (1 )} ,{ lon .group (1 )} "
488+
489+ # HackerNews: /item?id=12345 -> 12345
490+ match = re .search (r"news\.ycombinator\.com/item\?id=(\d+)" , url_lower )
491+ if match :
492+ return match .group (1 )
493+
446494 return None
0 commit comments