@@ -11,14 +11,35 @@ def fetch_json(url):
1111 with urllib .request .urlopen (req ) as response :
1212 return json .loads (response .read ().decode ())
1313
def wiki_to_markdown(wiki_text, page_url):
    """Convert a fragment of MediaWiki markup into Markdown.

    The following constructs are translated:

    * ``<lang>``, ``<syntaxhighlight>`` and ``<highlight>`` blocks become
      fenced ```` ```zc ```` code blocks. Their contents are emitted verbatim
      (only surrounding whitespace is stripped).
    * ``[[Target|Display]]`` and ``[[Target]]`` become Markdown links into
      ``https://rosettacode.org/wiki/``.
    * ``'''bold'''`` and ``''italic''`` become ``**bold**`` and ``*italic*``.
    * Runs of three or more newlines in prose collapse to one blank line.

    Args:
        wiki_text: Raw wiki markup to convert.
        page_url: URL of the page the markup came from. Currently unused;
            kept so existing callers keep working.

    Returns:
        The converted Markdown with leading/trailing whitespace removed.
    """
    from urllib.parse import quote

    code_pattern = re.compile(
        r'(?:<lang[^>]*>|<syntaxhighlight[^>]*>|<highlight[^>]*>)'
        r'(.*?)'
        r'(?:</lang>|</syntaxhighlight>|</highlight>)',
        re.DOTALL | re.IGNORECASE,
    )

    def wiki_url(target):
        # Wiki page URLs use underscores for spaces; percent-encode the rest
        # so characters such as ")" or "'" cannot break the Markdown link.
        slug = target.strip().replace(" ", "_")
        return "https://rosettacode.org/wiki/" + quote(slug, safe="/:#")

    def convert_prose(text):
        # Internal links: [[Target|Display]] first, then plain [[Target]].
        text = re.sub(
            r'\[\[([^\]|]+)\|([^\]]+)\]\]',
            lambda m: f"[{m.group(2)}]({wiki_url(m.group(1))})",
            text,
        )
        text = re.sub(
            r'\[\[([^\]]+)\]\]',
            lambda m: f"[{m.group(1)}]({wiki_url(m.group(1))})",
            text,
        )
        # Bold must be handled before italics: ''' contains ''.
        text = re.sub(r"'''(.*?)'''", r"**\1**", text)
        text = re.sub(r"''(.*?)''", r"*\1*", text)
        # Collapse excess blank lines.
        return re.sub(r'\n{3,}', '\n\n', text)

    # Splitting on a pattern with one capturing group yields alternating
    # pieces: even indexes are prose, odd indexes are code block contents.
    # Only prose is rewritten, so code containing "[[", "''" or blank lines
    # survives untouched.
    pieces = code_pattern.split(wiki_text)
    last = len(pieces) - 1
    out = []
    for index, piece in enumerate(pieces):
        if index % 2:
            out.append(f"```zc\n{piece.strip()}\n```")
        else:
            # The newline that separates a fence from its neighbouring prose
            # is attached to the prose side, so newline runs that touch a
            # fence are collapsed together with the prose they belong to.
            lead = "\n" if index > 0 else ""
            trail = "\n" if index < last else ""
            out.append(convert_prose(lead + piece + trail))

    return "".join(out).strip()
34+
1435def main ():
1536 print ("-> Fetching tasks from Rosetta Code..." )
1637
1738 url = f"{ API_URL } ?action=query&list=categorymembers&cmtitle={ CATEGORY } &cmlimit=500&format=json"
1839 data = fetch_json (url )
1940 pages = data ['query' ]['categorymembers' ]
2041
21- os .makedirs ("examples/rosetta" , exist_ok = True )
42+ os .makedirs ("examples/examples/ rosetta" , exist_ok = True )
2243 os .makedirs ("website_out" , exist_ok = True )
2344
2445 for page in pages :
@@ -32,34 +53,35 @@ def main():
3253 parts = re .split (r'==\{\{header\|Zen[ _-]?C\}\}==' , text , flags = re .IGNORECASE )
3354
3455 if len (parts ) > 1 :
35- zen_c_section = parts [1 ].split ('=={{header|' )[0 ]
56+ zen_c_section = parts [1 ].split ('=={{header|' )[0 ]. strip ()
3657
37- match = re .search (r'(?:<lang[^>]*>|<syntaxhighlight[^>]*>|<highlight[^>]*>)(.*?)(?:</lang>|</syntaxhighlight>|</highlight>)' , zen_c_section , re .DOTALL | re .IGNORECASE )
58+ code_blocks = re .findall (r'(?:<lang[^>]*>|<syntaxhighlight[^>]*>|<highlight[^>]*>)(.*?)(?:</lang>|</syntaxhighlight>|</highlight>)' ,
59+ zen_c_section , re .DOTALL | re .IGNORECASE )
3860
39- if match :
40- code = match . group ( 1 ) .strip ()
61+ if code_blocks :
62+ combined_code = " \n \n " . join ( block .strip () for block in code_blocks )
4163 safe_title = title .replace ("/" , "_" ).replace (" " , "_" )
4264 page_url = f"https://rosettacode.org/wiki/{ title .replace (' ' , '_' )} "
4365 history_url = f"{ page_url } ?action=history"
4466
45- zc_filename = f"examples/rosetta/{ safe_title } .zc"
67+ zc_filename = f"examples/examples/ rosetta/{ safe_title } .zc"
4668 with open (zc_filename , "w" , encoding = "utf-8" ) as f :
47- f .write (code + "\n " )
69+ f .write (combined_code + "\n " )
4870
4971 md_filename = f"website_out/{ safe_title } .md"
72+ content_md = wiki_to_markdown (zen_c_section , page_url )
73+
5074 with open (md_filename , "w" , encoding = "utf-8" ) as f :
5175 f .write ("+++\n " )
5276 f .write (f'title = "{ title } "\n ' )
5377 f .write ("+++\n \n " )
5478 f .write (f"# { title } \n \n " )
55- f .write ("```zc\n " )
56- f .write (code + "\n " )
57- f .write ("```\n \n " )
79+ f .write (content_md + "\n \n " )
5880 f .write ("---\n " )
5981 f .write (f"**Attribution:** This is a community solution for the Rosetta Code task [**{ title } **]({ page_url } ) in Zen C.\n \n " )
6082 f .write (f"*This article uses material from the Rosetta Code article **{ title } **, which is released under the [GNU Free Documentation License 1.3](https://www.gnu.org/licenses/fdl-1.3.html). A list of the original authors can be found in the [page history]({ history_url } ).*\n " )
6183
62- print (f"-> Scraped: { title } " )
84+ print (f"-> Scraped: { title } ( { len ( code_blocks ) } blocks) " )
6385 else :
6486 print (f"-> Found header, but NO code block in: { title } " )
6587 else :
0 commit comments