|
| 1 | +import re |
| 2 | + |
| 3 | +heads = """ |
| 4 | +- What's Important When Scraping At Scale? |
| 5 | +- Challenge #1 - Sloppy and Always Changing Website Formats |
| 6 | +
|
| 7 | + - No Easy Solution |
| 8 | +- Challenge 2: Scalable Architecture |
| 9 | +
|
| 10 | + - Separate Product Discovery From Product Extraction |
| 11 | + - Allocate More Resources To Product Extraction |
| 12 | +- Challenge 3: Maintaining Throughput Performance |
| 13 | +
|
| 14 | + - Crawling Efficiency |
| 15 | +
|
| 16 | +- Challenge 4: Anti-Bot Countermeasures |
| 17 | +
|
| 18 | + - Proxies |
| 19 | + - Beyond Proxies |
| 20 | +- Challenge 5: Data Quality |
| 21 | +- Wrapping Things Up |
| 22 | +""" |
| 23 | + |
| 24 | +# What's Important When Scraping At Scale? -> whats-important-when-scraping-at-scale |
# Compiled once: both patterns are reused for every outline line.
_FIRST_WORD_CHAR = re.compile(r'\w')
_NON_FRAGMENT_CHARS = re.compile(r'[^\w-]')


def _toc_entry(head):
    """Turn one outline line into a Markdown link to its heading anchor.

    Everything before the first word character (indentation and the
    ``- `` bullet) is kept as the prefix; the remainder is the link text.
    The anchor fragment is the link text lower-cased, with spaces turned
    into hyphens and every character that is neither a word character nor
    a hyphen removed, e.g.::

        "- What's Important When Scraping At Scale?"
        -> "- [What's Important When Scraping At Scale?](#whats-important-when-scraping-at-scale)"

    Returns ``None`` when the line contains no word character at all,
    because there is no heading text to link to.
    """
    match = _FIRST_WORD_CHAR.search(head)
    if match is None:
        # e.g. a line such as "- ???": nothing to build a link from.
        return None
    start = match.start()
    prefix = head[:start]
    title = head[start:]
    # Replace spaces first so they survive as hyphens, then strip punctuation.
    fragment = _NON_FRAGMENT_CHARS.sub('', title.lower().replace(' ', '-'))
    return "%s[%s](#%s)" % (prefix, title, fragment)


# Print one table-of-contents link per non-blank line of the outline.
for head in heads.split('\n'):
    if not head.strip():
        continue
    entry = _toc_entry(head)
    if entry is not None:
        print(entry)
| 32 | + |
| 33 | + |
| 34 | +# - [What's Important When Scraping At Scale?](#whats-important-when-scraping-at-scale) |
| 35 | +# - [Challenge #1 - Sloppy and Always Changing Website Formats](#challenge-1---sloppy-and-always-changing-website-formats) |
| 36 | + # - [No Easy Solution](#no-easy-solution) |
| 37 | +# - [Challenge 2: Scalable Architecture](#challenge-2-scalable-architecture) |
| 38 | + # - [Separate Product Discovery From Product Extraction](#separate-product-discovery-from-product-extraction) |
| 39 | + # - [Allocate More Resources To Product Extraction](#allocate-more-resources-to-product-extraction) |
| 40 | +# - [Challenge 3: Maintaining Throughput Performance](#challenge-3-maintaining-throughput-performance) |
| 41 | + # - [Crawling Efficiency](#crawling-efficiency) |
| 42 | +# - [Challenge 4: Anti-Bot Countermeasures](#challenge-4-anti-bot-countermeasures) |
| 43 | + # - [Proxies](#proxies) |
| 44 | + # - [Beyond Proxies](#beyond-proxies) |
| 45 | +# - [Challenge 5: Data Quality](#challenge-5-data-quality) |
| 46 | +# - [Wrapping Things Up](#wrapping-things-up) |
0 commit comments