Skip to content

Commit 00330f5

Browse files
Improved examples
- change S3 path (table moved from dev to PDS bucket s3://commoncrawl/) - change monthly crawl in examples
1 parent abe415d commit 00330f5

17 files changed

+66
-54
lines changed

README.md

+13-12
Original file line numberDiff line numberDiff line change
@@ -49,18 +49,19 @@ First, the table needs to be imported into Athena:
4949
2. make Athena recognize the data partitions on `s3://`: `MSCK REPAIR TABLE ccindex;` (do not forget to adapt the table name). This step needs to be repeated whenever new data partitions have been added.
5050

5151
A couple of sample queries are also provided:
52-
- page/host/domain counts per top-level domain: [count-by-tld-page-host-domain.sql](src/sql/examples/count-by-tld-page-host-domain.sql)
52+
- count captures over partitions (crawls and subsets), to get a quick overview of how many pages are contained in the monthly crawl archives (and are also indexed in the table): [count-by-partition.sql](src/sql/examples/cc-index/count-by-partition.sql)
53+
- page/host/domain counts per top-level domain: [count-by-tld-page-host-domain.sql](src/sql/examples/cc-index/count-by-tld-page-host-domain.sql)
5354
- "word" count of
54-
- host name elements (split host name at `.` into words): [count-hostname-elements.sql](src/sql/examples/count-hostname-elements.sql)
55-
- URL path elements (separated by `/`): [count-url-path-elements.sql](src/sql/examples/count-url-path-elements.sql)
56-
- count HTTP status codes: [count-fetch-status.sql](src/sql/examples/count-fetch-status.sql)
57-
- count the domains of a specific top-level domain: [count-domains-of-tld.sql](src/sql/examples/count-domains-of-tld.sql)
58-
- compare document MIME types (Content-Type in HTTP response header vs. MIME type detected by [Tika](http://tika.apache.org/): [compare-mime-type-http-vs-detected.sql](src/sql/examples/compare-mime-type-http-vs-detected.sql)
59-
- distribution/histogram of host name lengths: [host_length_distrib.sql](src/sql/examples/host_length_distrib.sql)
60-
- count URL paths to robots.txt files [count-robotstxt-url-paths.sql](src/sql/examples/count-robotstxt-url-paths.sql)
55+
- host name elements (split host name at `.` into words): [count-hostname-elements.sql](src/sql/examples/cc-index/count-hostname-elements.sql)
56+
- URL path elements (separated by `/`): [count-url-path-elements.sql](src/sql/examples/cc-index/count-url-path-elements.sql)
57+
- count HTTP status codes: [count-fetch-status.sql](src/sql/examples/cc-index/count-fetch-status.sql)
58+
- count the domains of a specific top-level domain: [count-domains-of-tld.sql](src/sql/examples/cc-index/count-domains-of-tld.sql)
59+
- compare document MIME types (Content-Type in HTTP response header vs. MIME type detected by [Tika](http://tika.apache.org/)): [compare-mime-type-http-vs-detected.sql](src/sql/examples/cc-index/compare-mime-type-http-vs-detected.sql)
60+
- distribution/histogram of host name lengths: [host_length_distrib.sql](src/sql/examples/cc-index/host_length_distrib.sql)
61+
- count URL paths to robots.txt files: [count-robotstxt-url-paths.sql](src/sql/examples/cc-index/count-robotstxt-url-paths.sql)
6162
- export WARC record specs (file, offset, length) for
62-
- a single domain: [get-records-of-domain.sql](src/sql/examples/get-records-of-domain.sql)
63-
- a specific MIME type: [get-records-of-mime-type.sql](src/sql/examples/get-records-of-mime-type.sql)
64-
- find multi-lingual domains by analyzing URL paths: [get_language_translations_url_path.sql](src/sql/examples/get_language_translations_url_path.sql)
65-
- find similar domain names by Levenshtein distance (few characters changed): [similar-domains.sql](src/sql/examples/similar-domains.sql)
63+
- a single domain: [get-records-of-domain.sql](src/sql/examples/cc-index/get-records-of-domain.sql)
64+
- a specific MIME type: [get-records-of-mime-type.sql](src/sql/examples/cc-index/get-records-of-mime-type.sql)
65+
- find multi-lingual domains by analyzing URL paths: [get-language-translations-url-path.sql](src/sql/examples/cc-index/get-language-translations-url-path.sql)
66+
- find similar domain names by Levenshtein distance (few characters changed): [similar-domains.sql](src/sql/examples/cc-index/similar-domains.sql)
6667

src/sql/athena/cc-index-create-table-flat.sql

+21-19
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
-- * format (STORED AS)
66
-- * s3:// path (LOCATION)
77
--
8-
CREATE EXTERNAL TABLE IF NOT EXISTS 'ccindex' (
9-
url_surtkey STRING,
10-
url STRING,
11-
url_host_name STRING,
8+
CREATE EXTERNAL TABLE IF NOT EXISTS ccindex (
9+
url_surtkey STRING,
10+
url STRING,
11+
url_host_name STRING,
1212
url_host_tld STRING,
1313
url_host_2nd_last_part STRING,
1414
url_host_3rd_last_part STRING,
@@ -18,20 +18,22 @@ CREATE EXTERNAL TABLE IF NOT EXISTS 'ccindex' (
1818
url_host_registered_domain STRING,
1919
url_host_private_suffix STRING,
2020
url_host_private_domain STRING,
21-
url_protocol STRING,
22-
url_port INT,
23-
url_path STRING,
24-
url_query STRING,
25-
fetch_time TIMESTAMP,
26-
fetch_status SMALLINT,
27-
content_digest STRING,
28-
content_mime_type STRING,
29-
content_mime_detected STRING,
30-
warc_filename STRING,
31-
warc_record_offset INT,
32-
warc_record_length INT,
33-
warc_segment STRING)
34-
PARTITIONED BY(crawl STRING, subset STRING)
21+
url_protocol STRING,
22+
url_port INT,
23+
url_path STRING,
24+
url_query STRING,
25+
fetch_time TIMESTAMP,
26+
fetch_status SMALLINT,
27+
content_digest STRING,
28+
content_mime_type STRING,
29+
content_mime_detected STRING,
30+
warc_filename STRING,
31+
warc_record_offset INT,
32+
warc_record_length INT,
33+
warc_segment STRING)
34+
PARTITIONED BY (
35+
crawl STRING,
36+
subset STRING)
3537
STORED AS parquet
36-
LOCATION 's3://path_to_table/';
38+
LOCATION 's3://commoncrawl/cc-index/table/cc-main/warc/';
3739

src/sql/athena/cc-index-create-table-nested.sql

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
-- * format (STORED AS)
66
-- * s3:// path (LOCATION)
77
--
8-
CREATE EXTERNAL TABLE IF NOT EXISTS 'ccindex' (
8+
CREATE EXTERNAL TABLE IF NOT EXISTS ccindex (
99
url STRUCT<surtkey:STRING, url:STRING, host:STRUCT<name:STRING, tld:STRING, 2nd_last_part:STRING, 3rd_last_part:STRING, 4th_last_part:STRING, 5th_last_part:STRING, registry_suffix:STRING, registered_domain:STRING, private_suffix:STRING, private_domain:STRING>, protocol:STRING, port:INT, path:STRING, query:STRING>,
1010
fetch STRUCT<time:TIMESTAMP, status:SMALLINT>,
1111
content STRUCT<digest:STRING, mime_type:STRING, mime_detected:STRING>,

src/sql/examples/compare-mime-type-http-vs-detected.sql renamed to src/sql/examples/cc-index/compare-mime-type-http-vs-detected.sql

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ SELECT COUNT(*) as n_pages,
55
content_mime_type,
66
content_mime_detected
77
FROM "ccindex"."ccindex"
8-
WHERE crawl = 'CC-MAIN-2017-47'
8+
WHERE crawl = 'CC-MAIN-2018-05'
99
AND subset = 'warc'
1010
GROUP BY content_mime_type,
1111
content_mime_detected
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
-- count captures per partition (crawls and subsets):
2+
-- how many pages are contained in the monthly crawl
3+
-- archives (and are also indexed in the table)?
4+
SELECT COUNT(*) as n_captures,
5+
crawl,
6+
subset
7+
FROM "ccindex"."ccindex"
8+
GROUP BY crawl, subset;

src/sql/examples/count-by-tld-page-host-domain.sql renamed to src/sql/examples/cc-index/count-by-tld-page-host-domain.sql

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ SELECT COUNT(*) AS n_pages,
44
COUNT(DISTINCT url_host_name) AS n_hosts,
55
COUNT(DISTINCT url_host_registered_domain) AS n_domains
66
FROM "ccindex"."ccindex"
7-
WHERE crawl = 'CC-MAIN-2017-47'
7+
WHERE crawl = 'CC-MAIN-2018-05'
88
AND subset = 'warc'
99
GROUP BY url_host_tld
1010
ORDER BY n_pages DESC;

src/sql/examples/count-domains-of-tld.sql renamed to src/sql/examples/cc-index/count-domains-of-tld.sql

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
SELECT COUNT(*) AS count,
55
url_host_registered_domain
66
FROM "ccindex"."ccindex"
7-
WHERE crawl = 'CC-MAIN-2017-47'
7+
WHERE crawl = 'CC-MAIN-2018-05'
88
AND subset = 'warc'
99
AND url_host_tld = 'no'
1010
GROUP BY url_host_registered_domain
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
-- fetch status counts
2+
-- (only for the 'crawldiagnostics' subset,
3+
-- captures in the 'warc' subset have status 200)
4+
SELECT COUNT(*) as count,
5+
fetch_status
6+
FROM "ccindex"."ccindex"
7+
WHERE crawl = 'CC-MAIN-2018-05'
8+
AND subset = 'crawldiagnostics'
9+
GROUP BY fetch_status
10+
ORDER BY count DESC;

src/sql/examples/count-hostname-elements.sql renamed to src/sql/examples/cc-index/count-hostname-elements.sql

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ SELECT host_name_part,
1010
FROM "ccindex"."ccindex",
1111
UNNEST(reverse(split(url_host_name, '.')))
1212
WITH ORDINALITY AS t (host_name_part, part_position)
13-
WHERE crawl = 'CC-MAIN-2017-47'
13+
WHERE crawl = 'CC-MAIN-2018-05'
1414
AND subset = 'warc'
1515
AND url_host_tld = 'fr'
1616
GROUP BY host_name_part

src/sql/examples/count-robotstxt-url-paths.sql renamed to src/sql/examples/cc-index/count-robotstxt-url-paths.sql

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
SELECT COUNT(*) AS count,
55
URL_EXTRACT_PATH(url) AS urlpath
66
FROM "ccindex"."ccindex"
7-
WHERE crawl = 'CC-MAIN-2017-47'
7+
WHERE crawl = 'CC-MAIN-2018-05'
88
AND subset = 'robotstxt'
99
GROUP BY URL_EXTRACT_PATH(url)
1010
HAVING (COUNT(*) >= 100)
@@ -15,7 +15,7 @@ ORDER BY count DESC
1515
-- SELECT COUNT(*) AS count,
1616
-- url_path
1717
-- FROM "ccindex"."ccindex"
18-
-- WHERE crawl = 'CC-MAIN-2017-47'
18+
-- WHERE crawl = 'CC-MAIN-2018-05'
1919
-- AND subset = 'robotstxt'
2020
-- GROUP BY url_path
2121
-- HAVING (COUNT(*) >= 100)

src/sql/examples/count-url-path-elements.sql renamed to src/sql/examples/cc-index/count-url-path-elements.sql

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ SELECT url_path_element,
77
COUNT(url_path_element) as frequency
88
FROM "ccindex"."ccindex",
99
UNNEST(transform(filter(split(url_path, '/'), w -> w != ''), w -> url_decode(w))) AS t (url_path_element)
10-
WHERE crawl = 'CC-MAIN-2017-47'
10+
WHERE crawl = 'CC-MAIN-2018-05'
1111
AND subset = 'warc'
1212
AND url_host_tld = 'fr'
1313
GROUP BY url_path_element

src/sql/examples/get-language-translations-url-path.sql renamed to src/sql/examples/cc-index/get-language-translations-url-path.sql

+2-2
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@
1212
SELECT url_host_registered_domain,
1313
COUNT(DISTINCT(url_path_lang)) as n_lang,
1414
COUNT(*) as n_pages,
15-
histogram(DISTINCT(url_path_lang)) as lang_counts
15+
histogram(url_path_lang) as lang_counts
1616
FROM "ccindex"."ccindex",
1717
UNNEST(regexp_extract_all(url_path, '(?<=/)(?:[a-z][a-z])(?=/)')) AS t (url_path_lang)
18-
WHERE crawl = 'CC-MAIN-2017-47'
18+
WHERE crawl = 'CC-MAIN-2018-05'
1919
AND subset = 'warc'
2020
AND url_host_registry_suffix = 'va'
2121
GROUP BY url_host_registered_domain

src/sql/examples/get-records-of-domain.sql renamed to src/sql/examples/cc-index/get-records-of-domain.sql

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ SELECT url,
66
warc_record_offset,
77
warc_record_length
88
FROM "ccindex"."ccindex"
9-
WHERE crawl = 'CC-MAIN-2017-47'
9+
WHERE crawl = 'CC-MAIN-2018-05'
1010
AND subset = 'warc'
1111
AND url_host_registered_domain = 'commoncrawl.org'
1212
LIMIT 1000;

src/sql/examples/get-records-of-mime-type.sql renamed to src/sql/examples/cc-index/get-records-of-mime-type.sql

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ SELECT url,
1313
warc_record_offset,
1414
warc_record_length
1515
FROM "ccindex"."ccindex"
16-
WHERE crawl = 'CC-MAIN-2017-47'
16+
WHERE crawl = 'CC-MAIN-2018-05'
1717
AND subset = 'warc'
1818
AND (content_mime_detected = 'application/pdf' OR
1919
content_mime_type LIKE '%pdf%')

src/sql/examples/host_length_distrib.sql renamed to src/sql/examples/cc-index/host_length_distrib.sql

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
WITH t AS (SELECT url_host_name AS host_name,
88
cardinality(split(url_host_name, '.')) AS host_name_length
99
FROM "ccindex"."ccindex"
10-
WHERE crawl = 'CC-MAIN-2017-47'
10+
WHERE crawl = 'CC-MAIN-2018-05'
1111
AND subset = 'warc')
1212
SELECT COUNT(*) AS n_pages,
1313
COUNT(DISTINCT host_name) AS uniq_hosts,

src/sql/examples/similar-domains.sql renamed to src/sql/examples/cc-index/similar-domains.sql

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ SELECT COUNT(*) AS n_pages,
1010
url_host_2nd_last_part AS host_2nd_last_part,
1111
url_host_registered_domain AS domain
1212
FROM "ccindex"."ccindex"
13-
WHERE crawl = 'CC-MAIN-2017-47'
13+
WHERE crawl = 'CC-MAIN-2018-05'
1414
AND subset = 'warc'
1515
AND (levenshtein_distance('wikipedia', url_host_2nd_last_part) <= 2)
1616
GROUP BY url_host_tld, url_host_2nd_last_part, url_host_registered_domain

src/sql/examples/count-fetch-status.sql

-9
This file was deleted.

0 commit comments

Comments
 (0)