
Commit 153777e

Merge pull request #216 from allenai/amanr/issues
Fixed issues and improved documentation in getting-started.md
2 parents 4615d34 + d95be21 commit 153777e

File tree

3 files changed: +60 -95 lines changed


docs/examples/wikipedia-mixer.json

+36
@@ -0,0 +1,36 @@
+{
+    "streams": [
+        {
+            "name": "getting-started",
+            "documents": [
+                "wikipedia/v0/documents/*.gz"
+            ],
+            "output": {
+                "path": "wikipedia/example0/documents",
+                "max_size_in_bytes": 1000000000
+            },
+            "attributes": [
+                "exp",
+                "bff_duplicate_paragraph_spans"
+            ],
+            "filter": {
+                "include": [
+                    "$.attributes[?(@.exp__whitespace_tokenizer_with_paragraphs_v1__document[0][2] < 100000)]"
+                ],
+                "exclude": [
+                    "$.attributes[?(@.exp__whitespace_tokenizer_with_paragraphs_v1__document[0][2] < 50)]",
+                    "$.attributes[?(@.exp__ft_lang_id_en_paragraph_with_doc_score_v2__doc_en[0][2] <= 0.5)]",
"[email protected][?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]"
24+
]
25+
},
26+
"span_replacement": [
27+
{
28+
"span": "$.attributes.exp__cld2_en_paragraph_with_doc_score_v2__not_en",
29+
"min_score": 0.1,
30+
"replacement": ""
31+
}
32+
]
33+
}
34+
],
35+
"processes": 1
36+
}
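
A note on reading the filter expressions above: each attribute is stored as a list of `[start, end, score]` spans, so the `[0][2]` index picks out the score of the first span. Below is a minimal, pure-Python paraphrase of these rules, included only as a reading aid; it is not the mixer's evaluation code, and the sample record is made up:

```python
# Illustration only: a hand-written attribute record shaped like the
# [[start, end, score], ...] spans that the JSONPath filters index into.
attributes = {
    "exp__whitespace_tokenizer_with_paragraphs_v1__document": [[0, 5421, 812.0]],
    "exp__ft_lang_id_en_paragraph_with_doc_score_v2__doc_en": [[0, 5421, 0.93]],
    "bff_duplicate_paragraph_spans": [],
}

doc_words = attributes["exp__whitespace_tokenizer_with_paragraphs_v1__document"][0][2]
doc_en_score = attributes["exp__ft_lang_id_en_paragraph_with_doc_score_v2__doc_en"][0][2]
dup_spans = attributes["bff_duplicate_paragraph_spans"]

# Mirror the include/exclude rules from the config above.
keep = (
    doc_words < 100000                               # include: under 100,000 words
    and not doc_words < 50                           # exclude: shorter than 50 words
    and not doc_en_score <= 0.5                      # exclude: English score at or below 0.5
    and not (dup_spans and dup_spans[0][2] >= 1.0)   # exclude: contains a duplicate paragraph
)
print("document kept:", keep)
```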

docs/getting-started.md

+15 -67
@@ -42,8 +42,11 @@ In this tutorial, we will show how to use the `tag`, `dedupe`, and `mix` command
 Run all following commands from root of this repository.
 
 ### Step 0: Obtain Wikipedia
-
-We use [this script](https://github.com/allenai/dolma/blob/main/scripts/make_wikipedia.py) to download and process Wikipedia:
+First, install the required dependencies:
+```bash
+pip install git+https://github.com/santhoshtr/wikiextractor.git requests smart_open tqdm
+```
+Next, use [this script](https://github.com/allenai/dolma/blob/main/scripts/make_wikipedia.py) to download and process Wikipedia:
 
 ```shell
 python scripts/make_wikipedia.py \
@@ -53,9 +56,9 @@ python scripts/make_wikipedia.py \
   --processes 16
 ```
 
-The code above will download and process Wikipedia articles in the `simple` language from the October 1, 2023 wikipedia dump.
-After running it, you will have a directory called `wikipedia/v0` with Wikipedia articles in it.
-Wikipedia articles are going to be grouped in compressed JSONL files in dolma
+This script will download and process Wikipedia articles in the `simple` language from the October 1, 2023 Wikipedia dump. After running it, you will find the articles stored in a directory named `wikipedia/v0`. The articles will be grouped into compressed JSONL files suitable for dolma.
+
+Note: Update the `--date 20231001` argument by selecting a specific dump date from the Wikimedia dump website. Make sure to use the date format `YYYYMMDD`.
 
 ### Step 1: Run Taggers
 
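As an optional check that Step 0 produced what the later steps expect, a short sketch like the one below can peek at the first record of one shard. It assumes the tutorial's output prefix `wikipedia/v0` and uses `smart_open` (installed in the dependency step above); it is a convenience check, not part of the documented workflow:

```python
import json
from glob import glob

import smart_open  # handles the .gz compression transparently

# Shards written by scripts/make_wikipedia.py under the tutorial's output prefix.
shards = sorted(glob("wikipedia/v0/documents/*.gz"))
print(f"found {len(shards)} shard(s)")

# Read the first JSON line of the first shard and report its fields,
# without assuming a particular schema.
with smart_open.open(shards[0]) as f:
    first = json.loads(next(f))

print("fields:", sorted(first))
print("text preview:", first.get("text", "")[:200].replace("\n", " "))
```
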
@@ -105,74 +108,19 @@ The above command will create an attribute directory called `bff_duplicate_parag
 
 ### Step 3: Run Mixer
 
-After running the taggers and and marking which paragraphs are duplicates, we can run the mixer to create a dataset with a subset of the languages and documents.
+After running the taggers and marking which paragraphs are duplicates, we can run the mixer to create a dataset with a subset of the languages and documents.
 
-For this step, we will pass a configuration file to the mix command instead of passing all the options on the command line. CLI invocation looks like this:
+For this step, we will pass a configuration file to the `mix` command instead of passing all the options on the command line. The CLI invocation looks like this:
 
 ```shell
-dolma -c wikipedia-mixer.json mix --processes 16
+dolma -c examples/wikipedia-mixer.json mix --processes 16
 ```
 
-Note how the configuration in this case is a JSON file; a YAML file would also work.
-Further, we override the number of processes to use to 96 using the `--processes` flag.
-
-`wikipedia-mixer.json` looks like the following (A YAML-equivalent version is available at [`wikipedia-mixer.yaml`](examples/wikipedia-mixer.yaml)):
-
-
-```yaml
-{
-    # mix command operates on one or more stream; each can correspond to a different data source
-    # and can have its own set of filters and transformations
-    "streams": [
-        {
-            # name of the stream; this will be used as a prefix for the output files
-            "name": "getting-started",
-            # the documents to mix; note how we use a glob pattern to match all documents
-            "documents": [
-                "wikipedia/v0/documents/*.gz",
-            ],
-            # this is the directory where the output will be written
-            # note how the toolkit will try to create files of size ~1GB
-            "output": {
-                "path": "wikipedia/example0/documents",
-                "max_size_in_bytes": 1000000000
-            },
-            "attributes": [
-                "exp", # load the attributes from the taggers
-                "bff_duplicate_paragraph_spans" # load the attributes from the deduper
-            ],
-            # filers remove or include whole documents based on the value of their attributes
-            "filter": {
-                "include": [
-                    # Include all documents with length less than 100,000 whitespace-separated words
-                    "$.attributes[?(@.exp__whitespace_tokenizer_with_paragraphs_v1__document[0][2] < 100000)]"
-                ],
-                "exclude": [
-                    # Remove any document that is shorter than 50 words
-                    "$.attributes[?(@.exp__whitespace_tokenizer_with_paragraphs_v1__document[0][2] < 50)]",
-                    # Remove any document whose total English fasttext score is below 0.5
-                    "$.attributes[?(@.exp__ft_lang_id_en_paragraph_with_doc_score_v2__doc_en[0][2] <= 0.5)]",
-                    # Remove all documents that contain a duplicate paragraph
-                    "$.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]"
-                ]
-            },
-            # span replacement allows you to replace spans of text with a different string
-            "span_replacement": [
-                {
-                    # remove paragraphs whose not-English cld2 socre is below 0.9 in a document
-                    "span": "$.attributes.exp__cld2_en_paragraph_with_doc_score_v2__not_en",
-                    "min_score": 0.1,
-                    "replacement": ""
-                }
-            ]
-        }
-    ],
-    # this process option is overridden by the command line flag
-    "processes": 1
-}
-```
+In this case, the configuration is provided via a JSON file, though a YAML file would also work. Additionally, we override the number of processes to 16 using the `--processes` flag.
+
+You can find the configuration file [`wikipedia-mixer.json`](examples/wikipedia-mixer.json) in the examples repository, along with its YAML-equivalent version at [`wikipedia-mixer.yaml`](examples/wikipedia-mixer.yaml).
 
-The above configuration will create a directory called `wikipedia/example0/documents` with a set of files that contain the documents that pass the filters.
+The configuration will create a directory named `wikipedia/example0/documents` with a set of files containing the documents that pass the filters.
 
 ### Step 4: Tokenize The Dataset
 
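Before launching the `mix` command, it can be useful to sanity-check the configuration from Python. The sketch below is an optional, standard-library-only aid (the path is the file added in this commit), not a dolma API call:

```python
import json
from glob import glob

# Load the mixer configuration added in this commit.
with open("docs/examples/wikipedia-mixer.json") as f:
    config = json.load(f)

for stream in config["streams"]:
    print(f"stream: {stream['name']}")
    # Expand each documents glob to see how many input shards would be mixed.
    for pattern in stream["documents"]:
        print(f"  {pattern!r} matches {len(glob(pattern))} file(s)")
    print(f"  output -> {stream['output']['path']}")
    print(f"  attributes: {', '.join(stream['attributes'])}")
    flt = stream.get("filter", {})
    print(f"  filters: {len(flt.get('include', []))} include, {len(flt.get('exclude', []))} exclude")
```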
scripts/make_wikipedia.py

+9 -28
@@ -19,38 +19,19 @@
 from tempfile import TemporaryDirectory
 from typing import Any, Dict, List, Union
 
-import requests
-import smart_open
-from dolma.core.parallel import BaseParallelProcessor, QueueType
-from uniseg.wordbreak import words as uniseg_get_words
-
-CMD_INSTALL = "pip install git+https://github.com/santhoshtr/wikiextractor.git requests smart_open tqdm"
-
 try:
+    import requests
+    import smart_open
+    import tqdm
+    from dolma.core.parallel import BaseParallelProcessor, QueueType
+    from uniseg.wordbreak import words as uniseg_get_words
     from wikiextractor import WikiExtractor
-except ImportError:
-    print(f"Please install wikiextractor with `{CMD_INSTALL}`")
-    sys.exit(1)
-
-try:
-    import requests  # noqa
-except ImportError:
-    print(f"Please install requests with `{CMD_INSTALL}`")
-    sys.exit(1)
-
-try:
-    import smart_open  # noqa
-except ImportError:
-    print(f"Please install smart_open with `{CMD_INSTALL}`")
-    sys.exit(1)
-
-try:
-    import tqdm  # noqa
-except ImportError:
-    print(f"Please install tqdm with `{CMD_INSTALL}`")
+except ImportError as e:
+    CMD_INSTALL = "pip install git+https://github.com/santhoshtr/wikiextractor.git requests smart_open tqdm"
+    missing_module = e.name
+    print(f"Please install {missing_module} with `{CMD_INSTALL}`")
     sys.exit(1)
 
-
 DUMP_URL = "https://dumps.wikimedia.org/{lang}wiki/{date}/{lang}wiki-{date}-pages-articles-multistream.xml.bz2"
 LOGGER = logging.getLogger(__name__)
 LOGGER.setLevel(logging.INFO)