Skip to content

Commit

Permalink
fix codespell
Browse files Browse the repository at this point in the history
  • Loading branch information
Mistobaan committed May 3, 2022
1 parent 121b709 commit d95a433
Show file tree
Hide file tree
Showing 14 changed files with 39 additions and 36 deletions.
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ repos:
rev: v2.1.0
hooks:
- id: codespell
args: [
"--ignore-words-list=reord", # Word used in error messages that need rewording
--check-filenames,
--check-hidden,
]
exclude: >
(?x)^(
.*\.json|ignore.txt
)$
args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
3 changes: 3 additions & 0 deletions ignore.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ROUGE
rouge
nin
16 changes: 8 additions & 8 deletions lm_eval/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def loglikelihood_rolling(self, requests):
- We will use the full max context length of the model.
- For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
the max context length.
- IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementaitons
- IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementations
which may simply concatenate multiple documents together.
- IMPORTANT: We maximize the amount of context for each prediction. Specifically, for inputs that we break into
          multiple chunks, the last input will still be a full-sized context.
Expand Down Expand Up @@ -234,9 +234,9 @@ def _collate(x):
return -len(toks), tuple(toks)

# TODO: automatic (variable) batch size detection for vectorization
reord = utils.Reorderer(requests, _collate)
re_ord = utils.Reorderer(requests, _collate)
for chunk in utils.chunks(
tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size
tqdm(re_ord.get_reordered(), disable=disable_tqdm), self.batch_size
):
inps = []
cont_toks_list = []
Expand Down Expand Up @@ -327,10 +327,10 @@ def _collate(x):

res.append(answer)

return reord.get_original(res)
return re_ord.get_original(res)

def greedy_until(self, requests):
# TODO: implement fully general `until` that handles untils that are
        # TODO: implement fully general `until` that handles `until` values that are
# multiple tokens or that span multiple tokens correctly

# TODO: extract to TokenizedLM?
Expand All @@ -340,9 +340,9 @@ def _collate(x):
toks = self.tok_encode(x[0])
return len(toks), x[0]

reord = utils.Reorderer(requests, _collate)
re_ord = utils.Reorderer(requests, _collate)

for context, until in tqdm(reord.get_reordered()):
for context, until in tqdm(re_ord.get_reordered()):
if isinstance(until, str):
until = [until]

Expand All @@ -366,7 +366,7 @@ def _collate(x):

res.append(s)

return reord.get_original(res)
return re_ord.get_original(res)


class Task(abc.ABC):
Expand Down
2 changes: 1 addition & 1 deletion lm_eval/datasets/drop/drop.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Custom DROP dataet that, unlike HF, keeps all question-answer pairs
# Custom DROP dataset that, unlike HF, keeps all question-answer pairs
# even if there are multiple types of answers for the same question.
"""DROP dataset."""

Expand Down
2 changes: 1 addition & 1 deletion lm_eval/datasets/sat_analogies/sat_analogies.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def manual_download_instructions(self):
return (
"To use SAT Analogy Questions you have to download it manually. Please "
"email Peter Turney to request the data (https://www.apperceptual.com). "
"Once you recieve a download link for the dataset, supply the local path "
"Once you receive a download link for the dataset, supply the local path "
"as the `data_dir` arg: "
"`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`"
)
Expand Down
4 changes: 2 additions & 2 deletions lm_eval/decontamination/janitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def register_contaminant(self, dirt_string):

def clean(self, dirty_string):
"""Clean a string (e.g. a training set) by removing all ngrams previously
reigstered as contaminants. Returns a list of clean chunks, or empty if
registered as contaminants. Returns a list of clean chunks, or empty if
the string was too dirty"""
if JANITOR_CPP:
return self.clean_cpp(dirty_string)
Expand Down Expand Up @@ -275,7 +275,7 @@ def clean_python(self, dirty_string):
# ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first,
# oil money had a marginal impact. A few lowrise concete buildings were erected, and the first
# paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
# would last, took a cautious approach, prefering to save the revenue rather than investing it in
# would last, took a cautious approach, preferring to save the revenue rather than investing it in
# development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
# to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his
# brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]],
Expand Down
2 changes: 1 addition & 1 deletion lm_eval/models/gpt2.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def __init__(
self._device = torch.device(device)
print(f"Using device '{device}'")
else:
print("Device not specificed")
print("Device not specified")
print(f"Cuda Available? {torch.cuda.is_available()}")
self._device = (
torch.device("cuda")
Expand Down
12 changes: 6 additions & 6 deletions lm_eval/models/gpt3.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,10 @@ def _collate(x):
toks = x[1] + x[2]
return -len(toks), tuple(toks)

reord = utils.Reorderer(requests, _collate)
re_ord = utils.Reorderer(requests, _collate)

for chunk in tqdm(
list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE)),
list(utils.chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE)),
disable=disable_tqdm,
):
inps = []
Expand Down Expand Up @@ -163,7 +163,7 @@ def _collate(x):
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)

return reord.get_original(res)
return re_ord.get_original(res)

def greedy_until(self, requests):
if not requests:
Expand All @@ -174,7 +174,7 @@ def _collate(x):
toks = self.tok_encode(x[0])
return len(toks), x[0]

reord = utils.Reorderer(requests, _collate)
re_ord = utils.Reorderer(requests, _collate)

def sameuntil_chunks(xs, size):
ret = []
Expand All @@ -191,7 +191,7 @@ def sameuntil_chunks(xs, size):

# todo: more intelligent batching for heterogeneous `until`
for chunk, until in tqdm(
list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))
list(sameuntil_chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE))
):
inps = []
for context, _ in chunk:
Expand Down Expand Up @@ -219,7 +219,7 @@ def sameuntil_chunks(xs, size):

res.append(s)

return reord.get_original(res)
return re_ord.get_original(res)

def _model_call(self, inps):
# Isn't used because we override _loglikelihood_tokens
Expand Down
6 changes: 3 additions & 3 deletions lm_eval/tasks/drop.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,16 +74,16 @@ def _flatten_validated_answers(validated_answers):
{"number": ['1', '8'], ...}
-> [{"number": ['1'], ...}, {"number": ['8'], ...}]
"""
vas = []
valid_answers = []
for i in range(len(validated_answers["number"])):
vas.append(
valid_answers.append(
{
"number": validated_answers["number"][i],
"date": validated_answers["date"][i],
"spans": validated_answers["spans"][i],
}
)
return vas
return valid_answers

answers = []
answers_set = set()
Expand Down
4 changes: 2 additions & 2 deletions lm_eval/tasks/hendrycks_ethics.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
learning agents.
NOTE: The reported "group" accuracies for the Deontology, Justice, and Virtue
tasks are refered to in this work as the `em` sub-metric. See Section 3. Metrics.
tasks are referred to in this work as the `em` sub-metric. See Section 3. Metrics.
of the paper.
Homepage: https://github.com/hendrycks/ethics
Expand Down Expand Up @@ -323,7 +323,7 @@ def _process_doc(self, doc):
}

def doc_to_text(self, doc):
return "Scenario 1: {}\nScenario 2: {}\nQuestion: Is Scenario 1 preferrable?\nAnswer:".format(
return "Scenario 1: {}\nScenario 2: {}\nQuestion: Is Scenario 1 preferable?\nAnswer:".format(
doc["scenarios"][0], doc["scenarios"][1]
)

Expand Down
4 changes: 2 additions & 2 deletions scripts/clean_training_data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1
1) Collects all contamination text files that are to be removed from training data
2) Filters training data by finding `N`gram matches between the training data
and any contamination
1) `N`grams ignore case and punctation and are split on whitespace.
1) `N`grams ignore case and punctuation and are split on whitespace.
2) Matching `N`gram substrings are removed, as is a `window_to_remove` character window around
the match, splitting the training data into chunks
3) Any chunks less than `minimum_slice_length` are removed
Expand All @@ -20,7 +20,7 @@ minimum_slice_length = 200
too_dirty_cutoff = 10
```

## Compling
## Compiling

Janitor can be used as a pure python program, but it is much faster if the ngram
code is run in C++. To compile the C++ code, run
Expand Down
2 changes: 1 addition & 1 deletion scripts/clean_training_data/process_sorted_buckets.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def process_bucket(
for line in bucket.read():
[ngram, document_id] = line.rsplit(" ", 1)

# Write ngram if more then 10 unique document occurences
            # Write ngram if more than 10 unique document occurrences
if ngram != current_ngram:
if len(current_ngram_document_ids) > 10:
output_archive.add_data(
Expand Down
2 changes: 1 addition & 1 deletion templates/new_multiple_choice_task.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# TODO: Remove all TODO comments once the implementation is complete.
"""
TODO: Add the Paper Title on this line.
TODO: Add the paper's PDF URL (preferrably from arXiv) on this line.
TODO: Add the paper's PDF URL (preferably from arXiv) on this line.
TODO: Write a Short Description of the task.
Expand Down
6 changes: 3 additions & 3 deletions templates/new_task.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# TODO: Remove all TODO comments once the implementation is complete.
"""
TODO: Add the Paper Title on this line.
TODO: Add the paper's PDF URL (preferrably from arXiv) on this line.
TODO: Add the paper's PDF URL (preferably from arXiv) on this line.
TODO: Write a Short Description of the task.
Expand Down Expand Up @@ -45,7 +45,7 @@ def training_docs(self):
if self._training_docs is None:
# TODO: Return the training document generator from `self.dataset`.
# If you need to process the data, `map` over the documents with
# the custom procesing function, `self._process_doc`. E.g.
# the custom processing function, `self._process_doc`. E.g.
# `map(self._process_doc, self.dataset["validation"])`
            # In most cases you can leave this as is unless the dataset split is
# named differently than the default `"train"`.
Expand All @@ -56,7 +56,7 @@ def validation_docs(self):
if self.has_validation_docs():
# TODO: Return the validation document generator from `self.dataset`.
# If you need to process the data, `map` over the documents with the
# custom procesing function, `self._process_doc`. E.g.
# custom processing function, `self._process_doc`. E.g.
# `map(self._process_doc, self.dataset["validation"])`
            # In most cases you can leave this as is unless the dataset split is
# named differently than the default `"validation"`.
Expand Down

0 comments on commit d95a433

Please sign in to comment.