Skip to content

Commit

Permalink
fix codespell
Browse files Browse the repository at this point in the history
  • Loading branch information
Mistobaan committed May 3, 2022
1 parent 121b709 commit d95a433
Show file tree
Hide file tree
Showing 14 changed files with 39 additions and 36 deletions.
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ repos:
rev: v2.1.0
hooks:
- id: codespell
args: [
"--ignore-words-list=reord", # Word used in error messages that need rewording
--check-filenames,
--check-hidden,
]
exclude: >
(?x)^(
.*\.json|ignore.txt
)$
args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
3 changes: 3 additions & 0 deletions ignore.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ROUGE
rouge
nin
16 changes: 8 additions & 8 deletions lm_eval/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def loglikelihood_rolling(self, requests):
- We will use the full max context length of the model.
- For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
the max context length.
- IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementaitons
- IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementations
which may simply concatenate multiple documents together.
- IMPORTANT: We maximize the amount of context for each prediction. Specifically, for inputs that we break into
          multiple chunks, the last input will still be a full-sized context.
Expand Down Expand Up @@ -234,9 +234,9 @@ def _collate(x):
return -len(toks), tuple(toks)

# TODO: automatic (variable) batch size detection for vectorization
reord = utils.Reorderer(requests, _collate)
re_ord = utils.Reorderer(requests, _collate)
for chunk in utils.chunks(
tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size
tqdm(re_ord.get_reordered(), disable=disable_tqdm), self.batch_size
):
inps = []
cont_toks_list = []
Expand Down Expand Up @@ -327,10 +327,10 @@ def _collate(x):

res.append(answer)

return reord.get_original(res)
return re_ord.get_original(res)

def greedy_until(self, requests):
# TODO: implement fully general `until` that handles untils that are
        # TODO: implement fully general `until` that handles `until` values that are
# multiple tokens or that span multiple tokens correctly

# TODO: extract to TokenizedLM?
Expand All @@ -340,9 +340,9 @@ def _collate(x):
toks = self.tok_encode(x[0])
return len(toks), x[0]

reord = utils.Reorderer(requests, _collate)
re_ord = utils.Reorderer(requests, _collate)

for context, until in tqdm(reord.get_reordered()):
for context, until in tqdm(re_ord.get_reordered()):
if isinstance(until, str):
until = [until]

Expand All @@ -366,7 +366,7 @@ def _collate(x):

res.append(s)

return reord.get_original(res)
return re_ord.get_original(res)


class Task(abc.ABC):
Expand Down
2 changes: 1 addition & 1 deletion lm_eval/datasets/drop/drop.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Custom DROP dataet that, unlike HF, keeps all question-answer pairs
# Custom DROP dataset that, unlike HF, keeps all question-answer pairs
# even if there are multiple types of answers for the same question.
"""DROP dataset."""

Expand Down
2 changes: 1 addition & 1 deletion lm_eval/datasets/sat_analogies/sat_analogies.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def manual_download_instructions(self):
return (
"To use SAT Analogy Questions you have to download it manually. Please "
"email Peter Turney to request the data (https://www.apperceptual.com). "
"Once you recieve a download link for the dataset, supply the local path "
"Once you receive a download link for the dataset, supply the local path "
"as the `data_dir` arg: "
"`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`"
)
Expand Down
4 changes: 2 additions & 2 deletions lm_eval/decontamination/janitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def register_contaminant(self, dirt_string):

def clean(self, dirty_string):
"""Clean a string (e.g. a training set) by removing all ngrams previously
reigstered as contaminants. Returns a list of clean chunks, or empty if
registered as contaminants. Returns a list of clean chunks, or empty if
the string was too dirty"""
if JANITOR_CPP:
return self.clean_cpp(dirty_string)
Expand Down Expand Up @@ -275,7 +275,7 @@ def clean_python(self, dirty_string):
# ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first,
# oil money had a marginal impact. A few lowrise concete buildings were erected, and the first
# paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
# would last, took a cautious approach, prefering to save the revenue rather than investing it in
# would last, took a cautious approach, preferring to save the revenue rather than investing it in
# development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
# to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his
# brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]],
Expand Down
2 changes: 1 addition & 1 deletion lm_eval/models/gpt2.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def __init__(
self._device = torch.device(device)
print(f"Using device '{device}'")
else:
print("Device not specificed")
print("Device not specified")
print(f"Cuda Available? {torch.cuda.is_available()}")
self._device = (
torch.device("cuda")
Expand Down
12 changes: 6 additions & 6 deletions lm_eval/models/gpt3.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,10 @@ def _collate(x):
toks = x[1] + x[2]
return -len(toks), tuple(toks)

reord = utils.Reorderer(requests, _collate)
re_ord = utils.Reorderer(requests, _collate)

for chunk in tqdm(
list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE)),
list(utils.chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE)),
disable=disable_tqdm,
):
inps = []
Expand Down Expand Up @@ -163,7 +163,7 @@ def _collate(x):
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)

return reord.get_original(res)
return re_ord.get_original(res)

def greedy_until(self, requests):
if not requests:
Expand All @@ -174,7 +174,7 @@ def _collate(x):
toks = self.tok_encode(x[0])
return len(toks), x[0]

reord = utils.Reorderer(requests, _collate)
re_ord = utils.Reorderer(requests, _collate)

def sameuntil_chunks(xs, size):
ret = []
Expand All @@ -191,7 +191,7 @@ def sameuntil_chunks(xs, size):

# todo: more intelligent batching for heterogeneous `until`
for chunk, until in tqdm(
list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))
list(sameuntil_chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE))
):
inps = []
for context, _ in chunk:
Expand Down Expand Up @@ -219,7 +219,7 @@ def sameuntil_chunks(xs, size):

res.append(s)

return reord.get_original(res)
return re_ord.get_original(res)

def _model_call(self, inps):
# Isn't used because we override _loglikelihood_tokens
Expand Down
6 changes: 3 additions & 3 deletions lm_eval/tasks/drop.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,16 +74,16 @@ def _flatten_validated_answers(validated_answers):
{"number": ['1', '8'], ...}
-> [{"number": ['1'], ...}, {"number": ['8'], ...}]
"""
vas = []
valid_answers = []
for i in range(len(validated_answers["number"])):
vas.append(
valid_answers.append(
{
"number": validated_answers["number"][i],
"date": validated_answers["date"][i],
"spans": validated_answers["spans"][i],
}
)
return vas
return valid_answers

answers = []
answers_set = set()
Expand Down
4 changes: 2 additions & 2 deletions lm_eval/tasks/hendrycks_ethics.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
learning agents.
NOTE: The reported "group" accuracies for the Deontology, Justice, and Virtue
tasks are refered to in this work as the `em` sub-metric. See Section 3. Metrics.
tasks are referred to in this work as the `em` sub-metric. See Section 3. Metrics.
of the paper.
Homepage: https://github.com/hendrycks/ethics
Expand Down Expand Up @@ -323,7 +323,7 @@ def _process_doc(self, doc):
}

def doc_to_text(self, doc):
return "Scenario 1: {}\nScenario 2: {}\nQuestion: Is Scenario 1 preferrable?\nAnswer:".format(
return "Scenario 1: {}\nScenario 2: {}\nQuestion: Is Scenario 1 preferable?\nAnswer:".format(
doc["scenarios"][0], doc["scenarios"][1]
)

Expand Down
4 changes: 2 additions & 2 deletions scripts/clean_training_data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1
1) Collects all contamination text files that are to be removed from training data
2) Filters training data by finding `N`gram matches between the training data
and any contamination
1) `N`grams ignore case and punctation and are split on whitespace.
1) `N`grams ignore case and punctuation and are split on whitespace.
2) Matching `N`gram substrings are removed, as is a `window_to_remove` character window around
the match, splitting the training data into chunks
3) Any chunks less than `minimum_slice_length` are removed
Expand All @@ -20,7 +20,7 @@ minimum_slice_length = 200
too_dirty_cutoff = 10
```

## Compling
## Compiling

Janitor can be used as a pure python program, but it is much faster if the ngram
code is run in C++. To compile the C++ code, run
Expand Down
2 changes: 1 addition & 1 deletion scripts/clean_training_data/process_sorted_buckets.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def process_bucket(
for line in bucket.read():
[ngram, document_id] = line.rsplit(" ", 1)

# Write ngram if more then 10 unique document occurences
            # Write ngram if more than 10 unique document occurrences
if ngram != current_ngram:
if len(current_ngram_document_ids) > 10:
output_archive.add_data(
Expand Down
2 changes: 1 addition & 1 deletion templates/new_multiple_choice_task.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# TODO: Remove all TODO comments once the implementation is complete.
"""
TODO: Add the Paper Title on this line.
TODO: Add the paper's PDF URL (preferrably from arXiv) on this line.
TODO: Add the paper's PDF URL (preferably from arXiv) on this line.
TODO: Write a Short Description of the task.
Expand Down
6 changes: 3 additions & 3 deletions templates/new_task.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# TODO: Remove all TODO comments once the implementation is complete.
"""
TODO: Add the Paper Title on this line.
TODO: Add the paper's PDF URL (preferrably from arXiv) on this line.
TODO: Add the paper's PDF URL (preferably from arXiv) on this line.
TODO: Write a Short Description of the task.
Expand Down Expand Up @@ -45,7 +45,7 @@ def training_docs(self):
if self._training_docs is None:
# TODO: Return the training document generator from `self.dataset`.
# If you need to process the data, `map` over the documents with
# the custom procesing function, `self._process_doc`. E.g.
# the custom processing function, `self._process_doc`. E.g.
# `map(self._process_doc, self.dataset["validation"])`
            # In most cases you can leave this as is unless the dataset split is
# named differently than the default `"train"`.
Expand All @@ -56,7 +56,7 @@ def validation_docs(self):
if self.has_validation_docs():
# TODO: Return the validation document generator from `self.dataset`.
# If you need to process the data, `map` over the documents with the
# custom procesing function, `self._process_doc`. E.g.
# custom processing function, `self._process_doc`. E.g.
# `map(self._process_doc, self.dataset["validation"])`
            # In most cases you can leave this as is unless the dataset split is
# named differently than the default `"validation"`.
Expand Down

0 comments on commit d95a433

Please sign in to comment.