
Commit 2a71fda

Authored Jan 7, 2023
Add Codespell to CI, fix typos (#543)
* Add Codespell to CI, fix typos
1 parent e18868f · commit 2a71fda

18 files changed, +29 −19 lines changed
 

‎.github/workflows/check-style.yml

+8

@@ -24,3 +24,11 @@ jobs:
       - uses: isort/isort-action@master
         with:
           isortVersion: "5.10.1"
+
+  codespell:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: codespell-project/actions-codespell@v1
+        with:
+          only_warn: 1
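Because the job above only warns (`only_warn: 1`) instead of failing the build, contributors may still want to run the same check locally. Below is a minimal sketch of such a helper; the script itself is hypothetical, but the `codespell --skip=".git"` invocation matches the command suggested in CONTRIBUTING.md further down.

```python
# check_spelling.py -- hypothetical local helper mirroring the new CI job.
# Assumes the codespell package is installed (it is pinned in requirements-dev.txt below).
import subprocess
import sys

# Run codespell over the working tree, skipping the .git directory,
# i.e. the same --skip value suggested in CONTRIBUTING.md.
result = subprocess.run(["codespell", "--skip=.git"])
sys.exit(result.returncode)
```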

‎CONTRIBUTING.md

+2 −1

@@ -38,7 +38,8 @@ with the following rules:
 cannot be longer than 119 characters.
 * We use [black](https://github.com/psf/black) for code formatting and [isort](https://github.com/PyCQA/isort) for
 import sorting. Before submitting a PR, make sure to install and run `black .` and `isort .` in the root of the
-repository.
+repository. Also, you may want to check your code for typos by running `codespell --skip=".git"`, though there
+might be false positives.
 * We highly encourage the use of [typing](https://docs.python.org/3/library/typing.html) where applicable.
 * Use `get_logger` from `hivemind.utils.logging` to log any information instead of `print`ing directly to standard
 output/error streams.

‎README.md

+1 −1

@@ -29,7 +29,7 @@ see the [full list](#citation) of our papers below.
 ## Example Use Cases

 This section lists projects that leverage hivemind for decentralized training.
-If you have succesfully trained a model or created a downstream repository with the help of our library,
+If you have successfully trained a model or created a downstream repository with the help of our library,
 feel free to submit a pull request that adds your project to this list.

 * **Petals** ([webpage](https://petals.ml), [code](https://github.com/bigscience-workshop/petals)) — a decentralized platform for inference and fine-tuning of 100B+ language models.

‎benchmarks/benchmark_dht.py

+1 −1

@@ -51,7 +51,7 @@ async def store_and_get_task(
 latest: bool,
 node_killer: NodeKiller,
 ) -> Tuple[list, list, list, list, int, int]:
-"""Iteratively choose random peers to store data onto the dht, then retreive with another random subset of peers"""
+"""Iteratively choose random peers to store data onto the dht, then retrieve with another random subset of peers"""

 total_stores = total_gets = 0
 successful_stores = []

‎docs/modules/optim.rst

+1 −1

@@ -5,7 +5,7 @@

 This module contains decentralized optimizers that wrap your regular PyTorch Optimizer to train with peers.
 Depending on the exact configuration, Optimizer may perform large synchronous updates equivalent,
-or perform asynchrnous local updates and average model parameters.
+or perform asynchronous local updates and average model parameters.

 <br><br>

‎docs/user/dht.md

+1 −1

@@ -119,7 +119,7 @@ dht = hivemind.DHT(
 ], start=True)
 ```

-Thats it, now the two DHT nodes are connected. If you connect additional peers to the network, you only need to specify
+That's it, now the two DHT nodes are connected. If you connect additional peers to the network, you only need to specify
 one (or a subset) of peers as `initial_peers`.
 In case your peer operates behind a restrictive firewall, you may find it beneficial to set `client_mode=True`. In this
 case, the DHT instance will access others, but it will not announce that other peers can connect to it.
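To make the `client_mode=True` remark above concrete, here is a minimal sketch of a firewalled peer joining an existing swarm; the multiaddress is a placeholder, and only the keyword arguments already named on this page (`initial_peers`, `client_mode`, `start`) are used.

```python
import hivemind

# A peer behind a restrictive firewall: it can reach other peers through the DHT,
# but does not announce itself as reachable for inbound connections.
dht = hivemind.DHT(
    initial_peers=["/ip4/203.0.113.7/tcp/31337/p2p/QmPlaceholderPeerID"],  # placeholder address of an existing peer
    client_mode=True,
    start=True,
)
```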

‎hivemind/averaging/averager.py

+3 −3

@@ -62,7 +62,7 @@ class DecentralizedAverager(mp.Process, ServicerBase):
 :param min_matchmaking_time: when looking for group, wait for requests for at least this many seconds
 :param compression: optionally compress tensors with this compression algorithm before running all-reduce
 :param state_compression: a separate compression strategy for load_state_from_peers (default = no compression)
-:param tensor_infos: CompressionInfo for each respective tensor; this determines how the tensor will be comressed
+:param tensor_infos: CompressionInfo for each respective tensor; this determines how the tensor will be compressed
 :param averaging_alpha: optional "learning rate" for averaging. If specified, local parameters will be shifted
 towards the (estimated) average by this coefficient. By default, local parameters are set equal to average.
 :param request_timeout: when looking for group, wait for a response from leader for at most this many seconds.

@@ -376,7 +376,7 @@ def step(
 """
 Set up the averager to look for a group and run one round of averaging, return True on success, False on failure

-:param gather: optionally send this informaton to all peers in the next group and gather it from every groupmate
+:param gather: optionally send this information to all peers in the next group and gather it from every groupmate
 (this operation is known as all-gather). The gathered data will be available as the output of this function.
 :param scheduled_time: when matchmaking, assume that all-reduce will begin at this moment.
 By default, schedule all-reduce current time plus min_matchmaking_time seconds

@@ -651,7 +651,7 @@ async def rpc_download_state(

 def get_current_state(self) -> Tuple[Any, Sequence[torch.Tensor], Sequence[CompressionInfo]]:
 """
-Get current state and send it to a peer. executed in the host process. Meant to be overriden.
+Get current state and send it to a peer. executed in the host process. Meant to be overridden.
 :returns: a tuple of (small metadata, sequence of torch tensors)
 :note: metadata must be seriablizable with self.serializer (default = MSGPackSerializer)
 """

‎hivemind/averaging/partition.py

+1 −1

@@ -26,7 +26,7 @@ class TensorPartContainer:
 :param peer_fractions: for each peer, a target fraction of vector elements that this peer should average
 :param compression: optionally compress tensors with this compression algorithm before sending them to peers
 :param part_size_bytes: greedily split tensors into parts of up to this many bytes (after compression)
-:param tensor_infos: CompressionInfo for each respective tensor; this determines how the tensor will be comressed
+:param tensor_infos: CompressionInfo for each respective tensor; this determines how the tensor will be compressed
 :param return_deltas: if True, output tensors are differences (aggregated tensor - local tensor)
 :param prefetch: when compressing, pre-compute this many compressed tensors in background
 """

‎hivemind/compression/base.py

+1 −1

@@ -53,7 +53,7 @@ def compress(self, tensor: torch.Tensor, info: CompressionInfo, allow_inplace: b
 """
 Applies compression algorithm to a tensor based on their meta-parameters

-:param tensor: a pytorch tensor to compress; depending on the applicaiton, it is a full tensor or a part
+:param tensor: a pytorch tensor to compress; depending on the application, it is a full tensor or a part
 :param info: meta-information about the tensor; if partitioning is used, this still describes the full tensor
 :param allow_inplace: if True, compression can (but doesn't have to) to modify tensor in-place for efficiency
 :returns: a protobuf message that encodes the tensor

‎hivemind/dht/node.py

+1 −1

@@ -586,7 +586,7 @@ async def get_many_by_id(
 If min_expiration_time=float('inf'), this method will find a value with _latest_ expiration
 :param beam_size: maintains up to this many nearest nodes when crawling dht, default beam_size = bucket_size
 :param num_workers: override for default num_workers, see traverse_dht num_workers param
-:param return_futures: if True, immediately return asyncio.Future for every before interacting with the nework.
+:param return_futures: if True, immediately return asyncio.Future for every before interacting with the network.
 The algorithm will populate these futures with (value, expiration) when it finds the corresponding key
 Note: canceling a future will stop search for the corresponding key
 :param _is_refresh: internal flag, set to True by an internal cache refresher (if enabled)

‎hivemind/dht/routing.py

+1 −1

@@ -1,4 +1,4 @@
-""" Utlity data structures to represent DHT nodes (peers), data keys, and routing tables. """
+""" Utility data structures to represent DHT nodes (peers), data keys, and routing tables. """
 from __future__ import annotations

 import hashlib

‎hivemind/moe/server/server.py

+1 −1

@@ -302,7 +302,7 @@ def shutdown(self):
 logger.debug(f"Shutting down runtime")
 self.runtime.shutdown()

-logger.info("Server shutdown succesfully")
+logger.info("Server shutdown successfully")


 @contextmanager

‎hivemind/optim/grad_averager.py

+1 −1

@@ -29,7 +29,7 @@ class GradientAverager(DecentralizedAverager):
 (3) averaged gradients - gradient buffers that are aggregated in-place with peers, always in host memory

 :param parameters: pytorch parameters for which to aggregate gradients
-:param dht: a DHT isntance connected to the rest of the swarm. See hivemind.DHT docs
+:param dht: a DHT instance connected to the rest of the swarm. See hivemind.DHT docs
 :param prefix: a unique DHT key used for matchmaking. E.g. this can be your experiment name with optional suffixes
 :param reuse_grad_buffers: if True, use model's .grad buffers for accumulating gradients over multiple steps.
 This is more memory efficient, but it requires that the user does *not* call zero_grad or clip_by_whatever at all

‎hivemind/optim/optimizer.py

+2 −2

@@ -56,11 +56,11 @@ class Optimizer(torch.optim.Optimizer):

 Unlike regular training, your device may join midway through training, when other peers already made some progress.
 For this reason, any learning rate schedulers, curriculum and other **time-dependent features should be based on**
-``optimizer.local_epoch`` (and not the number ot calls to opt.step). Otherwise, peers that joined training late
+``optimizer.local_epoch`` (and not the number of calls to opt.step). Otherwise, peers that joined training late
 may end up having different learning rates. To do so automatically, specify ``scheduler=...`` parameter below.

 :What is an epoch?: Optimizer uses the term ``epoch`` to describe intervals between synchronizations. One epoch
-coresponds to processing certain number of training samples (``target_batch_size``) in total across all peers.
+corresponds to processing certain number of training samples (``target_batch_size``) in total across all peers.
 Like in PyTorch LR Scheduler, **epoch does not necessarily correspond to a full pass over the training data.**
 At the end of epoch, peers perform synchronous actions such as averaging gradients for a global optimizer update,
 updating the learning rate scheduler or simply averaging parameters (if using local updates).
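To make the `optimizer.local_epoch` advice above concrete, below is a minimal sketch of a learning rate schedule keyed on epochs rather than on the number of `opt.step` calls. Whether hivemind.Optimizer's `scheduler=...` parameter accepts exactly this factory form is an assumption here; the sketch only illustrates epoch-based decay with plain PyTorch.

```python
import torch

def make_epoch_scheduler(opt: torch.optim.Optimizer) -> torch.optim.lr_scheduler.LambdaLR:
    # Halve the learning rate every 10 epochs (global training progress),
    # not every opt.step() call, so late-joining peers end up on the same schedule.
    return torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda=lambda epoch: 0.5 ** (epoch // 10))

# Standalone usage with a plain torch optimizer, just to show the factory shape:
params = [torch.nn.Parameter(torch.zeros(4))]
scheduler = make_epoch_scheduler(torch.optim.SGD(params, lr=0.1))
```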

‎hivemind/optim/power_sgd_averager.py

+1 −1

@@ -51,7 +51,7 @@ class PowerSGDGradientAverager(GradientAverager):

 :param parameters: pytorch parameters for which to aggregate gradients
 :param averager_rank: rank of compressed gradients
-:param dht: a DHT isntance connected to the rest of the swarm. See hivemind.DHT docs
+:param dht: a DHT instance connected to the rest of the swarm. See hivemind.DHT docs
 :param prefix: a unique DHT key used for matchmaking. E.g. this can be your experiment name with optional suffixes
 :param reuse_grad_buffers: if True, use model's .grad buffers for accumulating gradients over multiple steps.
 This is more memory efficient, but it requires that the user does *not* call zero_grad or clip_by_whatever at all

‎hivemind/utils/math.py

+1 −1

@@ -15,7 +15,7 @@ def orthogonalize_(matrix, eps: float = 1e-8):


 def get_flatten_greedy_dims(tensor: torch.Tensor, max_ndim: int = 2):
-"""get dims to flatten tensor upto max_ndim dimensions by merging small axes together"""
+"""get dims to flatten tensor up to max_ndim dimensions by merging small axes together"""
 dims = list(tensor.shape)
 while len(dims) > max_ndim:
 squeeze_ix = min(range(len(dims) - 1), key=lambda i: dims[i] * dims[i + 1])
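For context, the docstring fixed above belongs to a greedy axis-merging helper. The sketch below reconstructs that behavior from the visible diff lines; the merge step that follows the choice of `squeeze_ix` is not part of the hunk, so it is an assumption.

```python
import torch

def flatten_greedy_dims_sketch(tensor: torch.Tensor, max_ndim: int = 2) -> list:
    """Merge the adjacent pair of axes with the smallest product until at most max_ndim dims remain."""
    dims = list(tensor.shape)
    while len(dims) > max_ndim:
        # pick the adjacent pair whose merged size is smallest (same criterion as in the diff above)
        squeeze_ix = min(range(len(dims) - 1), key=lambda i: dims[i] * dims[i + 1])
        # assumed continuation: merge that pair into a single axis
        dims[squeeze_ix : squeeze_ix + 2] = [dims[squeeze_ix] * dims[squeeze_ix + 1]]
    return dims

print(flatten_greedy_dims_sketch(torch.zeros(4, 3, 2, 5)))  # [24, 5]: (3, 2) merges first, then (4, 6)
```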

‎requirements-dev.txt

+1

@@ -8,4 +8,5 @@ scikit-learn
 torchvision
 black==22.3.0
 isort==5.10.1
+codespell==2.2.2
 psutil

‎tests/test_averaging.py

+1 −1

@@ -356,7 +356,7 @@ def test_load_state_from_peers():
 class TestAverager(DecentralizedAverager):
 def get_current_state(self):
 """
-Get current state and send it to a peer. executed in the host process. Meant to be overriden.
+Get current state and send it to a peer. executed in the host process. Meant to be overridden.
 :returns: a tuple of (serializable_small_metadata, sequence of torch tensors)
 """
 nonlocal num_calls, super_metadata, super_tensors
