Cleanup stuff #55

Merged (21 commits, Jul 8, 2024)
5 changes: 3 additions & 2 deletions .gitignore
@@ -13,6 +13,7 @@ output/
data/
**/data
.vscode/
**/modular_artifacts/

# Byte-compiled / optimized / DLL files
__pycache__/
@@ -143,9 +144,9 @@ venv.bak/
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
.pyre/
**/eval/alpaca_eval/*
**/eval/alpaca_eval/**/*.csv
**/eval/alpaca_eval/**/*.json
6 changes: 6 additions & 0 deletions mttl/datamodule/mt_seq_to_seq_module.py
@@ -287,3 +287,9 @@ def setup_dataset(self):
num_proc=n_proc,
desc="Filtering phi-2 eval tasks from training mixture.",
)
if not self.train_dataset.num_rows:
logger.warning(
"No training examples left after filtering. "
"Please set `remove_phi_eval_tasks=False` "
"if you want to train on phi-2 eval tasks."
)
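# Illustrative context (not part of the diff): `datasets.Dataset.filter` can
# legitimately return an empty dataset, in which case training would later fail
# with an opaque error; hence the explicit warning above. For example:
#
#   from datasets import Dataset
#   ds = Dataset.from_dict({"task_name": ["t1", "t2"]})
#   assert ds.filter(lambda example: False).num_rows == 0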
2 changes: 1 addition & 1 deletion mttl/models/expert_config.py
@@ -63,7 +63,7 @@ def _set_defaults(self):
self.eval_mmlu_few_shot = True # use few-shot for mmlu, default
self.eval_mmlu_flag = False # eval mmlu performance during training
self.eval_rouge_flag = False # eval rouge during training
self.pipeline_eval_tasks = "all"
self.pipeline_eval_tasks = None

self.eval_metric = "loss"
self.use_vllm = False
22 changes: 12 additions & 10 deletions mttl/models/modifiers/expert_containers/expert_library.py
@@ -122,6 +122,13 @@ def wrapper(*args, **kwargs):
return decorator


def _remove_protocol(repo_id):
"""Remove the protocol from the repo_id. Ex:
az://storage_account/container -> storage_account/container
"""
return str(repo_id).split("://")[-1]
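# Illustrative usage (not part of the diff): the helper is a no-op when the
# repo id carries no protocol prefix.
#   _remove_protocol("az://storage_account/container") -> "storage_account/container"
#   _remove_protocol("plain/path")                     -> "plain/path"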


class BackendEngine(ABC):
@abstractmethod
def snapshot_download(self, repo_id, allow_patterns=None):
@@ -612,7 +619,8 @@ class RepoInfo:
def list_repo_files(self, repo_id):
import glob

return list(glob.glob(os.path.join(repo_id, "*")))
_repo_id = _remove_protocol(repo_id)
return list(glob.glob(os.path.join(_repo_id, "*")))


class ExpertLibrary:
@@ -628,7 +636,7 @@ def __init__(
):
super().__init__()

self.repo_id = self._remove_protocol(repo_id)
self.repo_id = _remove_protocol(repo_id)
self._sliced = False
self.selection = selection
self.exclude_selection = exclude_selection
@@ -662,12 +670,6 @@ def __init__(
def sliced(self):
return self._sliced and not self.ignore_sliced

def _remove_protocol(self, repo_id):
"""Remove the protocol from the repo_id. Ex:
az://storage_account/container -> storage_account/container
"""
return str(repo_id).split("://")[-1]

def _build_lib(self):
self._sliced = False
self.data = {}
@@ -1363,7 +1365,7 @@ def _get_expert_lib_class(repo_id, expert_library_type):
expert_library_type = prefix[0]
repo_id = prefix[1]
else:
expert_library_type = "local"
expert_library_type = "hf"
try:
expert_lib_class = available_libraries[expert_library_type]
except KeyError:
@@ -1686,7 +1688,7 @@ def _get_dataset_engine(dataset_id: str, token: Optional[str]) -> DatasetEngine:
if prefix[0] in engines:
engine_id = prefix[0]
dataset_id = prefix[1]
else: # Default to Hugging Face Hub to help with the transition
else:
engine_id = "hf"
try:
engine = engines[engine_id](dataset_id=dataset_id, token=token)
123 changes: 87 additions & 36 deletions projects/modular_llm/README.md
@@ -1,15 +1,14 @@
# Expert Library
# Towards Modular LLMs by Building and Reusing a Library of LoRAs

The MTTL Expert Library enables:
The code in this folder allows you to reproduce the experiments in our paper. Mainly, it contains scripts that use the MTTL library to:

1. Train different kinds of adapters over LLMs;
2. Build MoE models from these adapters, with custom routing strategies;
3. Maintain a collection of experts, which can later be used to transfer to new tasks or update existing ones.


## Setup

MTTL supports `Python >=3.8, <3.12`. Create a virtual environment using `virtualenv` or `conda`, then install the required Python packages:
Before starting, make sure to install all of MTTL's requirements. MTTL supports `Python >=3.8, <3.12`. Create a virtual environment using `virtualenv` or `conda`, then install the required Python packages from the root directory of this repository:

```bash
conda create -n mttl python=3.11
@@ -18,8 +17,91 @@ conda activate mttl
pip install -e .
```

Alternatively:

```bash
pip install -r requirements.txt
export PYTHONPATH=$PWD
```

## Dataset Preparation

First, download and prepare the [FLANv2](https://github.com/google-research/FLAN/tree/main/flan/v2) dataset. We limit each task to 10,000 examples for computational reasons. The following script performs all the preprocessing:

```bash
python cli_dataset_create.py flan --dataset_library_id=local://modular_artifacts/flan-flat
```


## Training a Private Library

A *private* library consists of one expert per task in Flan. To train one expert starting from Phi-2, we can use the following command:

```bash
python train_experts_main.py \
-c configs/models/phi-2_hf.json \
-k remove_phi_eval_tasks=False \
finetune_task_name=ai2_arc_ARC_Easy_1_0_0 \
output_dir=arc_easy_expert/ \
dataset=local://modular_artifacts/flan-flat \
library_id=local://modular_artifacts/library \
expert_name=arc_easy
```

The expert will be automatically added to an *Expert Library* stored under `modular_artifacts/library`. To learn more about the Expert Library concept, keep reading :).

We provide a bash script that loops over all Flan tasks and trains one expert on each:

```bash
export LIBRARY_PATH=local://modular_artifacts/library
export DATASET_PATH=local://modular_artifacts/flan-flat
bash train_private_library.sh
```

To get started quickly, you can run `train_private_library_fast.sh`, which trains only 2 experts using a small LM (gpt-neo 125M).

After this, you can analyze the content of your expert library with the script `mttl/cli/show_library.py`, providing the path to the library.

## Training an MBC library

To train an MBC (model-based clustering) library, we need to cluster a private library. To do so:

```bash
python run_mbc_clustering.py -k \
library_id=local://modular_artifacts/library \
num_clusters=10 \
output_file=modular_artifacts/mbc_10.json
```

The file `mbc_10.json` will contain the task names falling into each cluster. These task names can then be used to train experts by just passing `finetune_task_name=task_name1,task_name2` to the `train_experts_main.py` script.
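
As an illustration, here is a minimal sketch of such a loop. We assume `mbc_10.json` maps each cluster name to its list of task names (check the actual file for the exact schema); the target library id is hypothetical, and the remaining flags mirror the training command above:

```python
import json
import subprocess

# Load the cluster assignment produced by run_mbc_clustering.py.
# Assumption: the JSON maps a cluster name to its list of task names.
with open("modular_artifacts/mbc_10.json") as f:
    clusters = json.load(f)

for cluster_name, task_names in clusters.items():
    # Train one expert per cluster by passing all of its tasks at once.
    subprocess.run(
        [
            "python", "train_experts_main.py",
            "-c", "configs/models/phi-2_hf.json",
            "-k",
            f"finetune_task_name={','.join(task_names)}",
            "dataset=local://modular_artifacts/flan-flat",
            "library_id=local://modular_artifacts/mbc_library",  # hypothetical target library
            f"expert_name={cluster_name}",
            f"output_dir={cluster_name}_expert/",
        ],
        check=True,
    )
```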


## Evaluating the Modular LLM

Once we have built a library, we can load it into the base model and apply a given merging or routing mechanism, such as Arrow. To evaluate the resulting modular LLM on, for example, arc-easy, you can run:

```bash
python eval_library.py \
-k output_dir=an_expert_eval/ \
library_id=local://modular_artifacts/library \
pipeline_eval_tasks='arc-easy' \
merge_or_route='uniform'
```

`merge_or_route='uniform'` means that we will just uniformly average all the experts in the library before performing inference. To run `Arrow`, use `merge_or_route='arrow'` instead:

```bash
python eval_library.py \
-k output_dir=an_expert_eval/ \
library_id=local://modular_artifacts/library \
pipeline_eval_tasks='arc-easy' \
merge_or_route='arrow' \
topk=4
```

The first call computes the Arrow prototypes (and is therefore a bit slower); the prototypes are then stored inside the library as additional artifacts, so subsequent calls are much faster.
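
For intuition about the `uniform` option above: merging amounts to averaging the experts' parameters before inference. Below is a simplified sketch with plain tensors (illustrative only; MTTL's actual implementation merges LoRA experts inside the base model):

```python
import torch


def uniform_merge(expert_state_dicts):
    """Average each parameter across experts, key by key."""
    return {
        key: torch.stack([sd[key] for sd in expert_state_dicts]).mean(dim=0)
        for key in expert_state_dicts[0]
    }


# Two toy "experts" with one weight each: the uniform merge is their mean.
experts = [{"w": torch.ones(2, 2)}, {"w": 3 * torch.ones(2, 2)}]
merged = uniform_merge(experts)
assert torch.allclose(merged["w"], 2 * torch.ones(2, 2))
```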

## Additional Documentation around Expert Library

### Important Abstractions

@@ -100,34 +182,3 @@ retrieved_expert = expert_lib.get_expert("example_expert")
# Remove an expert from the library
expert_lib.remove_expert("example_expert")
```

### Dataset Preparation

Download and prepare [FLANv2](https://github.com/google-research/FLAN/tree/main/flan/v2) dataset:

```bash
python projects/modular_llm/cli_dataset_create.py flan --dataset_library_id=local://mttldata/flan-flat
```

To cite FLAN, please use:

```
@article{longpre2023flan,
title={The Flan Collection: Designing Data and Methods for Effective Instruction Tuning},
author={Longpre, Shayne and Hou, Le and Vu, Tu and Webson, Albert and Chung, Hyung Won and Tay, Yi and Zhou, Denny and Le, Quoc V and Zoph, Barret and Wei, Jason and others},
journal={arXiv preprint arXiv:2301.13688},
year={2023}
}
```

### Training Experts

```bash
python train_experts_main.py -c configs/wiki-mmlu/gptneo_125m_flan.json -k finetune_task_name=ai2_arc_ARC_Easy_1_0_0 num_train_epochs=1 output_dir=an_expert/ library_id=local://mttldata/mttladapters-predictor pipeline_eval_tasks='arc-easy' expert_name=predictor
```

### Evaluating Experts

```bash
python eval_library.py -k output_dir=an_expert_eval/ library_id=local://mttldata/mttladapters-predictor pipeline_eval_tasks='arc-easy' merge_or_route='uniform'
```
11 changes: 0 additions & 11 deletions projects/modular_llm/configs/models/gpt2_large_dense.json

This file was deleted.

14 changes: 0 additions & 14 deletions projects/modular_llm/configs/models/gpt2_large_experts.json

This file was deleted.

26 changes: 0 additions & 26 deletions projects/modular_llm/configs/models/gpt2neo_1B_cluster_expert.json

This file was deleted.

11 changes: 0 additions & 11 deletions projects/modular_llm/configs/models/gpt2neo_1B_dense.json

This file was deleted.

This file was deleted.

14 changes: 0 additions & 14 deletions projects/modular_llm/configs/models/gpt2neo_1B_experts.json

This file was deleted.

21 changes: 0 additions & 21 deletions projects/modular_llm/configs/models/gpt2neo_1B_underparam.json

This file was deleted.

This file was deleted.
