"""
Evaluation of an ensemble of LLMs on the BillSum dataset using LLM-Blender.

This script implements a pipeline that ensembles multiple language models on the BillSum dataset. The pipeline is
evaluated using the BLEURT, BARTScore, and BERTScore metrics.

The pipeline performs the following steps:
1. Loads multiple language models (LLaMA, Phi, OpenChat, OpenHermes, Solar, Qwen, Mistral).
2. Builds prompts for each model using model-specific templates.
3. Generates responses for prompts from the BillSum dataset using each model.
4. Ranks the generated responses from all the models using the LLM-Blender ranker.
5. Evaluates the top-ranked response against the reference summaries using multiple metrics.

The evaluation showcases the effectiveness of the ensembling approach using LLM-Blender with 7 diverse LLMs.
"""

from datasets import load_dataset
from haystack import Pipeline
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator

from llm_blender import LLMBlenderEvaluator, LLMBlenderRanker

# Load the BillSum dataset
dataset = load_dataset("billsum", split="test")
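
# Optionally, subsample the split for a quick smoke test before a full run;
# `Dataset.select` keeps only the rows at the given indices (uncomment to use).
# dataset = dataset.select(range(10))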

# Define prompt templates for each model
llama_prompt_template = (
    """<|begin_of_text|><|start_header_id|>user<|end_header_id|> Provide a comprehensive summary of the given """
    """text. The summary should cover all the key points and main ideas presented in the original text, while """
    # ... (rest of the llama template elided) ...
)

# ... (phi, openchat, openhermes, solar, and qwen prompt templates elided) ...

mistral_prompt_template = (
    # ... (start of the mistral template elided) ...
    """a concise and easy-to-understand format.: {{ prompt }} [/INST] """
)

# Initialize a PromptBuilder for each model
llama_prompt_builder = PromptBuilder(template=llama_prompt_template)
phi_prompt_builder = PromptBuilder(template=phi_prompt_template)
openchat_prompt_builder = PromptBuilder(template=openchat_prompt_template)
openhermes_prompt_builder = PromptBuilder(template=openhermes_prompt_template)
solar_prompt_builder = PromptBuilder(template=solar_prompt_template)
qwen_prompt_builder = PromptBuilder(template=qwen_prompt_template)
mistral_prompt_builder = PromptBuilder(template=mistral_prompt_template)
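
# Each PromptBuilder renders its Jinja template at run time, substituting {{ prompt }};
# for example, llama_prompt_builder.run(prompt="Some bill text") returns the filled
# llama prompt under the "prompt" key.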

# Define shared model-loading and generation parameters for all models
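# n_ctx is the llama.cpp context window size in tokens; generation_kwargs are passed
# through to the underlying llama-cpp-python generation call.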
model_params = {"n_ctx": 256, "generation_kwargs": {"max_tokens": 500, "temperature": 0.1}}

# Initialize a LlamaCppGenerator for each model
llama_model = LlamaCppGenerator(model="models/meta-llama-3-8b-instruct.Q4_K_M.gguf", **model_params)
phi_model = LlamaCppGenerator(model="models/phi-3-mini-4k-instruct.Q4_K_M.gguf", **model_params)
openchat_model = LlamaCppGenerator(model="models/openchat-3.5-0106.Q4_K_M.gguf", **model_params)
# ... (openhermes_model and solar_model are initialized the same way from their GGUF files) ...
qwen_model = LlamaCppGenerator(model="models/qwen1_5-7b-chat-Q4_K_M.gguf", **model_params)
mistral_model = LlamaCppGenerator(model="models/mistral-7b-Q4_K_M.gguf", **model_params)

# Initialize the LLMBlenderRanker to ensemble the models
llm_blender_ranker = LLMBlenderRanker(model="llm-blender/PairRM", device="cpu")
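# PairRM is LLM-Blender's pairwise reward model: it compares candidate outputs for the
# same prompt two at a time and aggregates the comparisons into a single ranking.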

# Create the main pipeline
blender_pipeline = Pipeline()

# Add the components to the pipeline
blender_pipeline.add_component(instance=llama_prompt_builder, name="llama_prompt_builder")
blender_pipeline.add_component(instance=llama_model, name="llama_model")

blender_pipeline.add_component(instance=phi_prompt_builder, name="phi_prompt_builder")
blender_pipeline.add_component(instance=phi_model, name="phi_model")

blender_pipeline.add_component(instance=openchat_prompt_builder, name="openchat_prompt_builder")
blender_pipeline.add_component(instance=openchat_model, name="openchat_model")

blender_pipeline.add_component(instance=openhermes_prompt_builder, name="openhermes_prompt_builder")
blender_pipeline.add_component(instance=openhermes_model, name="openhermes_model")

blender_pipeline.add_component(instance=solar_prompt_builder, name="solar_prompt_builder")
blender_pipeline.add_component(instance=solar_model, name="solar_model")

blender_pipeline.add_component(instance=qwen_prompt_builder, name="qwen_prompt_builder")
blender_pipeline.add_component(instance=qwen_model, name="qwen_model")

blender_pipeline.add_component(instance=mistral_prompt_builder, name="mistral_prompt_builder")
blender_pipeline.add_component(instance=mistral_model, name="mistral_model")

blender_pipeline.add_component(instance=llm_blender_ranker, name="llm_blender_ranker")

# Connect the components in the pipeline:
# each prompt builder feeds its respective model
blender_pipeline.connect("llama_prompt_builder", "llama_model")
blender_pipeline.connect("phi_prompt_builder", "phi_model")
blender_pipeline.connect("openchat_prompt_builder", "openchat_model")
blender_pipeline.connect("openhermes_prompt_builder", "openhermes_model")
blender_pipeline.connect("solar_prompt_builder", "solar_model")
blender_pipeline.connect("qwen_prompt_builder", "qwen_model")
blender_pipeline.connect("mistral_prompt_builder", "mistral_model")

# Connect all the models to the LLMBlenderRanker for ensembling
blender_pipeline.connect("llama_model", "llm_blender_ranker")
blender_pipeline.connect("phi_model", "llm_blender_ranker")
blender_pipeline.connect("openchat_model", "llm_blender_ranker")
blender_pipeline.connect("openhermes_model", "llm_blender_ranker")
blender_pipeline.connect("solar_model", "llm_blender_ranker")
blender_pipeline.connect("qwen_model", "llm_blender_ranker")
blender_pipeline.connect("mistral_model", "llm_blender_ranker")

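# Optionally render the assembled pipeline graph for inspection; the file name here is
# just an example (uncomment to use).
# from pathlib import Path
# blender_pipeline.draw(Path("ensemble_pipeline.png"))
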
# Process the dataset and generate answers
generated_answers_labels = []
for row in dataset:
    # BillSum provides the bill text under "text" and the reference summary under "summary"
    prompt = row["text"]
    label = row["summary"]

    # Run the pipeline, sending the same input to every prompt builder
    output = blender_pipeline.run(
        {
            "llama_prompt_builder": {"prompt": prompt},
            "phi_prompt_builder": {"prompt": prompt},
            "openchat_prompt_builder": {"prompt": prompt},
            "openhermes_prompt_builder": {"prompt": prompt},
            "solar_prompt_builder": {"prompt": prompt},
            "qwen_prompt_builder": {"prompt": prompt},
            "mistral_prompt_builder": {"prompt": prompt},
        }
    )
    generated_answers_labels.append((output["llm_blender_ranker"]["answers"], label))

# Prepare data for evaluation
preds = []
labels = []
for ranked_answers, label in generated_answers_labels:
    # Use the top-ranked output as the answer
    preds.append(ranked_answers[0].data)
    labels.append(label)

# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs
evaluator = LLMBlenderEvaluator(preds=preds, labels=labels)

# Compute metrics comparing the generated results against the reference outputs
metrics = evaluator.compute_metrics()

# Print the evaluation metrics
print("BLEURT Score", metrics["bleurt"])
print("BARTScore Score", metrics["bartscore"])
print("BERTScore Score", metrics["bertscore"])
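
# For all three metrics, higher values indicate closer agreement with the reference
# summaries (BARTScore is a log-likelihood, so "higher" means less negative).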