
Commit ca3894d

awinml and vrunm authored
Add detailed comments to pipelines (#1)
* Add detailed comments and update paper --------- Co-authored-by: Varun Mathur <[email protected]>
1 parent 1b2a32e commit ca3894d

22 files changed: +953 -56 lines changed

README.md

Lines changed: 4 additions & 1 deletion
@@ -80,7 +80,10 @@ print(ranked_answers)
 # ]
 ```
 
-The API documentation can be found [here](src/llm_blender/README.md).
+The detailed documentation can be found in the [LLM-Blender API Reference](src/llm_blender/README.md).
+
+As the [`llm-blender` library](https://github.com/yuchenlin/LLM-Blender) lacks a stable release, the necessary code has been incorporated into this project under `src/llm_blender/llm_blender_utils`.
+
 
 ## Results
 
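The updated README line above points readers to the API reference for the bundled package. For orientation, here is a minimal sketch of the two public components that the pipeline scripts in this commit import from it; the constructor arguments mirror those scripts, and the example strings are placeholders (this sketch is not part of the diff):

```python
from llm_blender import LLMBlenderEvaluator, LLMBlenderRanker

# Ranker backed by the PairRM checkpoint, as configured in the pipeline scripts below
llm_blender_ranker = LLMBlenderRanker(model="llm-blender/PairRM", device="cpu")

# Evaluator that scores predictions against references with BLEURT, BARTScore, and BERTScore
evaluator = LLMBlenderEvaluator(preds=["a generated summary"], labels=["a reference summary"])
metrics = evaluator.compute_metrics()
print(metrics["bleurt"], metrics["bartscore"], metrics["bertscore"])
```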

paper/llm_blender.pdf

6.61 KB
Binary file not shown.

src/llm_blender/billsum/llama.py

Lines changed: 47 additions & 2 deletions
@@ -1,3 +1,16 @@
+"""Evaluation of Llama-3-8b on the BillSum dataset
+
+This script implements a pipeline to evaluate the Llama-3-8b model on the BillSum dataset
+on the BLEURT, BARTScore, and BERTScore metrics.
+
+The pipeline performs the following steps:
+1. Loads the Llama-3-8b model using the LlamaCppGenerator from Haystack.
+2. Generates responses for prompts and instructions from the BillSum dataset.
+3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics.
+
+This evaluation provides a baseline for the model's performance on the BillSum dataset.
+"""
+
 from datasets import load_dataset
 from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator

@@ -8,46 +21,78 @@ def generate_result(
     generator: LlamaCppGenerator,
     prompt: str = "",
 ) -> str:
+    """
+    Generate a response using the LlamaCppGenerator.
 
+    The prompt and instruction are formatted to be compatible with the model.
+
+    Args:
+        generator (LlamaCppGenerator): The initialized LlamaCppGenerator object.
+        prompt (str): The main text input for the model.
+
+    Returns:
+        str: The generated response from the model.
+    """
+    # Additional instructions for the model for summarization
     instruction = (
         """ Provide a comprehensive summary of the given text. """
         """The summary should cover all the key points and main ideas presented in the original text, """
         """while also condensing the information into a concise and easy-to-understand format."""
     )
 
     # Format prompt to be compatible with meta-llama-3-8b-instruct
+    # This specific format is required for the model to distinguish between user input and expected output
     formatted_prompt = (
         """<|begin_of_text|><|start_header_id|>user<|end_header_id|> """
         f"""{instruction} {prompt} <|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
     )
 
-    # Generate text
+    # Generate text using the LlamaCppGenerator
     result = generator.run(
         formatted_prompt,
         generation_kwargs={"max_tokens": 500, "temperature": 0.1},
     )
+
+    # Extract the generated text from the result
     generated_answer = result["replies"][0]
     return generated_answer
 
 
+# Define the path to the model weights
 model = "meta-llama-3-8b-instruct.Q4_K_M.gguf"
+
+# Initialize the LlamaCppGenerator with the specified model and context window size
 generator = LlamaCppGenerator(
     model=model,
     n_ctx=256,
 )
+
+# Warm up the generator (loading the model into memory)
 generator.warm_up()
 
+# Load the dataset from HuggingFace
 dataset = load_dataset("billsum", split="test")
+
+# Convert the dataset to a pandas DataFrame for easier manipulation
 dataset = dataset.to_pandas()
+
+# Generate results for each row in the dataset
+# Apply the generate_result function to each row, using the 'text' column
+# Store the results in the 'result' column
 dataset.loc[:, "result"] = dataset.apply(
     lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1
 )
-dataset.to_csv("output_llama.csv", index=False)
 
+# Save the generated texts to a CSV file
+dataset.to_csv("output_llama.csv", index=False)
 
+# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs
 evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"])
+
+# Compute various metrics to evaluate the generated results against the reference outputs
 metrics = evaluator.compute_metrics()
 
+# Print the computed metrics
 print("BLEURT Score", metrics["bleurt"])
 print("BARTSCORE Score", metrics["bartscore"])
 print("BERTSCORE Score", metrics["bertscore"])

src/llm_blender/billsum/llm_blender_ranker_all_llms.py

Lines changed: 35 additions & 1 deletion
@@ -1,12 +1,30 @@
+"""
+Evaluation of an ensemble of LLMs on the BillSum dataset using LLM Blender
+
+This script implements a pipeline to ensemble multiple language models on the BillSum dataset. The pipeline is
+evaluated on the BLEURT, BARTScore, and BERTScore metrics.
+
+The pipeline performs the following steps:
+1. Loads multiple language models (LLaMA, Phi, OpenChat, OpenHermes, Solar, Qwen, Mistral).
+2. Builds prompts for each model using specific templates.
+3. Generates responses for prompts from the BillSum dataset using each model.
+4. Ranks the generated responses from all the models using the LLM Blender Ranker.
+5. Evaluates the top-ranked response against reference outputs using multiple metrics.
+
+The evaluation showcases the effectiveness of the ensembling approach using LLM-Blender with 7 diverse LLMs.
+"""
+
 from datasets import load_dataset
 from haystack import Pipeline
 from haystack.components.builders import PromptBuilder
 from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator
 
 from llm_blender import LLMBlenderEvaluator, LLMBlenderRanker
 
+# Load the BillSum dataset
 dataset = load_dataset("billsum", split="test")
 
+# Define prompt templates for each model
 llama_prompt_template = (
     """<|begin_of_text|><|start_header_id|>user<|end_header_id|> Provide a comprehensive summary of the given """
     """text. The summary should cover all the key points and main ideas presented in the original text, while """
@@ -51,6 +69,7 @@
     """a concise and easy-to-understand format.: {{ prompt }} [/INST] """
 )
 
+# Initialize PromptBuilder for each model
 llama_prompt_builder = PromptBuilder(template=llama_prompt_template)
 phi_prompt_builder = PromptBuilder(template=phi_prompt_template)
 openchat_prompt_builder = PromptBuilder(template=openchat_prompt_template)
@@ -59,8 +78,10 @@
 qwen_prompt_builder = PromptBuilder(template=qwen_prompt_template)
 mistral_prompt_builder = PromptBuilder(template=mistral_prompt_template)
 
+# Define model and generation parameters for all models
 model_params = {"n_ctx": 256, "generation_kwargs": {"max_tokens": 500, "temperature": 0.1}}
 
+# Initialize LlamaCppGenerator for each model
 llama_model = LlamaCppGenerator(model="models/meta-llama-3-8b-instruct.Q4_K_M.gguf", **model_params)
 phi_model = LlamaCppGenerator(model="models/phi-3-mini-4k-instruct.Q4_K_M.gguf", **model_params)
 openchat_model = LlamaCppGenerator(model="models/openchat-3.5-0106.Q4_K_M.gguf", **model_params)
@@ -69,11 +90,13 @@
 qwen_model = LlamaCppGenerator(model="models/qwen1_5-7b-chat-Q4_K_M.gguf", **model_params)
 mistral_model = LlamaCppGenerator(model="models/mistral-7b-Q4_K_M.gguf", **model_params)
 
+# Initialize LLMBlenderRanker to ensemble multiple models
 llm_blender_ranker = LLMBlenderRanker(model="llm-blender/PairRM", device="cpu")
 
-
+# Create the main pipeline
 blender_pipeline = Pipeline()
 
+# Add components to the pipeline
 blender_pipeline.add_component(instance=llama_prompt_builder, name="llama_prompt_builder")
 blender_pipeline.add_component(instance=llama_model, name="llama_model")
 
@@ -97,6 +120,8 @@
 
 blender_pipeline.add_component(instance=llm_blender_ranker, name="llm_blender_ranker")
 
+# Connect components in the pipeline
+# Connect the prompt builders to the respective model
 blender_pipeline.connect("llama_prompt_builder", "llama_model")
 blender_pipeline.connect("phi_prompt_builder", "phi_model")
 blender_pipeline.connect("openchat_prompt_builder", "openchat_model")
@@ -105,6 +130,7 @@
 blender_pipeline.connect("qwen_prompt_builder", "qwen_model")
 blender_pipeline.connect("mistral_prompt_builder", "mistral_model")
 
+# Connect all the models to the LLMBlenderRanker for ensembling
 blender_pipeline.connect("llama_model", "llm_blender_ranker")
 blender_pipeline.connect("phi_model", "llm_blender_ranker")
 blender_pipeline.connect("openchat_model", "llm_blender_ranker")
@@ -113,10 +139,13 @@
 blender_pipeline.connect("qwen_model", "llm_blender_ranker")
 blender_pipeline.connect("mistral_model", "llm_blender_ranker")
 
+# Process the dataset and generate answers
 generated_answers_labels = []
 for row in dataset:
     prompt = row["input"]
     label = row["output"]
+
+    # Run the pipeline for each input
     output = blender_pipeline.run(
         {
             "llama_prompt_builder": {"prompt": prompt},
@@ -130,16 +159,21 @@
     )
     generated_answers_labels.append((output["answers"], label))
 
+# Prepare data for evaluation
 preds = []
 labels = []
 for ranked_answers, label in generated_answers_labels:
     # Use top ranked output as the answer
     preds.append(ranked_answers[0].data)
     labels.append(label)
 
+# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs
 evaluator = LLMBlenderEvaluator(preds=preds, labels=labels)
+
+# Compute various metrics to evaluate the generated results against the reference outputs
 metrics = evaluator.compute_metrics()
 
+# Print the evaluation metrics
 print("BLEURT Score", metrics["bleurt"])
 print("BARTSCORE Score", metrics["bartscore"])
 print("BERTSCORE Score", metrics["bertscore"])

src/llm_blender/billsum/llm_blender_ranker_top_3_llms.py

Lines changed: 35 additions & 0 deletions
@@ -1,12 +1,30 @@
+"""
+Evaluation of an ensemble of the best performing LLMs on the BillSum dataset using LLM Blender
+
+This script implements a pipeline to ensemble multiple language models on the BillSum dataset. The pipeline is
+evaluated on the BLEURT, BARTScore, and BERTScore metrics.
+
+The pipeline performs the following steps:
+1. Loads the 3 top-performing LLMs: LLaMA, Phi, and Mistral.
+2. Builds prompts for each model using specific templates.
+3. Generates responses for prompts from the BillSum dataset using each model.
+4. Ranks the generated responses from all the models using the LLM Blender Ranker.
+5. Evaluates the top-ranked response against reference outputs using multiple metrics.
+
+The evaluation showcases the effectiveness of the ensembling approach using LLM-Blender with diverse LLMs.
+"""
+
 from datasets import load_dataset
 from haystack import Pipeline
 from haystack.components.builders import PromptBuilder
 from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator
 
 from llm_blender import LLMBlenderEvaluator, LLMBlenderRanker
 
+# Load the BillSum dataset
 dataset = load_dataset("billsum", split="test")
 
+# Define prompt templates for each model
 llama_prompt_template = (
     """<|begin_of_text|><|start_header_id|>user<|end_header_id|> Provide a comprehensive summary of the given """
     """text. The summary should cover all the key points and main ideas presented in the original text, while """
@@ -26,20 +44,26 @@
     """a concise and easy-to-understand format.: {{ prompt }} [/INST] """
 )
 
+# Initialize PromptBuilder for each model
 llama_prompt_builder = PromptBuilder(template=llama_prompt_template)
 phi_prompt_builder = PromptBuilder(template=phi_prompt_template)
 mistral_prompt_builder = PromptBuilder(template=mistral_prompt_template)
 
+# Define model and generation parameters for all models
 model_params = {"n_ctx": 256, "generation_kwargs": {"max_tokens": 500, "temperature": 0.1}}
 
+# Initialize LlamaCppGenerator for each model
 llama_model = LlamaCppGenerator(model="models/meta-llama-3-8b-instruct.Q4_K_M.gguf", **model_params)
 phi_model = LlamaCppGenerator(model="models/phi-3-mini-4k-instruct.Q4_K_M.gguf", **model_params)
 mistral_model = LlamaCppGenerator(model="models/mistral-7b-Q4_K_M.gguf", **model_params)
 
+# Initialize LLMBlenderRanker to ensemble multiple models
 llm_blender_ranker = LLMBlenderRanker(model="llm-blender/PairRM", device="cpu")
 
+# Create the main pipeline
 blender_pipeline = Pipeline()
 
+# Add components to the pipeline
 blender_pipeline.add_component(instance=llama_prompt_builder, name="llama_prompt_builder")
 blender_pipeline.add_component(instance=llama_model, name="llama_model")
 
@@ -51,18 +75,24 @@
 
 blender_pipeline.add_component(instance=llm_blender_ranker, name="llm_blender_ranker")
 
+# Connect components in the pipeline
+# Connect the prompt builders to the respective model
 blender_pipeline.connect("llama_prompt_builder", "llama_model")
 blender_pipeline.connect("phi_prompt_builder", "phi_model")
 blender_pipeline.connect("mistral_prompt_builder", "mistral_model")
 
+# Connect all the models to the LLMBlenderRanker for ensembling
 blender_pipeline.connect("llama_model", "llm_blender_ranker")
 blender_pipeline.connect("phi_model", "llm_blender_ranker")
 blender_pipeline.connect("mistral_model", "llm_blender_ranker")
 
+# Process the dataset and generate answers
 generated_answers_labels = []
 for row in dataset:
     prompt = row["input"]
     label = row["output"]
+
+    # Run the pipeline for each input
     output = blender_pipeline.run(
         {
@@ -74,16 +104,21 @@
     )
     generated_answers_labels.append((output["answers"], label))
 
+# Prepare data for evaluation
 preds = []
 labels = []
 for ranked_answers, label in generated_answers_labels:
     # Use top ranked output as the answer
     preds.append(ranked_answers[0].data)
     labels.append(label)
 
+# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs
 evaluator = LLMBlenderEvaluator(preds=preds, labels=labels)
+
+# Compute various metrics to evaluate the generated results against the reference outputs
 metrics = evaluator.compute_metrics()
 
+# Print the evaluation metrics
 print("BLEURT Score", metrics["bleurt"])
 print("BARTSCORE Score", metrics["bartscore"])
 print("BERTSCORE Score", metrics["bertscore"])
