Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ default:
cmd:
- prep
- llmargs
- modelEval

prep:
doctype: snt # 'rvw' for single review, but 'snt' for segmented subreviews
Expand Down Expand Up @@ -34,9 +35,12 @@ args: # required for 'prep' step
llmargs:
use_api: true
api_key: lm-studio
api_base: http://localhost:1234/v1
api_base: http://10.60.33.10:8000/v1 #http://localhost:1234/v1
model_name: deepseek-r1-distill-qwen-32b
temperature: 0.5
max_tokens: 1024
output: ../output/AspectAdded/semeval-agg/aspectAdded.pkl
top_k_aspects: 1
outputEval: ../output/modelEval/


132 changes: 124 additions & 8 deletions src/llm/aspect_extraction_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
from llm.llm_config_handler import LLMconfig, LLMHandler
from cmn.review import Review
from llm.prompt_builder import PromptBuilder

from llm.trustworthiness_check import TrustWorthiness_PromptBuilder, model_Evaluator
from tqdm import tqdm
import json
import re
from omegaconf import DictConfig
import pandas as pd
import os
import string
import random



Expand All @@ -35,7 +36,6 @@ def _init_llm_handler(self):
def find_aspect_indices(aspect: str, sentence_tokens) :
aspect_tokens = aspect.lower().split()
tokens = [token.lower().strip(string.punctuation) for token in sentence_tokens]

for i in range(len(tokens) - len(aspect_tokens) + 1):
if tokens[i:i + len(aspect_tokens)] == aspect_tokens: return list(range(i, i + len(aspect_tokens)))

Expand Down Expand Up @@ -63,13 +63,12 @@ def process_reviews(self, reviews: list):
if "aspect" in aspect_data and aspect_data["aspect"]:
valid_json_found = True
break
except json.JSONDecodeError:
continue

if valid_json_found:
break
else:
print(f"Invalid or no valid JSON with 'aspect' found. Attempt {attempt + 1} of {max_retries}")
except json.JSONDecodeError: continue

if valid_json_found: break
else: print(f"Invalid or no valid JSON with 'aspect' found. Attempt {attempt + 1} of {max_retries}")


if not matches:
print("No JSON object found in response")
Expand Down Expand Up @@ -127,12 +126,129 @@ def save_to_pickle(self, reviews):
print(f'\nSaving processed Review.pickle file {output_dir}/{filename}...')
pd.to_pickle(reviews, os.path.join(output_dir, filename))

# Model Evaluation Code


def evaluate_llm_trustworthiness(self, reviews: list):
    """Probe the LLM with one true and one false aspect per review and save the verdicts.

    For every review that has at least one aspect, the model is asked whether a
    randomly chosen aspect of that review applies (expected answer "Yes") and
    whether an aspect borrowed from another review applies (expected answer
    "No"). Each question produces one result row; all rows are written through
    ``self.save_to_excel``.

    Args:
        reviews: Review objects. Each must expose ``id``, ``sentences`` and
            ``aos``; a missing or empty ``aspects`` attribute is filled in
            place from ``aos`` via ``self.extract_aspects_from_aos``.

    Returns:
        True once the results have been handed to ``save_to_excel``.
    """
    prompt_builder = TrustWorthiness_PromptBuilder()
    results = []

    # Fill in aspects for *all* reviews before asking anything. Doing this
    # lazily inside the main loop would leave later reviews without aspects
    # while the early ones are processed, so the early reviews could not
    # borrow a negative ("wrong") aspect from them.
    for review in reviews:
        if not getattr(review, 'aspects', None):
            review.aspects = self.extract_aspects_from_aos(review)

    def ask(review, review_text, aspect, expected):
        # Query the model about one (review, aspect) pair and record the outcome.
        prompt = prompt_builder.build_prompt(vars(review), aspect)
        prediction = self._parse_llm_answer(self.llm_handler.get_response(prompt))
        results.append({
            "review_id": review.id,
            "review_text": review_text,
            "aspect": aspect,
            "expected_answer": expected,
            "model_prediction": prediction,
            "is_correct": prediction == expected
        })

    for i, review in enumerate(tqdm(reviews)):
        # Nothing to ask about without an aspect, and no text to report
        # without at least one sentence.
        if not review.aspects or not review.sentences:
            continue

        # Only the first sentence's tokens are stored as the review text.
        review_text = ' '.join(review.sentences[0])

        # Positive case: an aspect that really belongs to this review.
        ask(review, review_text, random.choice(review.aspects), "Yes")

        # Negative case: an aspect taken from some other review, if any exists.
        wrong_aspect = self._get_wrong_aspect(reviews, i)
        if wrong_aspect:
            ask(review, review_text, wrong_aspect, "No")

    self.save_to_excel(results)
    return True


def accuracy_Evaluator(self):
    """Run ``model_Evaluator`` over the directory configured as ``llmargs.outputEval``.

    Prints a message and does nothing else when that path is missing or is
    not a directory.
    """
    eval_dir = self.cfg.llmargs.outputEval
    evaluator = model_Evaluator()

    # os.path.isdir() is already False for a path that does not exist, so a
    # single check covers both the "missing" and the "not a directory" case.
    if not os.path.isdir(eval_dir):
        print("The specified path does not exist or is not a directory.")
        return

    evaluator.evaluator(eval_dir)

def save_to_excel(self, results):
    """Write the evaluation rows to an Excel workbook in the evaluation output directory.

    The directory is ``cfg.llmargs.outputEval`` itself (with or without a
    trailing separator), which is the same location ``accuracy_Evaluator``
    scans. The file is named after the input dataset:
    ``<basename of cfg.args.data>_trustworthiness_eval.xlsx``.

    Args:
        results: List of row dicts; converted to a DataFrame as-is.
    """
    # normpath() drops a trailing separator without discarding the last path
    # component, so "out/modelEval" and "out/modelEval/" name the same folder.
    output_dir = os.path.normpath(self.cfg.llmargs.outputEval)
    os.makedirs(output_dir, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(self.cfg.args.data))[0]
    filename = f"{base_name}_trustworthiness_eval.xlsx"

    df = pd.DataFrame(results)
    df.to_excel(os.path.join(output_dir, filename), index=False)
    print(f'\nSaved processed Evaluation file {output_dir}/{filename}...')


def _parse_llm_answer(self, response: str) -> str:
    """Extract the final Yes/No verdict from an LLM response.

    The response is searched, case-insensitively, for occurrences of
    ``Answer: {"Answer": "Yes"}`` or ``Answer: {"Answer": "No"}``. When there
    are several, the last one wins.

    Args:
        response: Raw text returned by the LLM.

    Returns:
        "Yes" or "No" (normalised capitalisation), or "Invalid" when the
        response is not a string or contains no verdict in that format.
    """
    # NOTE(review): get_response may hand back a non-string on failure —
    # treat that as an unparseable answer instead of raising from re.findall.
    if not isinstance(response, str):
        return "Invalid"

    # The verdict is captured directly. Looking it up again through a parsed
    # dict would be case-sensitive on the key and reject {"answer": "yes"},
    # which the case-insensitive pattern deliberately accepts.
    verdicts = re.findall(
        r'Answer\s*:\s*\{\s*"Answer"\s*:\s*"(Yes|No)"\s*\}',
        response,
        re.IGNORECASE,
    )
    if not verdicts:
        return "Invalid"
    return verdicts[-1].capitalize()

def _get_wrong_aspect(self, reviews, exclude_index):
    """Pick a random aspect from another review that the excluded review does not have.

    Args:
        reviews: Sequence of review objects; reviews without a non-empty
            ``aspects`` attribute are ignored.
        exclude_index: Position in ``reviews`` of the review that the aspect
            must be wrong for.

    Returns:
        An aspect taken from a different review, or None when no other review
        offers an aspect that differs from the excluded review's own aspects.
    """
    def normalise(aspect):
        # Compare strings ignoring case and surrounding whitespace; leave
        # any other aspect representation untouched.
        return aspect.strip().lower() if isinstance(aspect, str) else aspect

    # Aspects of the review under test: drawing one of these as the "wrong"
    # aspect would produce a negative example whose true answer is "Yes".
    own = []
    if isinstance(exclude_index, int) and 0 <= exclude_index < len(reviews):
        own_aspects = getattr(reviews[exclude_index], 'aspects', None) or []
        own = [normalise(a) for a in own_aspects]

    pools = []
    for idx, other in enumerate(reviews):
        if idx == exclude_index:
            continue
        candidates = [
            a for a in (getattr(other, 'aspects', None) or [])
            if normalise(a) not in own
        ]
        if candidates:
            pools.append(candidates)

    if not pools:
        return None
    # Two-stage draw: first a review, then one of its usable aspects.
    return random.choice(random.choice(pools))

def extract_aspects_from_aos(self, review):
    """Recover aspect terms as lower-cased strings from a review's ``aos`` annotations.

    Args:
        review: Object with ``sentences`` (iterable of token lists) and
            ``aos`` (iterable of lists whose items carry the aspect token
            indices as their first element).

    Returns:
        List of aspect strings, one per annotation that resolves to at least
        one token. Annotations whose index list is empty or starts with -1
        are skipped.
    """
    # Flatten once; the token list does not change between annotations.
    # NOTE(review): indices are applied to tokens flattened across all
    # sentences — if aos offsets are per-sentence this is only right for
    # single-sentence reviews; confirm against the Review class.
    flat_tokens = [word for sent in review.sentences for word in sent]
    token_count = len(flat_tokens)

    aspects = []
    for ao_list in review.aos:
        for aspect_info in ao_list:
            indices = aspect_info[0]  # aspect token indices
            if not indices or indices[0] == -1:
                continue
            # Keep only in-range, non-negative positions: a stray -1 later in
            # the list must not silently select the last token.
            aspect_term = ' '.join(
                flat_tokens[i] for i in indices if 0 <= i < token_count
            ).strip()
            if aspect_term:
                aspects.append(aspect_term.lower())
    return aspects

# Entry function
def mainllm(cfg: DictConfig, reviews: list):
    """Build an ``LLMReviewProcessor`` from ``cfg`` and return its ``process_reviews(reviews)`` result."""
    return LLMReviewProcessor(cfg).process_reviews(reviews)

def llmEval(cfg: DictConfig, reviews: list):
    """Run the trustworthiness evaluation and, if it reports success, the accuracy evaluator."""
    processor = LLMReviewProcessor(cfg)

    evaluation_done = processor.evaluate_llm_trustworthiness(reviews)
    if evaluation_done:
        processor.accuracy_Evaluator()









Expand Down
8 changes: 2 additions & 6 deletions src/llm/llm_config_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,14 @@ def __init__(self, use_api=True, api_key=None, api_base=None, temperature=0.5, m

class LLMHandler:


def __init__(self, config):
self.config = config
self.generator = None

if self.config.use_api:
openai.api_key = self.config.api_key
if self.config.api_base:
openai.api_base = self.config.api_base
else:
if self.config.api_base: openai.api_base = self.config.api_base
else: # Not Using Model Locally in The code Yet # So this part of Code is not being used
self.tokenizer = AutoTokenizer.from_pretrained(self.config.local_model_path)
self.model = AutoModelForCausalLM.from_pretrained(self.config.local_model_path)
self.generator = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)
Expand All @@ -53,9 +51,7 @@ def get_response(self, prompt: str) -> str:
else:
#result = self.generator(prompt, max_length=max_tokens + len(prompt.split()), do_sample=True)
max_len = min(self.config.max_tokens + len(prompt.split()), 1024)

result = self.generator(prompt, max_length=max_len, truncation=True, do_sample=True)

return result[0]['generated_text'].strip()

except Exception as e:
Expand Down
85 changes: 85 additions & 0 deletions src/llm/summarize_trustworthiness_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import os
import pandas as pd

def collect_excel_files(base_folder):
    """Recursively collect the paths of all .xlsx workbooks under ``base_folder``.

    The extension is matched case-insensitively. Files whose name starts with
    ``~$`` (the lock files Excel creates next to an open workbook) are
    skipped because they are not readable workbooks.

    Args:
        base_folder: Directory to walk.

    Returns:
        Sorted list of file paths; empty when the folder is missing or holds
        no workbooks.
    """
    excel_files = []
    for root, _dirs, files in os.walk(base_folder):
        for file in files:
            if file.startswith("~$"):
                continue
            if file.lower().endswith(".xlsx"):
                excel_files.append(os.path.join(root, file))
    # os.walk order depends on the file system; sort so downstream summaries
    # list files in a reproducible order.
    excel_files.sort()
    return excel_files

def group_by_top_folder(files, base_folder):
    """Group file paths by the first directory below ``base_folder`` that contains them.

    Files that sit directly in ``base_folder`` have no such directory; they
    are grouped together under the base folder's own name (or ``"root"`` when
    that name is empty) instead of each becoming a group named after the file.
    A subfolder that happens to share the base folder's name ends up in the
    same group as those top-level files.

    Args:
        files: Iterable of file paths located under ``base_folder``.
        base_folder: Directory the grouping is relative to.

    Returns:
        Dict mapping group name to the list of paths in that group, in input
        order.
    """
    root_group = os.path.basename(os.path.abspath(base_folder)) or "root"

    grouped = {}
    for f in files:
        parts = os.path.relpath(f, base_folder).split(os.sep)
        # A single component means the file lives directly in base_folder.
        group = parts[0] if len(parts) > 1 else root_group
        grouped.setdefault(group, []).append(f)
    return grouped

def summarize_files(file_list):
    """Combine the 'Evaluation_summary' sheet of every workbook in ``file_list``.

    Each sheet gets a leading ``File_Name`` column holding the workbook's base
    name. Workbooks that cannot be read, lack the sheet, or already contain a
    ``File_Name`` column are reported on stdout and skipped.

    Args:
        file_list: Iterable of .xlsx file paths.

    Returns:
        One DataFrame with all readable summaries stacked in input order, or
        an empty DataFrame when none could be read.
    """
    # A list, not a dict keyed by base name: workbooks in different folders
    # may share a file name and every one of them must be kept.
    frames = []
    for path in file_list:
        try:
            summary_df = pd.read_excel(path, sheet_name="Evaluation_summary")
            summary_df.insert(0, "File_Name", os.path.basename(path))  # Add filename to each row
        except Exception as e:
            # Deliberately best-effort: one bad workbook must not abort the run.
            print(f"Skipping {path}: {e}")
            continue
        frames.append(summary_df)

    if frames:
        return pd.concat(frames, ignore_index=True)
    return pd.DataFrame()



def generate_overall_metrics(summary_file_path):
    """Append an 'Overall_Metrics' sheet with macro/micro accuracy to a summary workbook.

    The input rows come from the sheet named 'Sheet1' when present, otherwise
    from the first sheet. They must provide the columns "Accuracy",
    "Correct Predictions", "Total Cases" and "Incorrect Predictions".
    Macro accuracy is the mean of "Accuracy"; micro accuracy is total correct
    over total cases, as a percentage (0 when there are no cases).

    Args:
        summary_file_path: Path of the workbook to read and extend in place.
    """
    sheets = pd.read_excel(summary_file_path, sheet_name=None)
    data = sheets['Sheet1'] if 'Sheet1' in sheets else next(iter(sheets.values()))

    macro_accuracy = data["Accuracy"].mean()
    total_correct = data["Correct Predictions"].sum()
    total_cases = data["Total Cases"].sum()
    if total_cases > 0:
        micro_accuracy = (total_correct / total_cases) * 100
    else:
        micro_accuracy = 0

    metrics = {
        "Macro Accuracy (%)": round(macro_accuracy, 2),
        "Micro Accuracy (%)": round(micro_accuracy, 2),
        "Total Files": len(data),
        "Total Cases": total_cases,
        "Total Correct Predictions": total_correct,
        "Total Incorrect Predictions": data["Incorrect Predictions"].sum()
    }
    metrics_frame = pd.Series(metrics).to_frame(name='Value')

    # Append mode keeps the existing sheets; a previous 'Overall_Metrics'
    # sheet is replaced rather than duplicated.
    with pd.ExcelWriter(
        summary_file_path, engine='openpyxl', mode='a', if_sheet_exists='replace'
    ) as writer:
        metrics_frame.to_excel(writer, sheet_name='Overall_Metrics')

    print(f"Overall metrics added to sheet 'Overall_Metrics' in {summary_file_path}")

def main(base_folder, save_folder):
    """Summarise every evaluation workbook under ``base_folder`` into ``save_folder``.

    Writes one ``<group>_trustworthiness_summary.xlsx`` per group that yields
    a non-empty summary, then a combined
    ``overall_trustworthiness_summary.xlsx`` to which overall metrics are
    added. Prints a notice instead when no group produced a summary.

    Args:
        base_folder: Directory searched recursively for .xlsx files.
        save_folder: Directory that receives the summary workbooks; created
            if it does not exist.
    """
    os.makedirs(save_folder, exist_ok=True)  # Ensure the save folder exists

    groups = group_by_top_folder(collect_excel_files(base_folder), base_folder)

    group_frames = []
    for group_name, paths in groups.items():
        print(f"Processing group: {group_name} with {len(paths)} files")
        summary = summarize_files(paths)
        if summary.empty:
            continue
        summary.insert(0, "Dataset_Group", group_name)
        group_target = os.path.join(save_folder, f"{group_name}_trustworthiness_summary.xlsx")
        summary.to_excel(group_target, index=False)
        group_frames.append(summary)

    if not group_frames:
        print("No valid summaries found.")
        return

    combined = pd.concat(group_frames, ignore_index=True)
    overall_target = os.path.join(save_folder, "overall_trustworthiness_summary.xlsx")
    combined.to_excel(overall_target, index=False)
    generate_overall_metrics(overall_target)
    print("All summaries generated successfully.")


# Run only when executed as a script, so importing this module (for its helper
# functions) does not start scanning the file system.
if __name__ == "__main__":
    import sys

    # With no arguments the original locations are used. An optional first
    # argument overrides the folder to scan and an optional second one the
    # folder to write to (defaulting to the scanned folder).
    _default_folder = "/Users/karanveersinghsidhu/LADy_Model_Eval_Results"
    _base_folder = sys.argv[1] if len(sys.argv) > 1 else _default_folder
    _save_folder = sys.argv[2] if len(sys.argv) > 2 else _base_folder
    main(_base_folder, _save_folder)
Loading