fani-lab · sidhu66 · Jun 4, 2025 · Jul 5, 2025 · Aug 1, 2025 · Aug 1, 2025
diff --git a/src/config.yaml b/src/config.yaml
@@ -4,6 +4,7 @@ default:
 cmd:
   - prep
   - llmargs
+  - modelEval
 
 prep:
   doctype: snt  # 'rvw' for single review, but 'snt' for segmented subreviews
@@ -34,9 +35,12 @@ args: # required for 'prep' step
 llmargs:
   use_api: true
   api_key: lm-studio
-  api_base: http://localhost:1234/v1
+  api_base: http://10.60.33.10:8000/v1 #http://localhost:1234/v1
   model_name: deepseek-r1-distill-qwen-32b
   temperature: 0.5
   max_tokens: 1024
   output: ../output/AspectAdded/semeval-agg/aspectAdded.pkl
   top_k_aspects: 1
+  outputEval: ../output/modelEval/
+
+
diff --git a/src/llm/aspect_extraction_pipeline.py b/src/llm/aspect_extraction_pipeline.py
@@ -1,14 +1,15 @@
 from llm.llm_config_handler import LLMconfig, LLMHandler
 from cmn.review import Review  
 from llm.prompt_builder import PromptBuilder
-
+from llm.trustworthiness_check import TrustWorthiness_PromptBuilder, model_Evaluator
 from tqdm import tqdm
 import json
 import re
 from omegaconf import DictConfig
 import pandas as pd
 import os
 import string
+import random
 
 
 
@@ -35,7 +36,6 @@ def _init_llm_handler(self):
     def find_aspect_indices(aspect: str, sentence_tokens) :
         aspect_tokens = aspect.lower().split()
         tokens = [token.lower().strip(string.punctuation) for token in sentence_tokens]
-
         for i in range(len(tokens) - len(aspect_tokens) + 1):
             if tokens[i:i + len(aspect_tokens)] == aspect_tokens: return list(range(i, i + len(aspect_tokens)))
 
@@ -63,13 +63,12 @@ def process_reviews(self, reviews: list):
                         if "aspect" in aspect_data and aspect_data["aspect"]:
                             valid_json_found = True
                             break
-                    except json.JSONDecodeError:
-                        continue
 
-                if valid_json_found:
-                    break
-                else:
-                    print(f"Invalid or no valid JSON with 'aspect' found. Attempt {attempt + 1} of {max_retries}")
+                    except json.JSONDecodeError: continue
+
+                if valid_json_found: break
+                else: print(f"Invalid or no valid JSON with 'aspect' found. Attempt {attempt + 1} of {max_retries}")
+
 
             if not matches: 
                 print("No JSON object found in response") 
@@ -127,12 +126,129 @@ def save_to_pickle(self, reviews):
         print(f'\nSaving processed Review.pickle file {output_dir}/{filename}...')
         pd.to_pickle(reviews, os.path.join(output_dir, filename))
 
+# Model Evaluation Code
+
+
+    def evaluate_llm_trustworthiness(self, reviews: list):
+        """Ask the LLM whether an aspect belongs to each review and save the verdicts.
+
+        For every review that has aspects (``review.aspects``, or when that is missing
+        or empty, the result of ``extract_aspects_from_aos``) one of its own aspects is
+        queried with expected answer "Yes"; then, if ``_get_wrong_aspect`` returns one,
+        an aspect taken from another review is queried with expected answer "No".
+        All result rows are passed to ``save_to_excel``.
+
+        Args:
+            reviews: review objects; ``id`` and ``sentences`` are read here.
+
+        Returns:
+            True after the rows have been handed to ``save_to_excel``.
+        """
+        prompt_builder = TrustWorthiness_PromptBuilder()
+        results = []
+
+        def ask(review, review_text, aspect, expected):
+            # One LLM round-trip; returns the result row for this (review, aspect) pair.
+            prompt = prompt_builder.build_prompt(vars(review), aspect)
+            prediction = self._parse_llm_answer(self.llm_handler.get_response(prompt))
+            return {
+                "review_id": review.id,
+                "review_text": review_text,
+                "aspect": aspect,
+                "expected_answer": expected,
+                "model_prediction": prediction,
+                "is_correct": prediction == expected,
+            }
+
+        for i, review in enumerate(tqdm(reviews)):
+            # The per-review debug dumps were removed: they printed every review's
+            # full state to stdout and broke up the tqdm progress bar.
+            if not getattr(review, 'aspects', None):
+                review.aspects = self.extract_aspects_from_aos(review)
+                if not review.aspects: continue
+
+            review_text = ' '.join(review.sentences[0])
+
+            # Positive case: an aspect that belongs to this review.
+            results.append(ask(review, review_text, random.choice(review.aspects), "Yes"))
+
+            # Negative case: an aspect borrowed from a different review, when one exists.
+            wrong_aspect = self._get_wrong_aspect(reviews, i)
+            if wrong_aspect:
+                results.append(ask(review, review_text, wrong_aspect, "No"))
+
+        self.save_to_excel(results)
+        return True
+
+
+    def accuracy_Evaluator(self):
+        """Run ``model_Evaluator().evaluator`` on the ``cfg.llmargs.outputEval`` directory, if it exists."""
+        eval_dir = self.cfg.llmargs.outputEval
+        scorer = model_Evaluator()
+        if not os.path.isdir(eval_dir):  # isdir() is already False for a missing path
+            print("The specified path does not exist or is not a directory.")
+            return
+        scorer.evaluator(eval_dir)
+
+    def save_to_excel(self, results):
+        """Write the evaluation rows to ``<data base name>_trustworthiness_eval.xlsx``.
+
+        ``cfg.llmargs.outputEval`` is used as the target directory itself, as in
+        ``accuracy_Evaluator``, so it works with or without a trailing slash.
+        """
+        output_dir = os.path.normpath(self.cfg.llmargs.outputEval)
+        os.makedirs(output_dir, exist_ok=True)
+        base_name = os.path.splitext(os.path.basename(self.cfg.args.data))[0]
+        filename = f"{base_name}_trustworthiness_eval.xlsx"
+        pd.DataFrame(results).to_excel(os.path.join(output_dir, filename), index=False)
+        print(f'\nSaved processed Evaluation  file {output_dir}/{filename}...')
+
+
+    def _parse_llm_answer(self, response: str) -> str:
+        """Return "Yes"/"No" from the last ``Answer: {"Answer": "..."}`` in ``response``, else "Invalid".
+
+        The verdict is taken from the regex capture: the match is case-insensitive, whereas a
+        JSON key lookup is not and wrongly rejected replies such as ``{"answer": "yes"}``.
+        """
+        if not isinstance(response, str): return "Invalid"  # non-text reply, e.g. None
+        verdicts = re.findall(r'Answer\s*:\s*{\s*"Answer"\s*:\s*"(Yes|No)"\s*}', response, re.IGNORECASE)
+        return verdicts[-1].capitalize() if verdicts else "Invalid"
+
+    def _get_wrong_aspect(self, reviews, exclude_index):
+        """Return a random aspect of another review that review ``exclude_index`` lacks, else None."""
+        own = [a for k, r in enumerate(reviews) if k == exclude_index for a in (getattr(r, 'aspects', None) or [])]  # a shared aspect is not a valid "No" case
+        pools = [p for p in ([a for a in (getattr(r, 'aspects', None) or []) if a not in own] for k, r in enumerate(reviews) if k != exclude_index) if p]
+        return random.choice(random.choice(pools)) if pools else None
+    def extract_aspects_from_aos(self, review):
+        """Return the lower-cased aspect terms that ``review.aos`` points at.
+
+        The first element of each entry is read as token indices into the flattened
+        ``review.sentences``; empty index lists and ones starting with -1 are skipped.
+        """
+        tokens = [word for sent in review.sentences for word in sent]  # NOTE(review): assumes aos indices are flat, not per-sentence - confirm
+        aspects = []
+        for indices in (info[0] for ao_list in review.aos for info in ao_list):
+            if not indices or indices[0] == -1: continue
+            term = ' '.join(tokens[i] for i in indices if 0 <= i < len(tokens)).strip()  # 0 <= i: negative indices would wrap to the end
+            if term: aspects.append(term.lower())
+        return aspects
 
 # Entry function
 def mainllm(cfg: DictConfig, reviews: list):
     processor = LLMReviewProcessor(cfg)
     return processor.process_reviews(reviews)
 
+def llmEval(cfg: DictConfig, reviews: list):
+    """Run the trustworthiness evaluation, then the accuracy evaluator if that step returned a truthy value."""
+    evaluator = LLMReviewProcessor(cfg)
+    if evaluator.evaluate_llm_trustworthiness(reviews):
+        evaluator.accuracy_Evaluator()
+
+
+
+
+
+
 
 
 

diff --git a/src/llm/llm_config_handler.py b/src/llm/llm_config_handler.py
@@ -19,16 +19,14 @@ def __init__(self, use_api=True, api_key=None, api_base=None, temperature=0.5, m
 
 class LLMHandler:
 
-
     def __init__(self, config):
         self.config = config
         self.generator = None
 
         if self.config.use_api:
             openai.api_key = self.config.api_key
-            if self.config.api_base:
-                openai.api_base = self.config.api_base
-        else:
+            if self.config.api_base: openai.api_base = self.config.api_base
+        else:   # Not Using Model Locally in The code Yet # So this part of Code is not being used
             self.tokenizer = AutoTokenizer.from_pretrained(self.config.local_model_path)
             self.model = AutoModelForCausalLM.from_pretrained(self.config.local_model_path)
             self.generator = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)
@@ -53,9 +51,7 @@ def get_response(self, prompt: str) -> str:
             else:
                 #result = self.generator(prompt, max_length=max_tokens + len(prompt.split()), do_sample=True)
                 max_len = min(self.config.max_tokens + len(prompt.split()), 1024)
-
                 result = self.generator(prompt, max_length=max_len, truncation=True, do_sample=True)
-
                 return result[0]['generated_text'].strip()
 
         except Exception as e:

diff --git a/src/llm/summarize_trustworthiness_results.py b/src/llm/summarize_trustworthiness_results.py
@@ -0,0 +1,85 @@
+import os
+import pandas as pd
+
+def collect_excel_files(base_folder):
+    """Return the paths of all files ending in ".xlsx" found anywhere under base_folder."""
+    return [
+        os.path.join(folder, name)
+        for folder, _subdirs, names in os.walk(base_folder)
+        for name in names if name.endswith(".xlsx")
+    ]
+
+def group_by_top_folder(files, base_folder):
+    """Map each first path component below base_folder to its files (a root-level file is keyed by its own name)."""
+    buckets = {}
+    for path in files:
+        relative = os.path.relpath(path, base_folder)
+        top = relative.partition(os.sep)[0]  # first component only
+        buckets.setdefault(top, []).append(path)
+    return buckets
+
+def summarize_files(file_list):
+    """Stack the 'Evaluation_summary' sheets of the given workbooks; returns an empty DataFrame if none is readable."""
+    frames = []  # a list, not a dict keyed by base name: same-named files in different folders must all be kept
+    for path in file_list:
+        try:
+            frame = pd.read_excel(path, sheet_name="Evaluation_summary")
+            frame.insert(0, "File_Name", os.path.basename(path))  # Add filename to each row
+            frames.append(frame)
+        except Exception as e:  # best effort: one unreadable workbook must not abort the summary
+            print(f"Skipping {path}: {e}")
+    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
+
+
+
+def generate_overall_metrics(summary_file_path):
+    """Add an 'Overall_Metrics' sheet with macro/micro accuracy to a summary workbook.
+
+    The data comes from sheet 'Sheet1', or from the first sheet when there is none, and must
+    hold the columns "Accuracy", "Correct Predictions", "Total Cases", "Incorrect Predictions".
+    NOTE(review): the macro value is labelled a percentage - confirm "Accuracy" is stored as one.
+    """
+    sheets = pd.read_excel(summary_file_path, sheet_name=None)  # all sheets, looked up by name
+    data = sheets['Sheet1'] if 'Sheet1' in sheets else next(iter(sheets.values()))
+    macro = data["Accuracy"].mean()  # unweighted mean over the rows
+    correct = data["Correct Predictions"].sum()
+    cases = data["Total Cases"].sum()
+    micro = (correct / cases) * 100 if cases > 0 else 0  # pooled over all cases
+
+    metrics = pd.Series({
+        "Macro Accuracy (%)": round(macro, 2),
+        "Micro Accuracy (%)": round(micro, 2),
+        "Total Files": len(data),
+        "Total Cases": cases,
+        "Total Correct Predictions": correct,
+        "Total Incorrect Predictions": data["Incorrect Predictions"].sum()
+    })
+    # Append mode keeps the existing sheets; an earlier 'Overall_Metrics' sheet is replaced.
+    with pd.ExcelWriter(summary_file_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
+        metrics.to_frame(name='Value').to_excel(writer, sheet_name='Overall_Metrics')
+    print(f"Overall metrics added to sheet 'Overall_Metrics' in {summary_file_path}")
+
+def main(base_folder, save_folder):
+    """Write one summary workbook per top-level group of base_folder, plus an overall one, into save_folder."""
+    os.makedirs(save_folder, exist_ok=True)  # Ensure the save folder exists
+
+    grouped_files = group_by_top_folder(collect_excel_files(base_folder), base_folder)
+    collected = []
+    for group, files in grouped_files.items():
+        print(f"Processing group: {group} with {len(files)} files")
+        group_summary = summarize_files(files)
+        if group_summary.empty: continue  # no readable summary sheet in this group
+        group_summary.insert(0, "Dataset_Group", group)
+        group_summary.to_excel(os.path.join(save_folder, f"{group}_trustworthiness_summary.xlsx"), index=False)
+        collected.append(group_summary)
+
+    if not collected:
+        print("No valid summaries found.")
+        return
+    file_path = os.path.join(save_folder, "overall_trustworthiness_summary.xlsx")
+    pd.concat(collected, ignore_index=True).to_excel(file_path, index=False)
+    generate_overall_metrics(file_path)
+    print("All summaries generated successfully.")
+
+if __name__ == "__main__":  # run only as a script; importing the module must not start a summary run
+    main("/Users/karanveersinghsidhu/LADy_Model_Eval_Results", "/Users/karanveersinghsidhu/LADy_Model_Eval_Results")