#!/usr/bin/env python3
"""
VANTA Research Reasoning Evaluation (VRRE)
A semantic understanding framework for evaluating LLM reasoning capabilities.
Focuses on meaning extraction rather than format compliance.
Features:
- Intelligent response parsing
- Semantic answer extraction
- Partial credit scoring
- Multi-domain reasoning assessment
- Comparative model analysis
Usage:
python vrre_eval.py
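    python vrre_eval.py --models model-a model-b --compare
    python vrre_eval.py --models model-a --quiet --output results.json

(Illustrative invocations: the flags are those defined by the argparse interface
in main(); the model names are placeholders for locally available Ollama models.)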
Author: VANTA Research
License: Apache 2.0
Website: https://vanta-research.ai
"""
import ollama
import re
import json
import time
from typing import List, Dict, Tuple, Any, Optional
from dataclasses import dataclass, asdict
import random
import argparse
import sys


@dataclass
class ReasoningTask:
    """Represents a single reasoning evaluation task"""
    question: str
    correct_answer: str
    explanation: str
    task_type: str  # "boolean", "mathematical", "logical", "reading_comprehension"
    difficulty: str  # "easy", "medium", "hard"
    id: Optional[str] = None

    def __post_init__(self):
        if self.id is None:
            self.id = f"{self.task_type}_{self.difficulty}_{hash(self.question) % 10000}"


class SemanticAnswerExtractor:
    """Handles intelligent extraction of answers from natural language responses"""
    def __init__(self):
        self.boolean_patterns = {
            'yes': [
                r'\byes\b', r'\btrue\b', r'\bcorrect\b', r'\bright\b', r'\bvalid\b',
                r'we can conclude', r'it is valid', r'this is true', r'that\'s right',
                r'indeed', r'certainly', r'definitely', r'absolutely'
            ],
            'no': [
                r'\bno\b', r'\bfalse\b', r'\bincorrect\b', r'\bwrong\b', r'\binvalid\b',
                r'cannot conclude', r'not valid', r'this is false', r'fallacy',
                r'impossible', r'never', r'not true', r'doesn\'t follow'
            ]
        }
        self.reasoning_indicators = [
            'because', 'therefore', 'since', 'due to', 'reasoning', 'logic',
            'however', 'but', 'although', 'while', 'whereas', 'consequently',
            'thus', 'hence', 'so', 'as a result', 'given that'
        ]

    def extract_boolean_answer(self, response: str) -> Tuple[str, float]:
        """Extract yes/no answers with confidence scoring"""
        response_lower = response.lower()
        yes_score = sum(1 for pattern in self.boolean_patterns['yes']
                        if re.search(pattern, response_lower))
        no_score = sum(1 for pattern in self.boolean_patterns['no']
                       if re.search(pattern, response_lower))
        # Boost confidence if reasoning indicators are present
        reasoning_boost = 0.1 * sum(1 for indicator in self.reasoning_indicators
                                    if indicator in response_lower)
        if yes_score > no_score:
            confidence = min(0.95, 0.5 + (yes_score - no_score) * 0.15 + reasoning_boost)
            return "yes", confidence
        elif no_score > yes_score:
            confidence = min(0.95, 0.5 + (no_score - yes_score) * 0.15 + reasoning_boost)
            return "no", confidence
        else:
            # Look for implicit agreement/disagreement
            if any(word in response_lower for word in ['agree', 'support', 'confirm']):
                return "yes", 0.3
            elif any(word in response_lower for word in ['disagree', 'reject', 'deny']):
                return "no", 0.3
            return "unclear", 0.1

    def extract_mathematical_answer(self, response: str) -> Tuple[str, float]:
        """Extract numerical answers from mathematical reasoning"""
        # Look for numbers in various formats
        number_patterns = [
            r'\b(\d+(?:\.\d+)?)\b',  # Standard numbers
            r'(\d+(?:,\d{3})*(?:\.\d+)?)',  # Numbers with commas
            r'(\d+/\d+)',  # Fractions
        ]
        all_numbers = []
        for pattern in number_patterns:
            all_numbers.extend(re.findall(pattern, response))
        if all_numbers:
            # Use the last number mentioned (often the final answer)
            final_answer = all_numbers[-1]
            # Higher confidence if we see calculation indicators
            calc_indicators = ['equals', '=', 'answer is', 'result is', 'total is']
            confidence = 0.8
            if any(indicator in response.lower() for indicator in calc_indicators):
                confidence = 0.9
            return final_answer, confidence
        return "unclear", 0.1

    def extract_logical_answer(self, response: str) -> Tuple[str, float]:
        """Extract answers for logical reasoning tasks"""
        response_lower = response.lower()
        # Look for validity indicators; match on word boundaries so that e.g.
        # 'valid' does not also count every occurrence of 'invalid'
        valid_patterns = ['valid', 'sound', 'correct', 'follows', 'logical']
        invalid_patterns = ['invalid', 'unsound', 'incorrect', 'fallacy', 'illogical']
        valid_score = sum(1 for pattern in valid_patterns
                          if re.search(rf'\b{pattern}\b', response_lower))
        invalid_score = sum(1 for pattern in invalid_patterns
                            if re.search(rf'\b{pattern}\b', response_lower))
        if valid_score > invalid_score:
            return "valid", min(0.9, 0.6 + valid_score * 0.1)
        elif invalid_score > valid_score:
            return "invalid", min(0.9, 0.6 + invalid_score * 0.1)
        return "unclear", 0.1

    def extract_answer(self, response: str, task_type: str) -> Tuple[str, float]:
        """Main extraction method that routes to appropriate extractor"""
        response = response.strip()
        if task_type == "boolean":
            return self.extract_boolean_answer(response)
        elif task_type == "mathematical":
            return self.extract_mathematical_answer(response)
        elif task_type == "logical":
            return self.extract_logical_answer(response)
        else:
            # Default to boolean extraction for unknown types
            return self.extract_boolean_answer(response)


class VRREvaluator:
    """Main evaluation class for VANTA Research Reasoning Evaluation"""
    def __init__(self, model_name: str = "apollo-reasoning-enhanced", verbose: bool = True):
        self.model_name = model_name
        self.verbose = verbose
        self.extractor = SemanticAnswerExtractor()
        self.results = []

    def create_default_test_suite(self) -> List[ReasoningTask]:
        """Create the standard reasoning test suite"""
        return [
            # Boolean Logic - Logical Fallacies
            ReasoningTask(
                "All roses are flowers. Some flowers are red. Can we conclude that some roses are red?",
                "no",
                "Fallacy of the undistributed middle - the red flowers need not include any roses, so the conclusion does not follow",
                "boolean",
                "medium",
                "logical_fallacy_1"
            ),
            ReasoningTask(
                "All cats are mammals. Fluffy is a cat. Is Fluffy a mammal?",
                "yes",
                "Valid syllogism using modus ponens - if all cats are mammals and Fluffy is a cat, then Fluffy must be a mammal",
                "boolean",
                "easy",
                "valid_syllogism_1"
            ),
            ReasoningTask(
                "If it rains, the ground gets wet. The ground is wet. Did it rain?",
                "no",
                "Affirming the consequent fallacy - wet ground could have other causes (sprinklers, flooding, etc.)",
                "boolean",
                "medium",
                "logical_fallacy_2"
            ),
            # Reading Comprehension
            ReasoningTask(
                "Passage: The Eiffel Tower was built in 1889 for the World's Fair in Paris. It was designed by Gustave Eiffel and stands 324 meters tall. Question: Was the Eiffel Tower built in the 19th century?",
                "yes",
                "1889 falls within the 19th century (1801-1900)",
                "boolean",
                "easy",
                "reading_comp_1"
            ),
            ReasoningTask(
                "Passage: Photosynthesis is the process by which plants convert sunlight into energy. This process requires chlorophyll, water, and carbon dioxide. Question: Can photosynthesis occur without water?",
                "no",
                "Water is explicitly listed as one of the required components",
                "boolean",
                "easy",
                "reading_comp_2"
            ),
            # Mathematical Reasoning
            ReasoningTask(
                "If a train travels 60 miles per hour for 2.5 hours, how far does it travel?",
                "150",
                "Distance = Speed × Time = 60 mph × 2.5 hours = 150 miles",
                "mathematical",
                "easy",
                "math_distance_1"
            ),
            ReasoningTask(
                "A rectangle has length 8 and width 5. What is its area?",
                "40",
                "Area of rectangle = Length × Width = 8 × 5 = 40 square units",
                "mathematical",
                "easy",
                "math_area_1"
            ),
            # Advanced Logic
            ReasoningTask(
                "All birds can fly. Penguins are birds. Can penguins fly?",
                "no",
                "The premise 'all birds can fly' is factually incorrect - this tests handling of false premises",
                "boolean",
                "hard",
                "false_premise_1"
            ),
            ReasoningTask(
                "Either John is at home or at work. John is not at home. Where is John?",
                "work",
                "Disjunctive syllogism - if A or B, and not A, then B",
                "logical",
                "medium",
                "disjunctive_syllogism_1"
            ),
        ]
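
    # A custom suite can also be passed to run_evaluation() instead of the defaults.
    # Illustrative example (hypothetical task, not part of the shipped suite):
    #     evaluator.run_evaluation([ReasoningTask(
    #         "What is 7 + 5?", "12", "Basic addition", "mathematical", "easy")])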

    def evaluate_task(self, task: ReasoningTask) -> Dict[str, Any]:
        """Evaluate a single reasoning task"""
        if self.verbose:
            print(f" Evaluating: {task.id}")
        try:
            # Query the model with optimized parameters
            response = ollama.generate(
                model=self.model_name,
                prompt=f"{task.question}\n\nPlease provide your answer and explain your reasoning.",
                options={
                    'temperature': 0,
                    'num_predict': 200,
                    'top_p': 0.9,
                    'top_k': 50
                }
            )
            response_text = response['response']
            extracted_answer, confidence = self.extractor.extract_answer(response_text, task.task_type)
            # Normalize answers for comparison
            correct_answer_norm = task.correct_answer.lower().strip()
            extracted_answer_norm = extracted_answer.lower().strip()
            # Check correctness
            correct = (extracted_answer_norm == correct_answer_norm)
            # Calculate score with partial credit
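            # Scoring scheme (summary of the branches below):
            #   1.0 - extracted answer matches the expected answer
            #   0.3 - answer unclear, but reasoning indicators present
            #   0.1 - answer unclear, no reasoning indicators
            #   0.2 - wrong answer, but reasoning indicators present
            #   0.0 - wrong answer, no reasoning indicators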
            if correct:
                score = 1.0
            elif extracted_answer == "unclear":
                # Give some credit if reasoning indicators are present
                reasoning_present = any(
                    indicator in response_text.lower()
                    for indicator in self.extractor.reasoning_indicators
                )
                score = 0.3 if reasoning_present else 0.1
            else:
                # Wrong answer but check for reasoning quality
                reasoning_present = any(
                    indicator in response_text.lower()
                    for indicator in self.extractor.reasoning_indicators
                )
                score = 0.2 if reasoning_present else 0.0
            result = {
                'task_id': task.id,
                'task': asdict(task),
                'response': response_text,
                'extracted_answer': extracted_answer,
                'correct_answer': task.correct_answer,
                'correct': correct,
                'score': score,
                'confidence': confidence,
                'response_time': response.get('eval_duration', 0) / 1e9,
                'total_tokens': response.get('eval_count', 0)
            }
            return result
        except Exception as e:
            if self.verbose:
                print(f" ERROR: {str(e)}")
            return {
                'task_id': task.id,
                'task': asdict(task),
                'error': str(e),
                'score': 0,
                'correct': False
            }

    def run_evaluation(self, tasks: Optional[List[ReasoningTask]] = None) -> Dict[str, Any]:
        """Run complete evaluation suite"""
        if tasks is None:
            tasks = self.create_default_test_suite()
        if self.verbose:
            print(f"\nStarting VANTA Research Reasoning Evaluation (VRRE)")
            print(f" Model: {self.model_name}")
            print(f" Tasks: {len(tasks)}")
            print("=" * 60)
        results = []
        for i, task in enumerate(tasks, 1):
            if self.verbose:
                print(f"\n📝 Task {i}/{len(tasks)} ({task.task_type}, {task.difficulty})")
                print(f" {task.question[:80]}{'...' if len(task.question) > 80 else ''}")
            result = self.evaluate_task(task)
            results.append(result)
            if 'error' not in result and self.verbose:
                status = "✅" if result['correct'] else "❌"
                print(f" {status} Expected: {task.correct_answer} | Got: {result['extracted_answer']} | Score: {result['score']:.2f}")
            # Rate limiting to be nice to the model
            time.sleep(0.5)
        # Calculate comprehensive statistics
        stats = self._calculate_statistics(results, tasks)
        return stats

    def _calculate_statistics(self, results: List[Dict], tasks: List[ReasoningTask]) -> Dict[str, Any]:
        """Calculate comprehensive evaluation statistics"""
        # Basic stats
        valid_results = [r for r in results if 'score' in r]
        scores = [r['score'] for r in valid_results]
        correct_count = sum(1 for r in valid_results if r.get('correct', False))
        stats = {
            'model_name': self.model_name,
            'total_tasks': len(tasks),
            'valid_results': len(valid_results),
            'correct_count': correct_count,
            'accuracy': correct_count / len(valid_results) if valid_results else 0,
            'average_score': sum(scores) / len(scores) if scores else 0,
            'score_distribution': {
                'perfect': sum(1 for s in scores if s == 1.0),
                'partial': sum(1 for s in scores if 0 < s < 1.0),
                'zero': sum(1 for s in scores if s == 0)
            },
            'by_type': {},
            'by_difficulty': {},
            'confidence_stats': {},
            'results': results
        }
        # Group by task type
        for result in valid_results:
            task = result['task']
            task_type = task['task_type']
            if task_type not in stats['by_type']:
                stats['by_type'][task_type] = {
                    'correct': 0, 'total': 0, 'scores': [], 'confidences': []
                }
            stats['by_type'][task_type]['total'] += 1
            if result.get('correct', False):
                stats['by_type'][task_type]['correct'] += 1
            stats['by_type'][task_type]['scores'].append(result.get('score', 0))
            stats['by_type'][task_type]['confidences'].append(result.get('confidence', 0))
        # Group by difficulty
        for result in valid_results:
            task = result['task']
            difficulty = task['difficulty']
            if difficulty not in stats['by_difficulty']:
                stats['by_difficulty'][difficulty] = {
                    'correct': 0, 'total': 0, 'scores': [], 'confidences': []
                }
            stats['by_difficulty'][difficulty]['total'] += 1
            if result.get('correct', False):
                stats['by_difficulty'][difficulty]['correct'] += 1
            stats['by_difficulty'][difficulty]['scores'].append(result.get('score', 0))
            stats['by_difficulty'][difficulty]['confidences'].append(result.get('confidence', 0))
        # Calculate averages for grouped stats
        for group_stats in [stats['by_type'], stats['by_difficulty']]:
            for key, data in group_stats.items():
                data['accuracy'] = data['correct'] / data['total'] if data['total'] > 0 else 0
                data['avg_score'] = sum(data['scores']) / len(data['scores']) if data['scores'] else 0
                data['avg_confidence'] = sum(data['confidences']) / len(data['confidences']) if data['confidences'] else 0
        return stats

    def print_summary(self, stats: Dict[str, Any]):
        """Print comprehensive evaluation summary"""
        print("\n" + "=" * 60)
        print("🏆 VANTA RESEARCH REASONING EVALUATION SUMMARY")
        print("=" * 60)
        print(f"\n📊 Overall Performance ({stats['model_name']}):")
        print(f" Accuracy: {stats['accuracy']:.1%} ({stats['correct_count']}/{stats['valid_results']})")
        print(f" Average Score: {stats['average_score']:.3f}")
        print(f"\n📈 Score Distribution:")
        dist = stats['score_distribution']
        print(f" Perfect (1.0): {dist['perfect']} tasks")
        print(f" Partial (>0): {dist['partial']} tasks")
        print(f" Zero (0.0): {dist['zero']} tasks")
        print(f"\n📋 By Task Type:")
        for task_type, data in stats['by_type'].items():
            print(f" {task_type.title()}: {data['accuracy']:.1%} ({data['correct']}/{data['total']}) | "
                  f"Avg Score: {data['avg_score']:.3f} | Confidence: {data['avg_confidence']:.3f}")
        print(f"\n🎯 By Difficulty:")
        for difficulty, data in stats['by_difficulty'].items():
            print(f" {difficulty.title()}: {data['accuracy']:.1%} ({data['correct']}/{data['total']}) | "
                  f"Avg Score: {data['avg_score']:.3f} | Confidence: {data['avg_confidence']:.3f}")
        print(f"\n💡 VRRE Insights:")
        print(f" • Semantic understanding captures reasoning beyond format compliance")
        print(f" • Partial credit rewards reasoning process even with wrong answers")
        print(f" • Confidence scores indicate answer extraction reliability")
        print(f" • Task categorization reveals specific reasoning strengths/weaknesses")


def compare_models(models: List[str], tasks: Optional[List[ReasoningTask]] = None, verbose: bool = True) -> Dict[str, Any]:
    """Compare multiple models on the same reasoning tasks"""
    all_results = {}
    for model in models:
        if verbose:
            print(f"\n🔬 Evaluating Model: {model}")
        try:
            evaluator = VRREvaluator(model, verbose=verbose)
            stats = evaluator.run_evaluation(tasks)
            evaluator.print_summary(stats)
            all_results[model] = stats
        except Exception as e:
            print(f"❌ Failed to evaluate {model}: {e}")
            all_results[model] = {'error': str(e)}
    # Comparative summary
    if len(all_results) > 1 and verbose:
        print("\n" + "=" * 60)
        print("🔬 COMPARATIVE ANALYSIS")
        print("=" * 60)
        for model, stats in all_results.items():
            if 'error' not in stats:
                print(f"\n{model}:")
                print(f" Overall: {stats['accuracy']:.1%} accuracy, {stats['average_score']:.3f} avg score")
                # Show best performing categories
                best_type = max(stats['by_type'].items(), key=lambda x: x[1]['accuracy'])
                print(f" Best Category: {best_type[0]} ({best_type[1]['accuracy']:.1%})")
    return all_results


def save_results(results: Dict[str, Any], filename: Optional[str] = None) -> str:
    """Save evaluation results to JSON file"""
    if filename is None:
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        filename = f"vrre_eval_{timestamp}.json"
    with open(filename, 'w') as f:
        json.dump(results, f, indent=2, default=str)
    return filename


def main():
    """Main CLI interface"""
    parser = argparse.ArgumentParser(description="VANTA Research Reasoning Evaluation (VRRE)")
    parser.add_argument("--models", nargs='+', default=["apollo-reasoning-enhanced"],
                        help="Models to evaluate")
    parser.add_argument("--output", type=str, help="Output JSON filename")
    parser.add_argument("--quiet", action="store_true", help="Suppress verbose output")
    parser.add_argument("--compare", action="store_true", help="Run comparative analysis")
    args = parser.parse_args()
    verbose = not args.quiet
    if args.compare or len(args.models) > 1:
        # Comparative evaluation
        results = compare_models(args.models, verbose=verbose)
    else:
        # Single model evaluation
        model = args.models[0]
        evaluator = VRREvaluator(model, verbose=verbose)
        results = {model: evaluator.run_evaluation()}
        if verbose:
            evaluator.print_summary(results[model])
    # Save results
    filename = save_results(results, args.output)
    if verbose:
        print(f"\n💾 Results saved to: {filename}")


if __name__ == "__main__":
    main()