RAG_experiments/benchmark_queries.json at main · hugopalma17/RAG_experiments · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
{
  "version": 2,
  "description": "18 semantic benchmark queries for RAG pipeline evaluation -- designed for reasoning, synthesis, and inference over ~6500 AI/software job listings. No counting or aggregation queries.",
  "ground_truth_status": "pending",
  "queries": [
    {
      "id": 1,
      "category": "synthesis",
      "query": "What does a typical senior ML engineer role look like in terms of day-to-day responsibilities?",
      "tests": "Synthesize patterns across multiple job descriptions into a coherent picture",
      "ground_truth": []
    },
    {
      "id": 2,
      "category": "synthesis",
      "query": "What tech stack do companies building LLM-powered products typically require?",
      "tests": "Extract and combine technical requirements from LLM-related roles",
      "ground_truth": []
    },
    {
      "id": 3,
      "category": "synthesis",
      "query": "What does the interview process look like for AI engineering roles based on these listings?",
      "tests": "Pull and synthesize interview details scattered across descriptions",
      "ground_truth": []
    },
    {
      "id": 4,
      "category": "comparison",
      "query": "How do junior versus senior AI roles differ in what they expect candidates to know?",
      "tests": "Compare and contrast requirements across seniority levels",
      "ground_truth": []
    },
    {
      "id": 5,
      "category": "comparison",
      "query": "What is the difference between what startups and large companies look for in machine learning engineers?",
      "tests": "Distinguish company-stage signals and compare expectations",
      "ground_truth": []
    },
    {
      "id": 6,
      "category": "comparison",
      "query": "How do roles focused on building AI products from scratch differ from those integrating existing models or APIs?",
      "tests": "Probe semantic depth by distinguishing build-from-scratch roles from integrate-existing-models roles across descriptions",
      "ground_truth": []
    },
    {
      "id": 7,
      "category": "inference",
      "query": "Which roles seem to expect someone who can work independently with minimal supervision?",
      "tests": "Infer autonomy expectations from indirect language cues",
      "ground_truth": []
    },
    {
      "id": 8,
      "category": "inference",
      "query": "Based on the job descriptions, which roles are more research-oriented versus production engineering?",
      "tests": "Classify roles by inferred focus without explicit labels",
      "ground_truth": []
    },
    {
      "id": 9,
      "category": "inference",
      "query": "Which jobs sound like they want a full-stack engineer who also does ML, rather than a pure ML researcher?",
      "tests": "Infer hybrid role expectations from combined skill signals",
      "ground_truth": []
    },
    {
      "id": 10,
      "category": "pattern",
      "query": "What soft skills keep appearing across AI and ML engineering job descriptions?",
      "tests": "Identify recurring non-technical requirements across retrieved chunks",
      "ground_truth": []
    },
    {
      "id": 11,
      "category": "pattern",
      "query": "What tools and frameworks are most commonly mentioned alongside LLM or RAG work?",
      "tests": "Extract co-occurring technical terms in a specific subdomain",
      "ground_truth": []
    },
    {
      "id": 12,
      "category": "pattern",
      "query": "What benefits beyond salary do AI companies highlight to attract engineering candidates?",
      "tests": "Identify perks and cultural signals across multiple listings",
      "ground_truth": []
    },
    {
      "id": 13,
      "category": "nuanced-retrieval",
      "query": "Find roles where the focus is on data quality and pipeline reliability rather than model building",
      "tests": "Retrieve based on semantic intent, not keyword overlap with ML terms",
      "ground_truth": []
    },
    {
      "id": 14,
      "category": "nuanced-retrieval",
      "query": "Jobs that emphasize mentorship, career growth, or a strong engineering culture",
      "tests": "Retrieve on soft cultural signals buried in descriptions",
      "ground_truth": []
    },
    {
      "id": 15,
      "category": "nuanced-retrieval",
      "query": "Roles that involve deploying models to production and managing inference at scale",
      "tests": "Distinguish MLOps/deployment focus from training/research focus",
      "ground_truth": []
    },
    {
      "id": 16,
      "category": "analysis",
      "query": "Based on these job listings, what skills would you recommend someone learn to be competitive for AI engineering roles?",
      "tests": "LLM must reason about market signals and form a recommendation",
      "ground_truth": []
    },
    {
      "id": 17,
      "category": "analysis",
      "query": "Which job descriptions seem the most well-written and informative versus vague and generic?",
      "tests": "LLM judges content quality, which requires meta-reasoning about the text itself",
      "ground_truth": []
    },
    {
      "id": 18,
      "category": "analysis",
      "query": "Based on the requirements listed, which roles seem the hardest to fill and why?",
      "tests": "Infer hiring difficulty from requirement complexity and specificity",
      "ground_truth": []
    }
  ]
}