-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathupload.js
More file actions
283 lines (238 loc) · 8.56 KB
/
upload.js
File metadata and controls
283 lines (238 loc) · 8.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
#!/usr/bin/env node
const fs = require('fs');
const path = require('path');
const crypto = require('crypto');
const readline = require('readline');
const { getSupabaseClient } = require('./db_utils');
/**
 * Read the supported flags from process.argv.
 *
 * Recognised flags:
 *   -h / --help          print usage via showHelpAndExit()
 *   -f / --file <path>   stored as `filePath`
 *   -s / --set <name>    stored as `questionSet`
 *
 * Only the first occurrence of a flag is considered, and a flag that is the
 * last token (no value after it) is ignored.
 *
 * @returns {{filePath?: string, questionSet?: string}} Options that were supplied
 */
function parseCommandLineArgs() {
  const argv = process.argv.slice(2);

  if (argv.some((token) => token === '-h' || token === '--help')) {
    showHelpAndExit();
  }

  // [option key, short flag, long flag]
  const valueFlags = [
    ['filePath', '-f', '--file'],
    ['questionSet', '-s', '--set'],
  ];

  const parsed = {};
  for (const [optionKey, shortFlag, longFlag] of valueFlags) {
    const flagAt = argv.findIndex((token) => token === shortFlag || token === longFlag);
    const valueAt = flagAt + 1;
    if (flagAt !== -1 && valueAt < argv.length) {
      parsed[optionKey] = argv[valueAt];
    }
  }

  return parsed;
}
/**
 * Print the command line usage text to stdout, then terminate the process
 * with exit code 0.
 */
function showHelpAndExit() {
  const helpText = `
Usage: node upload.js [options]
Options:
-f, --file <path> Path to the questions file (JSONL or JSON)
-s, --set <name> Question set name
-h, --help Show this help message
Examples:
node upload.js -f ./questions.json -s medQA
node upload.js --file ./step1.jsonl --set step1
Features:
- Preserves additional fields from input files in the 'other' column
- Captures all unmapped fields in the 'overflow' column
- Stores file metadata in the 'extraJ' column
If no arguments are provided, the script will run in interactive mode.
`;

  console.log(helpText);
  process.exit(0);
}
// Shared readline interface for interactive mode; stays undefined until
// createInterface() is first called.
let rl;

/**
 * Return the shared readline interface bound to stdin/stdout, creating it on
 * the first call and reusing the same instance afterwards.
 *
 * @returns {readline.Interface} The shared interface
 */
function createInterface() {
  if (rl) {
    return rl;
  }

  const streams = { input: process.stdin, output: process.stdout };
  rl = readline.createInterface(streams);
  return rl;
}
/**
 * Interactive mode: ask for the questions file path and the question set
 * name, then run the upload with those answers.
 *
 * An upload failure is reported on stderr and the returned promise still
 * resolves; it does not reject for upload errors.
 *
 * @returns {Promise<void>} Resolves once the upload attempt has finished
 */
async function uploadQuestionsPrompt() {
  const prompt = createInterface();

  // Run the upload and report (rather than propagate) any failure.
  const attemptUpload = async (filePath, questionSet) => {
    try {
      await uploadQuestions(filePath, questionSet);
    } catch (error) {
      console.error('Error uploading questions:', error.message);
    }
  };

  return new Promise((done) => {
    // The second question is registered synchronously inside the first
    // callback so that no input line arrives while nothing is listening.
    prompt.question('\nEnter the path to the questions file (JSONL or JSON): ', (filePath) => {
      prompt.question('Enter the question set name: ', (questionSet) => {
        attemptUpload(filePath, questionSet).then(() => done());
      });
    });
  });
}
// Input fields that are mapped onto dedicated columns of the inserted row.
// Every other field found on an input record is collected into `overflow`.
const KNOWN_QUESTION_FIELDS = new Set([
  'question',
  'options',
  'answer',
  'answer_idx',
  'meta_info',
  'other',
]);

/**
 * Parse JSONL text (one JSON value per line), skipping blank lines.
 *
 * @param {string} text - Raw JSONL content
 * @returns {Array} Parsed values, in file order
 * @throws {SyntaxError} When a line is not valid JSON; the message is
 *   prefixed with the 1-based line number of the offending line
 */
function parseJsonLines(text) {
  const records = [];
  text.split('\n').forEach((line, index) => {
    if (!line.trim()) {
      return;
    }
    try {
      records.push(JSON.parse(line));
    } catch (error) {
      // Keep the original error object (type and stack) but say where it happened.
      error.message = `Invalid JSON on line ${index + 1}: ${error.message}`;
      throw error;
    }
  });
  return records;
}

/**
 * Convert one input record into the row object that is inserted.
 *
 * @param {Object} q - Input record read from the questions file
 * @param {string} questionSet - Question set name stored on the row
 * @param {Object} fileMetadata - Source file metadata stored in `extraJ`
 * @returns {Object} Row ready for insertion
 */
function buildQuestionRow(q, questionSet, fileMetadata) {
  // SHA-256 over question text + serialized options + answer, used to detect duplicates.
  const questionData = q.question + JSON.stringify(q.options) + q.answer;
  const questionHash = crypto.createHash('sha256').update(questionData).digest('hex');

  const row = {
    question_set: questionSet,
    question: q.question,
    // Options may arrive already parsed or as a JSON string.
    options: typeof q.options === 'object' ? q.options : JSON.parse(q.options),
    answer: q.answer,
    answer_idx: q.answer_idx,
    question_hash: questionHash,
    meta_info: q.meta_info || null,
    answer_count: 0,
    extraJ: fileMetadata, // Passed as an object, not stringified
  };

  // 'other' is optional and, like options, may be an object or a JSON string.
  if (q.other) {
    row.other = typeof q.other === 'object' ? q.other : JSON.parse(q.other);
  }

  // Everything that has no dedicated column is preserved in `overflow`.
  const overflowFields = {};
  for (const key of Object.keys(q)) {
    if (!KNOWN_QUESTION_FIELDS.has(key)) {
      overflowFields[key] = q[key];
    }
  }
  if (Object.keys(overflowFields).length > 0) {
    row.overflow = overflowFields; // Passed as an object
  }

  return row;
}

/**
 * Read a JSON or JSONL questions file and insert its records into the
 * 'questions' table in batches of 100.
 *
 * Files with a `.csv` or any other unsupported extension are reported on
 * stderr and the function returns without throwing and without inserting.
 *
 * @param {string} filePath - Path to the questions file (.json or .jsonl)
 * @param {string} questionSet - Name stored in each row's `question_set`
 * @returns {Promise<void>} Resolves when every batch has been inserted
 * @throws {Error} Re-throws (after logging) any error from reading or parsing
 *   the file, building a row, or inserting a batch. Batches inserted before a
 *   failure are not undone by this function.
 */
async function uploadQuestions(filePath, questionSet) {
  console.log(`\nUploading questions from ${filePath} to the '${questionSet}' question set...`);
  try {
    const supabase = getSupabaseClient();

    // Get file metadata
    const fileMetadata = getFileMetadata(filePath);
    console.log('File metadata:', fileMetadata);

    // Read and parse the file based on its extension
    const fileContent = fs.readFileSync(filePath, 'utf8');
    const extension = path.extname(filePath).toLowerCase();
    let questions = [];
    if (extension === '.jsonl') {
      questions = parseJsonLines(fileContent);
    } else if (extension === '.json') {
      // A single object is treated as a one-element list.
      const parsed = JSON.parse(fileContent);
      questions = Array.isArray(parsed) ? parsed : [parsed];
    } else if (extension === '.csv') {
      console.error('CSV format not yet supported. Please use JSON or JSONL format.');
      return;
    } else {
      console.error('Unsupported file format. Please use JSON or JSONL format.');
      return;
    }
    console.log(`Found ${questions.length} questions to upload.`);

    // Process questions in batches of 100
    const batchSize = 100;
    let processed = 0;
    for (let i = 0; i < questions.length; i += batchSize) {
      const batch = questions.slice(i, i + batchSize);
      const processedBatch = batch.map((q) => buildQuestionRow(q, questionSet, fileMetadata));

      // Debug aid: report unmapped fields once, using the first record of the
      // first batch that has any, instead of once per record.
      if (i === 0) {
        const firstWithOverflow = processedBatch.find((row) => row.overflow);
        if (firstWithOverflow) {
          const overflowFieldNames = Object.keys(firstWithOverflow.overflow);
          console.log(`Found ${overflowFieldNames.length} unmapped fields that will be stored in overflow: ${overflowFieldNames.slice(0, 5).join(', ')}${overflowFieldNames.length > 5 ? '...' : ''}`);
        }
      }

      try {
        // The client reports failures through the returned `error` field.
        const { error } = await supabase.from('questions').insert(processedBatch);
        if (error) {
          throw error;
        }
        processed += batch.length;
        console.log(`Processed ${processed}/${questions.length} questions...`);
      } catch (error) {
        console.error(`Error uploading batch (${i + 1}-${i + batch.length}):`, error.message);
        throw error;
      }
    }
    console.log(`Successfully uploaded ${processed} questions to the '${questionSet}' question set!`);
  } catch (error) {
    console.error('Error uploading questions:', error.message);
    throw error;
  }
}
/**
 * Describe a file on disk: its name, byte size (raw and human readable),
 * SHA-256 digest of its raw contents, the time this function ran, and the
 * resolved absolute path.
 *
 * @param {string} filePath - Path to the file (relative paths are resolved)
 * @returns {Object} Metadata with keys filename, size, sizeFormatted, sha256,
 *   uploadedAt (ISO 8601 string) and sourcePath
 */
function getFileMetadata(filePath) {
  const sourcePath = path.resolve(filePath);
  const { size } = fs.statSync(sourcePath);

  // Hash the raw bytes so the digest does not depend on text encoding.
  const hasher = crypto.createHash('sha256');
  hasher.update(fs.readFileSync(sourcePath));

  return {
    filename: path.basename(sourcePath),
    size,
    sizeFormatted: formatFileSize(size),
    sha256: hasher.digest('hex'),
    uploadedAt: new Date().toISOString(),
    sourcePath,
  };
}
/**
 * Render a byte count as a short human readable string.
 *
 * Values below 1024 are shown as whole bytes; larger values are scaled by
 * powers of 1024 to KB, MB or GB and shown with two decimals.
 *
 * @param {number} bytes - Size in bytes
 * @returns {string} Formatted size, e.g. "512 bytes" or "1.50 KB"
 */
function formatFileSize(bytes) {
  const KB = 1024;
  const MB = KB * 1024;
  const GB = MB * 1024;

  if (bytes < KB) {
    return `${bytes} bytes`;
  }

  // Anything that is not below the MB or GB threshold is reported in GB.
  let divisor = GB;
  let unit = 'GB';
  if (bytes < MB) {
    divisor = KB;
    unit = 'KB';
  } else if (bytes < GB) {
    divisor = MB;
    unit = 'MB';
  }

  return `${(bytes / divisor).toFixed(2)} ${unit}`;
}
/**
 * Script entry point.
 *
 * Uploads with the command line arguments when both a file path and a
 * question set were supplied; otherwise falls back to the interactive
 * prompt. Closes the readline interface if one was created, then exits the
 * process with code 0 on success or 1 on failure.
 */
async function main() {
  let failed = false;
  let failure;

  try {
    const { filePath, questionSet } = parseCommandLineArgs();
    const hasCliArgs = Boolean(filePath && questionSet);
    if (hasCliArgs) {
      await uploadQuestions(filePath, questionSet);
    } else {
      await uploadQuestionsPrompt();
    }
  } catch (error) {
    failed = true;
    failure = error;
  }

  // The readline interface only exists if interactive mode was used.
  if (rl) rl.close();

  if (failed) {
    console.error('Upload process failed:', failure);
    process.exit(1);
  } else {
    console.log('Upload process complete.');
    process.exit(0);
  }
}
// If this script is run directly
// (require.main === module is true only when this file is the process entry
// point, so require()-ing it from another module does not start an upload).
// main() catches its own errors and ends the process via process.exit().
if (require.main === module) {
main();
}
// Programmatic API: the non-interactive upload and the interactive prompt.
module.exports = { uploadQuestions, uploadQuestionsPrompt };