Skip to content

Commit bd93995

Browse files
committed
improve groupBy and agg perf by a factor of 10
1 parent 6b4431f commit bd93995

File tree

2 files changed

+840
-331
lines changed

2 files changed

+840
-331
lines changed

performance-test.js

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
const { DataFrame } = require('./src/danfojs-node/dist/danfojs-node/src');
2+
3+
/**
 * Builds a four-column DataFrame of synthetic benchmark data.
 *
 * Each row is `[group label, value_a, value_b, value_c]`, where the label is
 * `group_<i % numGroups>` and the values are uniform random numbers in
 * [0, 1000), [0, 500) and [0, 100) respectively.
 *
 * @param {number} rows - Number of rows to generate; a non-negative integer.
 * @param {number} [numGroups=100] - Modulus used for the group labels; a
 *   positive integer. At most this many distinct labels are produced.
 * @returns {DataFrame} A frame with columns group_col, value_a, value_b, value_c.
 * @throws {RangeError} If `rows` or `numGroups` is outside the ranges above.
 */
function generateTestData(rows, numGroups = 100) {
  if (!Number.isInteger(rows) || rows < 0) {
    throw new RangeError(`rows must be a non-negative integer, got ${rows}`);
  }
  // A zero or NaN modulus would make `i % numGroups` NaN and silently put
  // every row into a single "group_NaN" bucket, so reject it up front.
  if (!Number.isInteger(numGroups) || numGroups < 1) {
    throw new RangeError(`numGroups must be a positive integer, got ${numGroups}`);
  }

  console.log(`Generating ${rows} rows of test data with ~${numGroups} groups...`);

  const columns = ['group_col', 'value_a', 'value_b', 'value_c'];
  const data = [];

  for (let i = 0; i < rows; i++) {
    data.push([
      `group_${i % numGroups}`, // cycles through the group labels
      Math.random() * 1000, // value_a
      Math.random() * 500, // value_b
      Math.random() * 100, // value_c
    ]);
  }

  return new DataFrame(data, { columns });
}
20+
21+
/**
 * Times four groupby scenarios on `df` and logs each duration.
 *
 * The scenarios are: building the groupby on `group_col`, summing `value_a`,
 * a one-aggregation-per-column `agg`, and an `agg` with several aggregations
 * per column. Every scenario is measured with `performance.now()`.
 *
 * @param {object} df - Frame exposing `shape` and `groupby()`; the grouped
 *   object must expose `ngroups`, `col()` and `agg()`.
 * @param {string} testName - Label printed in the section header.
 * @returns {{construction: number, singleSum: number, multiAgg: number, complexAgg: number}}
 *   Elapsed milliseconds for each scenario, in the order they were run.
 */
function performanceTest(df, testName) {
  console.log(`\n=== ${testName} ===`);
  console.log(`DataFrame shape: ${df.shape[0]} rows, ${df.shape[1]} columns`);

  // Each elapsed time is stored in its own constant as soon as it is measured.
  // Re-deriving them from shared start/end variables at the end would report
  // the last scenario's duration for every field.

  // Test 1: Basic groupby construction
  console.log('\nTest 1: Group construction...');
  let start = performance.now();
  const grouped = df.groupby(['group_col']);
  const construction = performance.now() - start;
  console.log(`Group construction: ${construction.toFixed(2)}ms`);
  console.log(`Number of groups: ${grouped.ngroups}`);

  // Test 2: Single column aggregation
  console.log('\nTest 2: Single column sum...');
  start = performance.now();
  const sumResult = grouped.col(['value_a']).sum();
  const singleSum = performance.now() - start;
  console.log(`Single column sum: ${singleSum.toFixed(2)}ms`);
  console.log(`Result shape: ${sumResult.shape[0]} rows`);

  // Test 3: Multiple column aggregation
  console.log('\nTest 3: Multiple column aggregations...');
  start = performance.now();
  const multiResult = grouped.agg({
    value_a: 'mean',
    value_b: 'sum',
    value_c: 'count',
  });
  const multiAgg = performance.now() - start;
  console.log(`Multiple aggregations: ${multiAgg.toFixed(2)}ms`);
  console.log(`Result shape: ${multiResult.shape[0]} rows`);

  // Test 4: Complex aggregation (multiple operations per column)
  console.log('\nTest 4: Complex aggregation...');
  start = performance.now();
  const complexResult = grouped.agg({
    value_a: ['mean', 'max', 'min'],
    value_b: ['sum', 'count'],
    value_c: 'std',
  });
  const complexAgg = performance.now() - start;
  console.log(`Complex aggregation: ${complexAgg.toFixed(2)}ms`);
  console.log(`Result shape: ${complexResult.shape[0]} rows`);

  return { construction, singleSum, multiAgg, complexAgg };
}
72+
73+
/**
 * Entry point: benchmarks groupby on three increasingly large synthetic
 * datasets, one after another, then prints the closing banner.
 *
 * @returns {Promise<void>} Settles once every dataset has been processed.
 */
async function main() {
  console.log('DanfoJS GroupBy Performance Test');
  console.log('================================');

  // Dataset sizes to benchmark, smallest first.
  const scenarios = [
    { rows: 1000, groups: 50, name: 'Small Dataset (1K rows)' },
    { rows: 5000, groups: 100, name: 'Medium Dataset (5K rows)' },
    { rows: 20000, groups: 200, name: 'Large Dataset (20K rows)' },
  ];

  scenarios.forEach(({ rows, groups, name }) => {
    const frame = generateTestData(rows, groups);
    performanceTest(frame, name);

    // global.gc only exists when node is started with --expose-gc; when it
    // does, collect between runs so one dataset's garbage does not skew the next.
    if (global.gc) {
      global.gc();
    }
  });

  console.log('\n=== Performance Test Complete ===');
  console.log('Check the times above - we should see significant improvement!');
  console.log('Target: 20K rows should complete in < 2 seconds total');
}
98+
99+
// Run the test
100+
// main() is async, so an error thrown inside it surfaces as a rejected
// promise; print it with console.error rather than leaving it unhandled.
main().catch(console.error);

0 commit comments

Comments
 (0)