@@ -1342,3 +1342,140 @@ def compute_placebo_effects(
13421342 placebo_effects .append (placebo_tau )
13431343
13441344 return np .asarray (placebo_effects )
1345+
1346+
def demean_by_group(
    data: pd.DataFrame,
    variables: List[str],
    group_var: str,
    inplace: bool = False,
    suffix: str = "",
) -> Tuple[pd.DataFrame, int]:
    """
    Subtract group means from the given columns (one-way within transform).

    Each listed column ``x`` is replaced by ``x_ig - mean(x_g)``, where the
    mean is taken over all rows sharing the same value of ``group_var``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding both the columns to demean and the grouping column.
    variables : list of str
        Names of the columns to demean.
    group_var : str
        Name of the column that defines the groups.
    inplace : bool, default False
        If True, the frame passed in is written to directly and returned.
        If False, a copy is made first and the caller's frame is untouched.
    suffix : str, default ""
        When non-empty, results are stored under ``f"{var}{suffix}"`` and the
        source columns keep their values. When empty, each source column is
        overwritten (in the copy, or in the caller's frame if
        ``inplace=True``).

    Returns
    -------
    data : pd.DataFrame
        Frame containing the demeaned columns.
    n_effects : int
        Number of absorbed fixed effects, i.e. distinct groups minus one.

    Examples
    --------
    >>> df, n_fe = demean_by_group(df, ['y', 'x1', 'x2'], 'unit')
    >>> # df['y'], df['x1'], df['x2'] are now demeaned by unit
    """
    frame = data if inplace else data.copy()

    # One category is dropped for identification, hence the "- 1".
    absorbed = frame[group_var].nunique() - 1

    # A single groupby object is shared by every column below.
    groups = frame.groupby(group_var, sort=False)

    for name in variables:
        target = f"{name}{suffix}" if suffix else name
        frame[target] = frame[name] - groups[name].transform("mean")

    return frame, absorbed
1401+
1402+
def _two_way_demean(data, var, unit_grouper, time_grouper):
    """Return ``y_it - y_i. - y_.t + y_..`` for column ``var`` of ``data``."""
    column = data[var]
    unit_means = unit_grouper[var].transform("mean")
    time_means = time_grouper[var].transform("mean")
    return column - unit_means - time_means + column.mean()


def within_transform(
    data: pd.DataFrame,
    variables: List[str],
    unit: str,
    time: str,
    inplace: bool = False,
    suffix: str = "_demeaned",
) -> pd.DataFrame:
    """
    Apply two-way within transformation to remove unit and time fixed effects.

    Computes: y_it - y_i. - y_.t + y_.. for each variable, where y_i. is the
    unit mean, y_.t the time-period mean and y_.. the grand mean.

    Parameters
    ----------
    data : pd.DataFrame
        Panel data containing the variables to transform.
    variables : list of str
        Column names to transform.
    unit : str
        Column name for unit identifier.
    time : str
        Column name for time period identifier.
    inplace : bool, default False
        If True, overwrites the listed columns of ``data`` itself and returns
        it (``suffix`` is ignored). If False, works on a copy and stores the
        results under ``f"{var}{suffix}"``.
    suffix : str, default "_demeaned"
        Suffix for result column names when inplace=False. If the resulting
        name already exists in ``data`` (for example ``suffix=""``, or a
        repeated call on earlier output), that column is overwritten in the
        returned copy rather than duplicated.

    Returns
    -------
    pd.DataFrame
        DataFrame with within-transformed variables.

    Notes
    -----
    The within transformation removes variation that is constant within units
    (unit fixed effects) and constant within time periods (time fixed effects).
    The single-pass formula used here is exact for balanced panels; for
    unbalanced panels it generally does not remove both sets of effects
    completely.

    Examples
    --------
    >>> df = within_transform(df, ['y', 'x'], 'unit_id', 'year')
    >>> # df now has 'y_demeaned' and 'x_demeaned' columns
    """
    if not inplace:
        data = data.copy()

    # Cache groupby objects so the grouping is computed once for all columns.
    unit_grouper = data.groupby(unit, sort=False)
    time_grouper = data.groupby(time, sort=False)

    if inplace:
        for var in variables:
            data[var] = _two_way_demean(data, var, unit_grouper, time_grouper)
        return data

    # Compute every result before touching the frame, so that overwriting a
    # source column (suffix="") cannot feed into a later computation.
    demeaned = {
        f"{var}{suffix}": _two_way_demean(
            data, var, unit_grouper, time_grouper
        ).values
        for var in variables
    }

    # Names that already exist are overwritten in the copy; concatenating
    # them would leave two columns with the same label.
    fresh = {}
    for name, values in demeaned.items():
        if name in data.columns:
            data[name] = values
        else:
            fresh[name] = values

    # Add all genuinely new columns at once to avoid DataFrame fragmentation.
    if fresh:
        fresh_df = pd.DataFrame(fresh, index=data.index)
        data = pd.concat([data, fresh_df], axis=1)

    return data
0 commit comments