import os
from typing import Optional

import rich
import rich_click as click
from rich import box
from rich.markup import escape
from rich.panel import Panel

from codegen.cli.utils.memory_profiler import profile_command


# `memprof` is a wrapper command: every token after it belongs to the command
# being profiled. `ignore_unknown_options` makes Click forward options it does
# not recognise (e.g. `--arguments`) into `command` instead of rejecting them,
# which is what the usage example in the help text relies on.
@click.command(name="memprof", context_settings={"ignore_unknown_options": True})
@click.argument("command", nargs=-1, required=True)
@click.option(
    "--output-dir",
    "-o",
    type=click.Path(file_okay=False),
    help="Directory to save memory profile reports",
)
def memprof_command(command: tuple[str, ...], output_dir: Optional[str] = None):
    """Profile memory usage of a Codegen CLI command.

    Example:
        codegen memprof run my-codemod --arguments '{"param": "value"}'
    """
    # NOTE: the docstring above is rendered as the command's --help text, so
    # developer-facing documentation lives in comments instead.
    #
    # command:    tokens of the Codegen sub-command to profile (Click passes a
    #             tuple for nargs=-1).
    # output_dir: where reports are written; defaults to
    #             ~/.codegen/memory_profiles when not given.
    # Exits with status 1 when profiling raises, so scripts can detect failure.
    if not command:
        rich.print("[bold red]Error:[/bold red] No command specified")
        return

    cmd_args = list(command)

    if not output_dir:
        output_dir = os.path.join(os.path.expanduser("~"), ".codegen", "memory_profiles")

    # The command line is user input: escape it so tokens such as "[foo]" are
    # shown literally instead of being interpreted as Rich markup.
    rich.print(
        Panel(
            f"[cyan]Profiling command:[/cyan] codegen {escape(' '.join(cmd_args))}",
            title="🔍 [bold]Memory Profiler[/bold]",
            border_style="cyan",
            box=box.ROUNDED,
            padding=(1, 2),
        )
    )

    try:
        report_dir = profile_command(cmd_args, output_dir=output_dir)
    except Exception as e:
        # Top-level CLI boundary: report the failure to the user, then exit
        # non-zero instead of silently returning success.
        rich.print(
            Panel(
                f"[red]Error during profiling:[/red] {escape(str(e))}",
                title="❌ [bold]Profiling Failed[/bold]",
                border_style="red",
                box=box.ROUNDED,
                padding=(1, 2),
            )
        )
        raise SystemExit(1) from e

    rich.print(
        Panel(
            f"[green]Memory profile saved to:[/green] {escape(str(report_dir))}",
            title="✅ [bold]Profiling Complete[/bold]",
            border_style="green",
            box=box.ROUNDED,
            padding=(1, 2),
        )
    )
codebase and run with Status(f"[bold]Parsing codebase at {session.repo_path} with subdirectories {function.subdirectories or 'ALL'} and language {function.language or 'AUTO'} ...", spinner="dots") as status: + start_time = time.time() codebase = parse_codebase(repo_path=session.repo_path, subdirectories=function.subdirectories, language=function.language) - status.update("[bold green]✓ Parsed codebase") + parse_time = time.time() - start_time + status.update(f"[bold green]✓ Parsed codebase in {parse_time:.2f}s") + + # Memory usage after parsing + post_parse_memory = process.memory_info().rss / (1024 * 1024) status.update("[bold]Running codemod...") + start_time = time.time() function.run(codebase) # Run the function - status.update("[bold green]✓ Completed codemod") + run_time = time.time() - start_time + status.update(f"[bold green]✓ Completed codemod in {run_time:.2f}s") # Get the diff from the codebase result = codebase.get_diff() + # Final memory usage + final_memory = process.memory_info().rss / (1024 * 1024) + # Handle no changes case if not result: rich.print("\n[yellow]No changes were produced by this codemod[/yellow]") + rich.print(f"\n[dim]Memory usage: {initial_memory:.2f}MB → {final_memory:.2f}MB (Δ {final_memory - initial_memory:.2f}MB)[/dim]") return # Show diff preview if requested @@ -84,3 +106,11 @@ def run_local( # Apply changes rich.print("") rich.print("[green]✓ Changes have been applied to your local filesystem[/green]") + + # Print memory usage statistics + rich.print(f"\n[dim]Memory usage: {initial_memory:.2f}MB → {final_memory:.2f}MB (Δ {final_memory - initial_memory:.2f}MB)[/dim]") + rich.print(f"[dim]Parsing: {parse_time:.2f}s, Execution: {run_time:.2f}s[/dim]") + + # Clean up to free memory + del codebase + gc.collect() diff --git a/src/codegen/cli/utils/memory_profiler.py b/src/codegen/cli/utils/memory_profiler.py new file mode 100644 index 000000000..dd4d22aa8 --- /dev/null +++ b/src/codegen/cli/utils/memory_profiler.py @@ -0,0 +1,366 
import functools
import json
import os
import re
import subprocess
import sys
import tempfile
import threading
import time
import tracemalloc
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Union

import matplotlib.pyplot as plt
import psutil
from rich.console import Console
from rich.markup import escape
from rich.table import Table

console = Console()

_BYTES_PER_MB = 1024 * 1024
# Anything outside this set is replaced when command text is turned into a file name.
_UNSAFE_FILENAME_CHARS = re.compile(r"[^A-Za-z0-9._-]+")


@dataclass
class MemorySnapshot:
    """A snapshot of memory usage at a point in time."""

    timestamp: float  # Seconds since profiling started
    rss_mb: float  # Resident Set Size in MB
    vms_mb: float  # Virtual Memory Size in MB
    tracemalloc_mb: Optional[float] = None  # Tracemalloc total in MB

    def to_dict(self) -> dict:
        """Return the snapshot as a JSON-serialisable dict."""
        return {"timestamp": self.timestamp, "rss_mb": self.rss_mb, "vms_mb": self.vms_mb, "tracemalloc_mb": self.tracemalloc_mb}


def _save_memory_plot(timestamps, rss_values, vms_values, tracemalloc_points, output_file, title):
    """Write an RSS/VMS (and optional tracemalloc) line chart to *output_file*.

    Args:
        timestamps: X values (seconds) shared by the RSS and VMS series.
        rss_values: RSS in MB, one per timestamp.
        vms_values: VMS in MB, one per timestamp.
        tracemalloc_points: ``(timestamp, mb)`` pairs; may be empty. Pairs are
            used so the series stays aligned even when some samples carry no
            tracemalloc value.
        output_file: Destination image path.
        title: Chart title.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(timestamps, rss_values, label="RSS (MB)", linewidth=2)
        ax.plot(timestamps, vms_values, label="VMS (MB)", linewidth=2)

        if tracemalloc_points:
            tm_times, tm_values = zip(*tracemalloc_points)
            ax.plot(tm_times, tm_values, label="Tracemalloc (MB)", linewidth=2, linestyle="--")

        ax.set_xlabel("Time (seconds)")
        ax.set_ylabel("Memory Usage (MB)")
        ax.set_title(title)
        ax.grid(True, linestyle="--", alpha=0.7)
        ax.legend()

        fig.tight_layout()
        fig.savefig(output_file)
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)


class MemoryProfiler:
    """A memory profiler that tracks memory usage of the current process over time."""

    def __init__(self, interval: float = 0.1, use_tracemalloc: bool = True):
        """Initialize the memory profiler.

        Args:
            interval: The interval in seconds between memory snapshots. A
                non-positive value disables periodic sampling; only the
                start and stop snapshots are then recorded.
            use_tracemalloc: Whether to use tracemalloc for detailed memory tracking.
        """
        self.interval = interval
        self.use_tracemalloc = use_tracemalloc
        self.snapshots: list[MemorySnapshot] = []
        self.process = psutil.Process(os.getpid())
        self.start_time: Optional[float] = None
        self._running = False
        # True only when this profiler switched tracemalloc on, so that stop()
        # never turns off tracing somebody else started.
        self._owns_tracemalloc = False
        # Allocation statistics captured at stop(), before traces are discarded.
        self._final_stats: list = []
        self._stop_event = threading.Event()
        self._sampler: Optional[threading.Thread] = None

    def start(self):
        """Start memory profiling; a no-op if already running."""
        if self._running:
            return

        self.snapshots = []
        self._final_stats = []
        self.start_time = time.time()

        if self.use_tracemalloc:
            self._owns_tracemalloc = not tracemalloc.is_tracing()
            if self._owns_tracemalloc:
                tracemalloc.start()

        self._running = True
        self._take_snapshot()

        # Periodic sampling runs on a daemon thread so the profiled code keeps
        # the calling thread.
        if self.interval > 0:
            self._stop_event.clear()
            self._sampler = threading.Thread(target=self._sample_loop, name="memory-profiler-sampler", daemon=True)
            self._sampler.start()

    def _sample_loop(self):
        """Record a snapshot every ``interval`` seconds until stop() is called."""
        while not self._stop_event.wait(self.interval):
            self._take_snapshot()

    def stop(self) -> list[MemorySnapshot]:
        """Stop memory profiling and return the snapshots."""
        if not self._running:
            return self.snapshots

        if self._sampler is not None:
            self._stop_event.set()
            self._sampler.join()
            self._sampler = None

        self._take_snapshot()  # Take one final snapshot

        if self.use_tracemalloc:
            # tracemalloc.stop() discards all traces, so the statistics needed
            # by save_report() must be captured first.
            if tracemalloc.is_tracing():
                self._final_stats = tracemalloc.take_snapshot().statistics("lineno")
            if self._owns_tracemalloc:
                tracemalloc.stop()
                self._owns_tracemalloc = False

        self._running = False
        return self.snapshots

    def _take_snapshot(self):
        """Take a snapshot of the current memory usage."""
        mem_info = self.process.memory_info()

        snapshot = MemorySnapshot(
            timestamp=time.time() - self.start_time,
            rss_mb=mem_info.rss / _BYTES_PER_MB,
            vms_mb=mem_info.vms / _BYTES_PER_MB,
        )

        if self.use_tracemalloc:
            current, _peak = tracemalloc.get_traced_memory()
            snapshot.tracemalloc_mb = current / _BYTES_PER_MB

        self.snapshots.append(snapshot)

    def _duration(self) -> float:
        """Return the timestamp of the last snapshot, or 0.0 when there is none."""
        return self.snapshots[-1].timestamp if self.snapshots else 0.0

    def get_peak_memory(self) -> tuple[float, float]:
        """Get the peak RSS and VMS memory usage in MB (0.0, 0.0 without snapshots)."""
        peak_rss = max((s.rss_mb for s in self.snapshots), default=0.0)
        peak_vms = max((s.vms_mb for s in self.snapshots), default=0.0)
        return peak_rss, peak_vms

    def get_tracemalloc_stats(self, top_n: int = 10) -> list:
        """Get the top memory allocations from tracemalloc.

        After stop() the statistics captured at stop time are returned;
        otherwise a live snapshot is taken if tracemalloc is tracing.
        """
        if not self.use_tracemalloc:
            return []

        if not self._running and self._final_stats:
            stats = self._final_stats
        elif tracemalloc.is_tracing():
            stats = tracemalloc.take_snapshot().statistics("lineno")
        else:
            stats = []
        return stats[:top_n]

    def save_report(self, output_dir: Union[str, Path], command_name: str):
        """Save a memory profiling report to the specified directory.

        Writes ``<command_name>_memory_profile.json`` (raw snapshots),
        ``<command_name>_memory_profile.png`` (plot, only when snapshots
        exist) and ``<command_name>_memory_report.txt`` (summary).

        Returns:
            The output directory as a ``Path``.
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save raw data as JSON
        data_file = output_dir / f"{command_name}_memory_profile.json"
        with open(data_file, "w", encoding="utf-8") as f:
            json.dump([s.to_dict() for s in self.snapshots], f, indent=2)

        # Generate and save plot
        self._generate_plot(output_dir / f"{command_name}_memory_profile.png")

        # Generate text report with tracemalloc stats if available
        report_file = output_dir / f"{command_name}_memory_report.txt"
        peak_rss, peak_vms = self.get_peak_memory()
        with open(report_file, "w", encoding="utf-8") as f:
            f.write(f"Memory Profile for: {command_name}\n")
            f.write(f"{'=' * 50}\n")
            f.write(f"Duration: {self._duration():.2f} seconds\n")
            f.write(f"Peak RSS: {peak_rss:.2f} MB\n")
            f.write(f"Peak VMS: {peak_vms:.2f} MB\n\n")

            if self.use_tracemalloc:
                f.write("Top Memory Allocations:\n")
                f.write(f"{'-' * 50}\n")
                for stat in self.get_tracemalloc_stats(top_n=20):
                    f.write(f"{stat.size / _BYTES_PER_MB:.2f} MB: {stat.traceback.format()[0]}\n")

        return output_dir

    def _generate_plot(self, output_file: Path):
        """Generate a plot of memory usage over time; does nothing without snapshots."""
        if not self.snapshots:
            return

        tracemalloc_points = []
        if self.use_tracemalloc:
            tracemalloc_points = [(s.timestamp, s.tracemalloc_mb) for s in self.snapshots if s.tracemalloc_mb is not None]

        _save_memory_plot(
            [s.timestamp for s in self.snapshots],
            [s.rss_mb for s in self.snapshots],
            [s.vms_mb for s in self.snapshots],
            tracemalloc_points,
            output_file,
            "Memory Usage Over Time",
        )


def _report_function_profile(profiler: MemoryProfiler, func_name: str, output_dir: Optional[Union[str, Path]]):
    """Save the report for a profiled function and print a summary table."""
    out_dir = output_dir
    if out_dir is None:
        out_dir = Path(tempfile.gettempdir()) / "codegen_memory_profiles"

    report_dir = profiler.save_report(out_dir, func_name)
    console.print(f"\n[bold green]Memory profile saved to:[/bold green] {escape(str(report_dir))}")

    peak_rss, peak_vms = profiler.get_peak_memory()
    duration = profiler.snapshots[-1].timestamp if profiler.snapshots else 0.0

    table = Table(title="Memory Usage Summary")
    table.add_column("Metric", style="cyan")
    table.add_column("Value", style="green")
    table.add_row("Peak RSS", f"{peak_rss:.2f} MB")
    table.add_row("Peak VMS", f"{peak_vms:.2f} MB")
    table.add_row("Duration", f"{duration:.2f} seconds")
    console.print(table)


def profile_memory(func=None, *, interval: float = 0.1, use_tracemalloc: bool = True, output_dir: Optional[Union[str, Path]] = None):
    """Decorator to profile memory usage of a function.

    Usable both bare (``@profile_memory``) and with keyword arguments
    (``@profile_memory(interval=0.5)``). The wrapped function's return value
    or exception is always propagated unchanged; a failure while writing the
    report is printed instead of replacing it.

    Args:
        func: The function to profile.
        interval: The interval in seconds between memory snapshots.
        use_tracemalloc: Whether to use tracemalloc for detailed memory tracking.
        output_dir: Directory to save the memory profile report. If None, a temporary directory is used.
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            profiler = MemoryProfiler(interval=interval, use_tracemalloc=use_tracemalloc)
            profiler.start()

            try:
                return func(*args, **kwargs)
            finally:
                profiler.stop()
                # Reporting is best-effort: raising from this `finally` would
                # discard the wrapped function's own result or exception.
                try:
                    _report_function_profile(profiler, func.__name__, output_dir)
                except Exception as exc:
                    console.print(f"[bold red]Failed to save memory profile:[/bold red] {escape(str(exc))}")

        return wrapper

    if func is None:
        return decorator
    return decorator(func)


def _sample_process(pid: int, started_at: float, samples: list, done: threading.Event, interval: float):
    """Append memory samples of process *pid* to *samples* until *done* is set.

    Sampling also ends as soon as psutil reports an error for the process
    (for example because it has already exited).
    """
    try:
        target = psutil.Process(pid)
    except psutil.Error:
        return

    while True:
        try:
            mem_info = target.memory_info()
        except psutil.Error:
            return

        samples.append(
            {
                "timestamp": time.time() - started_at,
                "rss_mb": mem_info.rss / _BYTES_PER_MB,
                "vms_mb": mem_info.vms / _BYTES_PER_MB,
                # Python-level allocation tracing cannot be read from outside
                # the target process; the key is kept so the JSON schema stays
                # the same as before.
                "tracemalloc_mb": None,
            }
        )

        if done.wait(interval):
            return


def profile_command(cmd_args: list[str], output_dir: Optional[Union[str, Path]] = None) -> Path:
    """Profile memory usage of a command.

    Runs ``python -m codegen.cli.cli <cmd_args>`` as a child process and
    samples that child's RSS/VMS from this process while it runs. The samples
    are written to ``<name>_<timestamp>_memory.json`` and plotted to
    ``<name>_<timestamp>_memory.png`` inside the output directory; the
    command's captured stdout/stderr and a short summary are printed.

    Args:
        cmd_args: The command arguments to profile.
        output_dir: Directory to save the memory profile report. If None, a temporary directory is used.

    Returns:
        Path to the output directory containing the profile report.
    """
    if output_dir is None:
        output_dir = Path(tempfile.gettempdir()) / "codegen_memory_profiles"
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create a unique, filesystem-safe filename for this run. Arguments may
    # contain spaces, quotes, braces or colons (e.g. JSON passed on the
    # command line), none of which belong in a file name.
    timestamp = int(time.time())
    cmd_name = _UNSAFE_FILENAME_CHARS.sub("_", "_".join(cmd_args))[:50] or "command"
    output_file = output_dir / f"{cmd_name}_{timestamp}_memory.json"

    env = os.environ.copy()
    # os.pathsep keeps this correct on Windows; empty entries are dropped.
    env["PYTHONPATH"] = os.pathsep.join(p for p in (os.getcwd(), env.get("PYTHONPATH", "")) if p)

    interval = 0.1
    samples: list[dict] = []
    done = threading.Event()

    started_at = time.time()
    cmd_process = subprocess.Popen([sys.executable, "-m", "codegen.cli.cli", *cmd_args], env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Sample on a helper thread while this thread drains the pipes, so a
    # chatty command cannot block on a full pipe buffer.
    sampler = threading.Thread(target=_sample_process, args=(cmd_process.pid, started_at, samples, done, interval), daemon=True)
    sampler.start()
    try:
        stdout, stderr = cmd_process.communicate()
    finally:
        done.set()
        sampler.join()

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(samples, f, indent=2)

    # Print command output literally: it is arbitrary text, not Rich markup.
    console.print("[bold]Command Output:[/bold]")
    console.print(stdout.decode(errors="replace"), markup=False, highlight=False)
    if stderr:
        console.print("[bold red]Command Errors:[/bold red]")
        console.print(stderr.decode(errors="replace"), markup=False, highlight=False)
    if cmd_process.returncode != 0:
        console.print(f"[bold red]Command exited with status {cmd_process.returncode}[/bold red]")

    if not samples:
        console.print("[yellow]No memory samples were collected[/yellow]")
        return output_dir

    peak_rss = max(s["rss_mb"] for s in samples)
    peak_vms = max(s["vms_mb"] for s in samples)
    duration = samples[-1]["timestamp"]

    console.print("\nMemory Profile Summary:")
    console.print(f"Peak RSS: {peak_rss:.2f} MB")
    console.print(f"Peak VMS: {peak_vms:.2f} MB")
    console.print(f"Duration: {duration:.2f} seconds")
    console.print(f"Profile saved to: {output_file}", markup=False)

    plot_file = output_dir / f"{cmd_name}_{timestamp}_memory.png"
    try:
        _save_memory_plot(
            [s["timestamp"] for s in samples],
            [s["rss_mb"] for s in samples],
            [s["vms_mb"] for s in samples],
            [(s["timestamp"], s["tracemalloc_mb"]) for s in samples if s.get("tracemalloc_mb") is not None],
            plot_file,
            f"Memory Usage: {' '.join(cmd_args)}",
        )
        console.print(f"[bold green]Memory profile visualization saved to:[/bold green] {escape(str(plot_file))}")
    except Exception as e:
        # The raw JSON is already on disk; a plotting failure is not fatal.
        console.print(f"[bold red]Error generating visualization:[/bold red] {escape(str(e))}")

    return output_dir