feat(charts): plot length distribution

ChenZiHong-Gavin · ChenZiHong-Gavin · commit ca191f37cc11 · 2025-01-13T13:17:49.000+08:00
diff --git a/.pylintrc b/.pylintrc
@@ -433,6 +433,7 @@ disable=raw-checker-failed,
         missing-module-docstring,
         missing-class-docstring,
         missing-function-docstring,
+        no-member,
         W0122,  # Use of exec (exec-used)
         R0914,  # Too many local variables (19/15) (too-many-locals)
         R0903,  # Too few public methods (1/2)
@@ -450,7 +451,7 @@ disable=raw-checker-failed,
         E1120,  # TODO: unbound-method-call-no-value-for-parameter
         R0917,  # Too many positional arguments (6/5) (too-many-positional-arguments)
         C0103,
-        E0401
+        E0401,
 
 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
diff --git a/charts/plot_rephrase_process.py b/charts/plot_rephrase_process.py
@@ -1,12 +1,13 @@
 import re
-
+import plotly.express as px
+from collections import defaultdict
+import plotly.graph_objects as go
 import pandas as pd
 from tqdm import tqdm
+
 from models import Tokenizer
 from utils.log import parse_log
-import plotly.express as px
-import plotly.graph_objects as go
-from collections import defaultdict
+
 
 def analyse_log(log_info: dict) -> list:
     """
@@ -80,7 +81,6 @@ def plot_pre_length_distribution(stats: list[dict]):
     :return fig
     """
 
-    # 使用传入的stats参数而不是全局的data
     if not stats:
         return go.Figure()
 
@@ -134,6 +134,66 @@ def plot_pre_length_distribution(stats: list[dict]):
 
     return fig
 
+def plot_post_synth_length_distribution(stats: list[dict]):
+    """
+    Plot a bar chart of each stats item's 'post_length', counted in bins of width 50.
+
+    :return fig: a plotly Figure (an empty Figure when stats is empty)
+    """
+
+    if not stats:
+        return go.Figure()
+
+    # Round the maximum length up to a bin boundary (NOTE: max_length is not read again below)
+    max_length = max(item['post_length'] for item in stats)
+    bin_size = 50
+    max_length = ((max_length // bin_size) + 1) * bin_size
+
+    # Use defaultdict so missing keys need no explicit check
+    length_distribution = defaultdict(int)
+
+    # Collect all counts in a single pass
+    for item in stats:
+        bin_start = (item['post_length'] // bin_size) * bin_size
+        bin_key = f"{bin_start}-{bin_start + bin_size}"
+        length_distribution[bin_key] += 1
+
+    # Convert to a list sorted by bin start so the bins stay in numeric order
+    sorted_bins = sorted(length_distribution.keys(),
+                         key=lambda x: int(x.split('-')[0]))
+
+    # Create the figure
+    fig = go.Figure(data=[
+        go.Bar(
+            x=sorted_bins,
+            y=[length_distribution[bin_] for bin_ in sorted_bins],
+            text=[length_distribution[bin_] for bin_ in sorted_bins],
+            textposition='auto',
+        )
+    ])
+
+    # Set the figure layout
+    fig.update_layout(
+        title='Distribution of Post-Synthesis Length',
+        xaxis_title='Length Range',
+        yaxis_title='Count',
+        bargap=0.2,
+        showlegend=False
+    )
+
+    # With more than 10 bins, thin out and rotate the x-axis labels
+    if len(sorted_bins) > 10:
+        fig.update_layout(
+            xaxis={
+                'tickangle': 45,
+                'tickmode': 'array',
+                'ticktext': sorted_bins[::2],  # show every other label
+                'tickvals': list(range(len(sorted_bins)))[::2]
+            }
+        )
+
+    return fig
+
 if __name__ == "__main__":
     log = parse_log('/home/PJLAB/chenzihong/Project/graphgen/cache/logs/graphgen.log')
     data = analyse_log(log)
diff --git a/simulate.py b/simulate.py
@@ -1,12 +1,15 @@
 """Simulate text length distributions using input data distributions when rephrasing."""
 
+import copy
+import os
+import json
 import gradio as gr
 
 from models import TraverseStrategy, NetworkXStorage
-from charts.plot_rephrase_process import plot_pre_length_distribution
+from charts.plot_rephrase_process import plot_pre_length_distribution, plot_post_synth_length_distribution
 from graphgen.operators.split_graph import get_batches_with_strategy
 from utils import create_event_loop
-import copy
+from models import Tokenizer
 
 if __name__ == "__main__":
     networkx_storage = NetworkXStorage(
@@ -32,22 +35,22 @@ async def get_batches(traverse_strategy: TraverseStrategy):
         return await get_batches_with_strategy(nodes, edges, networkx_storage, traverse_strategy)
 
     def traverse_graph(
-        bidirectional: bool,
-        expand_method: str,
-        max_extra_edges: int,
-        max_tokens: int,
-        max_depth: int,
-        edge_sampling: str,
-        isolated_node_strategy: str
+        ts_bidirectional: bool,
+        ts_expand_method: str,
+        ts_max_extra_edges: int,
+        ts_max_tokens: int,
+        ts_max_depth: int,
+        ts_edge_sampling: str,
+        ts_isolated_node_strategy: str
     ) -> str:
         traverse_strategy = TraverseStrategy(
-            bidirectional=bidirectional,
-            expand_method=expand_method,
-            max_extra_edges=max_extra_edges,
-            max_tokens=max_tokens,
-            max_depth=max_depth,
-            edge_sampling=edge_sampling,
-            isolated_node_strategy=isolated_node_strategy
+            bidirectional=ts_bidirectional,
+            expand_method=ts_expand_method,
+            max_extra_edges=ts_max_extra_edges,
+            max_tokens=ts_max_tokens,
+            max_depth=ts_max_depth,
+            edge_sampling=ts_edge_sampling,
+            isolated_node_strategy=ts_isolated_node_strategy
         )
 
         loop = create_event_loop()
@@ -56,8 +59,8 @@ def traverse_graph(
 
         data = []
         for _process_batch in batches:
-            pre_length = sum([node['length'] for node in _process_batch[0]]) + sum(
-                [edge[2]['length'] for edge in _process_batch[1]])
+            pre_length = sum(node['length'] for node in _process_batch[0]) + sum(
+                edge[2]['length'] for edge in _process_batch[1])
             data.append({
                 'pre_length': pre_length
             })
@@ -66,60 +69,88 @@ def traverse_graph(
         return fig
 
 
-    def update_sliders(expand_method):
-        if expand_method == "max_tokens":
+    def update_sliders(method_name):
+        if method_name == "max_tokens":
             return gr.update(visible=True), gr.update(visible=False)  # Show max_tokens, hide max_extra_edges
-        else:
-            return gr.update(visible=False), gr.update(visible=True)  # Hide max_tokens, show max_extra_edges
-
-
-    with gr.Blocks() as iface:
-        gr.Markdown("# Graph Traversal Interface")
-
-        with gr.Row():
-            with gr.Column():
-                bidirectional = gr.Checkbox(label="Bidirectional", value=False)
-                expand_method = gr.Dropdown(
-                    choices=["max_width", "max_tokens"],
-                    value="max_tokens",
-                    label="Expand Method",
-                    interactive=True
-                )
-
-                # Initialize sliders
-                max_extra_edges = gr.Slider(minimum=1, maximum=50, value=5, step=1, label="Max Extra Edges",
-                                            visible=False)
-                max_tokens = gr.Slider(minimum=128, maximum=8 * 1024, value=1024, step=128, label="Max Tokens")
-                max_depth = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Max Depth")
-                edge_sampling = gr.Dropdown(
-                    choices=["max_loss", "random", "min_loss"],
-                    value="max_loss",
-                    label="Edge Sampling Strategy"
-                )
-                isolated_node_strategy = gr.Dropdown(
-                    choices=["add", "ignore", "connect"],
-                    value="add",
-                    label="Isolated Node Strategy"
-                )
-                submit_btn = gr.Button("Traverse Graph")
-
-        with gr.Row():
-            output_plot = gr.Plot(label="Graph Visualization")
-
-        # Set up event listener for expand_method dropdown
-        expand_method.change(fn=update_sliders, inputs=expand_method, outputs=[max_tokens, max_extra_edges])
-
-        submit_btn.click(
-            fn=traverse_graph,
-            inputs=[
-                bidirectional,
-                expand_method,
-                max_extra_edges,
-                max_tokens,
-                max_depth,
-                edge_sampling,
-                isolated_node_strategy
-            ],
-            outputs=[output_plot]
-        )
-    iface.launch()
+        return gr.update(visible=False), gr.update(visible=True)  # Hide max_tokens, show max_extra_edges
+
+
+    with gr.Blocks() as app:
+        with gr.Tab("Before Traversal"):
+            with gr.Row():
+                with gr.Column():
+                    bidirectional = gr.Checkbox(label="Bidirectional", value=False)
+                    expand_method = gr.Dropdown(
+                        choices=["max_width", "max_tokens"],
+                        value="max_tokens",
+                        label="Expand Method",
+                        interactive=True
+                    )
+
+                    # Initialize sliders
+                    max_extra_edges = gr.Slider(minimum=1, maximum=50, value=5, step=1, label="Max Extra Edges",
+                                                visible=False)
+                    max_tokens = gr.Slider(minimum=128, maximum=8 * 1024, value=1024, step=128, label="Max Tokens")
+                    max_depth = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Max Depth")
+                    edge_sampling = gr.Dropdown(
+                        choices=["max_loss", "random", "min_loss"],
+                        value="max_loss",
+                        label="Edge Sampling Strategy"
+                    )
+                    isolated_node_strategy = gr.Dropdown(
+                        choices=["add", "ignore", "connect"],
+                        value="add",
+                        label="Isolated Node Strategy"
+                    )
+                    submit_btn = gr.Button("Traverse Graph")
+
+            with gr.Row():
+                output_plot = gr.Plot(label="Graph Visualization")
+
+            # Set up event listener for expand_method dropdown
+            expand_method.change(fn=update_sliders, inputs=expand_method, outputs=[max_tokens, max_extra_edges])
+
+            submit_btn.click(
+                fn=traverse_graph,
+                inputs=[
+                    bidirectional,
+                    expand_method,
+                    max_extra_edges,
+                    max_tokens,
+                    max_depth,
+                    edge_sampling,
+                    isolated_node_strategy
+                ],
+                outputs=[output_plot]
+            )
+
+        with gr.Tab("After Synthesis"):
+            with gr.Row():
+                with gr.Column():
+                    file_list = os.listdir("cache/data/graphgen")
+                    input_file = gr.Dropdown(choices=file_list, label="Input File")
+                    file_button = gr.Button("Submit File")
+
+            with gr.Row():
+                output_plot = gr.Plot(label="Graph Visualization")
+
+            def synthesize_text(file):  # click handler: plot answer lengths for the chosen file
+                tokenizer = Tokenizer()
+                with open(f"cache/data/graphgen/{file}", "r", encoding='utf-8') as f:
+                    data = json.load(f)  # indexed by key below, so expected to be a JSON object
+                stats = []
+                for key in data:
+                    item = data[key]
+                    item['post_length'] = len(tokenizer.encode_string(item['answer']))
+                    stats.append({
+                        'post_length': item['post_length']
+                    })
+                fig = plot_post_synth_length_distribution(stats)
+                return fig
+            file_button.click(
+                fn=synthesize_text,
+                inputs=[input_file],
+                outputs=[output_plot]
+            )
+
+    app.launch()