Commit dd0377f

feat(webui): count token usage
1 parent 2343e7e commit dd0377f

3 files changed: 127 additions and 9 deletions

graphgen/models/llm/openai_model.py

Lines changed: 8 additions & 3 deletions
@@ -1,5 +1,5 @@
 import math
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import List, Dict, Optional
 import openai
 from openai import AsyncOpenAI, RateLimitError, APIConnectionError, APITimeoutError
@@ -31,10 +31,10 @@ class OpenAIModel(TopkTokenModel):
     model_name: str = "gpt-4o-mini"
     api_key: str = None
     base_url: str = None
-
     system_prompt: str = ""
     json_mode: bool = False
     seed: int = None
+    token_usage: list = field(default_factory=list)

     def __post_init__(self):
         assert self.api_key is not None, "Please provide api key to access openai api."
@@ -99,7 +99,12 @@ async def generate_answer(self, text: str, history: Optional[List[str]] = None,
             model=self.model_name,
             **kwargs
         )
-
+        if hasattr(completion, "usage"):
+            self.token_usage.append({
+                "prompt_tokens": completion.usage.prompt_tokens,
+                "completion_tokens": completion.usage.completion_tokens,
+                "total_tokens": completion.usage.total_tokens,
+            })
         return completion.choices[0].message.content

     async def generate_inputs_prob(self, text: str, history: Optional[List[str]] = None) -> List[Token]:
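
For reference, a minimal sketch of how the new token_usage list could be consumed by a caller. The constructor arguments and placeholder API key below are illustrative only; the sketch assumes generate_answer is awaited exactly as in the diff above, so each successful completion appends one usage dict.

import asyncio

from graphgen.models.llm.openai_model import OpenAIModel

async def main():
    # Placeholder credentials/endpoint for illustration; any OpenAI-compatible
    # endpoint that returns a `usage` object on completions will do.
    llm = OpenAIModel(model_name="gpt-4o-mini",
                      api_key="sk-placeholder",
                      base_url="https://api.openai.com/v1")

    await llm.generate_answer("Hello, world")
    await llm.generate_answer("Summarize GraphGen in one sentence.")

    # One usage record per completion, so totals are simple sums.
    prompt_total = sum(u["prompt_tokens"] for u in llm.token_usage)
    grand_total = sum(u["total_tokens"] for u in llm.token_usage)
    print(f"prompt tokens: {prompt_total}, total tokens: {grand_total}")

asyncio.run(main())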

webui/app.py

Lines changed: 63 additions & 6 deletions
@@ -3,11 +3,13 @@
 import json
 import tempfile

+import pandas as pd
 import gradio as gr

 from gradio_i18n import Translate, gettext as _
 from test_api import test_api_connection
 from cache_utils import setup_workspace, cleanup_workspace
+from count_tokens import count_tokens

 # pylint: disable=wrong-import-position
 root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -24,7 +26,6 @@
 }
 """

-
 def init_graph_gen(config: dict, env: dict) -> GraphGen:
     # Set up working directory
     working_dir = setup_workspace(os.path.join(root_dir, "cache"))
@@ -65,6 +66,9 @@ def init_graph_gen(config: dict, env: dict) -> GraphGen:

 # pylint: disable=too-many-statements
 def run_graphgen(*arguments: list, progress=gr.Progress()):
+    def sum_tokens(client):
+        return sum(u["total_tokens"] for u in client.token_usage)
+
     # Unpack arguments
     config = {
         "if_trainee_model": arguments[0],
@@ -174,14 +178,44 @@ def run_graphgen(*arguments: list, progress=gr.Progress()):
         # Clean up workspace
         cleanup_workspace(graph_gen.working_dir)

+        synthesizer_tokens = sum_tokens(graph_gen.synthesizer_llm_client)
+        trainee_tokens = sum_tokens(graph_gen.trainee_llm_client) if config['if_trainee_model'] else 0
+        total_tokens = synthesizer_tokens + trainee_tokens
+
+        data_frame = arguments[-1]
+        try:
+            data_frame = arguments[-1]
+            _update_data = [
+                [
+                    data_frame.iloc[0, 0],
+                    data_frame.iloc[0, 1],
+                    str(total_tokens)
+                ]
+            ]
+            new_df = pd.DataFrame(
+                _update_data,
+                columns=data_frame.columns
+            )
+            data_frame = new_df
+
+        except Exception as e:
+            raise gr.Error(f"DataFrame operation error: {str(e)}")
+
         progress(1.0, "Graph traversed")
-        return output_file
+        return output_file, gr.DataFrame(label='Token Stats',
+                                         headers=["Source Text Token Count", "Predicted Token Count", "Token Used"],
+                                         datatype=["str", "str", "str"],
+                                         interactive=False,
+                                         value=data_frame,
+                                         visible=True,
+                                         wrap=True)

     except Exception as e:  # pylint: disable=broad-except
         raise gr.Error(f"Error occurred: {str(e)}")

-with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(),
-               css=css) as demo:
+
+with (gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(),
+                css=css) as demo):
     # Header
     gr.Image(value=os.path.join(root_dir, 'resources', 'images', 'logo.png'),
              label="GraphGen Banner",
@@ -353,6 +387,14 @@ def run_graphgen(*arguments: list, progress=gr.Progress()):
             interactive=False,
         )

+    with gr.Blocks():
+        token_counter = gr.DataFrame(label='Token Stats',
+                                     headers=["Source Text Token Count", "Predicted Token Count", "Token Used"],
+                                     datatype=["str", "str", "str"],
+                                     interactive=False,
+                                     visible=False,
+                                     wrap=True)
+
     submit_btn = gr.Button("Run GraphGen")

     # Test Connection
@@ -377,17 +419,32 @@ def run_graphgen(*arguments: list, progress=gr.Progress()):
         inputs=if_trainee_model,
         outputs=[trainee_model, quiz_samples, edge_sampling])

+    # Count the tokens of the uploaded file
+    upload_file.change(
+        lambda x: (gr.update(visible=True)),
+        inputs=[upload_file],
+        outputs=[token_counter],
+    ).then(
+        count_tokens,
+        inputs=[upload_file, tokenizer, token_counter],
+        outputs=[token_counter],
+    )
+
     # run GraphGen
     submit_btn.click(
+        lambda x: (gr.update(visible=False)),
+        inputs=[token_counter],
+        outputs=[token_counter],
+    ).then(
         run_graphgen,
         inputs=[
             if_trainee_model, upload_file, tokenizer, qa_form,
             bidirectional, expand_method, max_extra_edges, max_tokens,
             max_depth, edge_sampling, isolated_node_strategy,
             loss_strategy, base_url, synthesizer_model, trainee_model,
-            api_key, chunk_size
+            api_key, chunk_size, token_counter
         ],
-        outputs=[output],
+        outputs=[output, token_counter],
     )

 if __name__ == "__main__":
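
Aside: the upload_file and submit_btn wiring above relies on Gradio event chaining, where a cheap callback toggles the table's visibility first and .then(...) runs the heavier update afterwards. A stripped-down sketch of that pattern, with made-up component names rather than the app's real ones:

import gradio as gr

def fill_stats(path):
    # Stand-in for the real work (e.g. tokenizing the uploaded file).
    return [[str(path), "pending", "N/A"]]

with gr.Blocks() as demo:
    file_in = gr.File(label="Upload")
    stats = gr.DataFrame(headers=["Source", "Predicted", "Used"],
                         visible=False, interactive=False)

    # Step 1: reveal the (still empty) table as soon as a file arrives.
    # Step 2: compute and fill the row once the first callback has finished.
    file_in.change(
        lambda _: gr.update(visible=True),
        inputs=[file_in],
        outputs=[stats],
    ).then(
        fill_stats,
        inputs=[file_in],
        outputs=[stats],
    )

demo.launch()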

webui/count_tokens.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+import os
+import sys
+import json
+import pandas as pd
+root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(root_dir)
+
+from graphgen.models import Tokenizer
+
+def count_tokens(file, tokenizer_name, data_frame):
+    if file.endswith(".jsonl"):
+        with open(file, "r", encoding='utf-8') as f:
+            data = [json.loads(line) for line in f]
+    elif file.endswith(".json"):
+        with open(file, "r", encoding='utf-8') as f:
+            data = json.load(f)
+        data = [item for sublist in data for item in sublist]
+    elif file.endswith(".txt"):
+        with open(file, "r", encoding='utf-8') as f:
+            data = f.read()
+        chunks = [
+            data[i:i + 512] for i in range(0, len(data), 512)
+        ]
+        data = [{"content": chunk} for chunk in chunks]
+    else:
+        raise ValueError(f"Unsupported file type: {file}")
+
+    tokenizer = Tokenizer(tokenizer_name)
+
+    # Count tokens
+    token_count = 0
+
+    for item in data:
+        if isinstance(item, dict):
+            content = item.get("content", "")
+        else:
+            content = item
+        token_count += len(tokenizer.encode_string(content))
+
+    _update_data = [[
+        str(token_count),
+        str(token_count * 50),
+        "N/A"
+    ]]
+
+    try:
+        new_df = pd.DataFrame(
+            _update_data,
+            columns=data_frame.columns
+        )
+        data_frame = new_df
+
+    except Exception as e:
+        print("[ERROR] DataFrame operation error:", str(e))
+
+    return data_frame
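
A rough usage sketch for the new helper outside of Gradio. The file path and tokenizer name are placeholders (whatever graphgen.models.Tokenizer accepts), and the seed DataFrame simply mirrors the three columns the webui table uses:

import pandas as pd

from count_tokens import count_tokens  # assumes the webui/ directory is on sys.path

seed = pd.DataFrame(
    [["N/A", "N/A", "N/A"]],
    columns=["Source Text Token Count", "Predicted Token Count", "Token Used"],
)

# "corpus.jsonl" and "cl100k_base" are placeholder values for illustration.
stats = count_tokens("corpus.jsonl", "cl100k_base", seed)
print(stats.iloc[0, 0])  # token count of the source text, as a string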
