-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
UsenPang
committed
Jan 21, 2025
1 parent
8b5103e
commit 9039bcd
Showing
7 changed files
with
383 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# 支持的模型:kimi、deepseek、qwen、glm、openai | ||
MODEL=kimi | ||
# api key | ||
API_KEY= | ||
# 按照多少个字符分批翻译 | ||
BATCH_MAX_CHARTS=2000 | ||
# 失败最大重试次数 | ||
MAX_RETRIES=4 | ||
# po文件原目录 | ||
FROM_DIR= | ||
# 翻译后的目录, 为空的话直接覆盖原文件 | ||
TO_DIR= | ||
# 目标语言 | ||
TARGET_LANG=中文 | ||
|
||
# 代理配置,使用openai时请使用代理 | ||
#http_proxy= | ||
#https_proxy= |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
# 项目名称:PO-Translator | ||
## 项目简介: | ||
PO-Translator 是一个基于大语言模型(LLM)的自动化翻译工具,专门用于处理 .po 文件的国际化(i18n)和本地化(l10n)任务。该项目利用先进的大语言模型(如 kimi、deepseek、qwen、glm、openai 或其他类似模型)提供高质量的翻译,同时结合 Python 编程语言的强大功能,为开发者和翻译团队提供了一个高效、智能的解决方案。 | ||
## 主要功能: | ||
- 智能翻译:利用大语言模型(LLM)进行高质量的自动翻译,支持多种语言。 | ||
- 自动翻译:支持从源语言(如英语)到目标语言(如中文、法语等)的自动翻译。 | ||
- 多语言支持:支持多种目标语言,满足不同地区和用户的需求。 | ||
- 文件管理:自动处理 .po 文件的读取、翻译和保存,支持批量处理。 | ||
|
||
## 使用场景: | ||
- 软件开发:帮助开发者快速完成软件的国际化和本地化任务。 | ||
- 翻译团队:提高翻译效率,减少重复工作。 | ||
- 多语言项目:适用于需要支持多种语言的软件项目。 | ||
|
||
## 如何使用: | ||
安装依赖: | ||
```bash | ||
pip install -r requirements.txt | ||
``` | ||
配置环境变量: | ||
编辑 .env 文件,配置翻译 API 密钥和其他参数。 | ||
运行脚本: | ||
```bash | ||
python main.py | ||
``` | ||
查看输出: | ||
翻译后的 .po 文件将保存到指定目录。 | ||
|
||
## 更多模型: | ||
该程序已经内置了一些默认模型可供选择,如有更多的模型需求,请编辑config.py | ||
```python | ||
MODEL_CONFIG_DICT = { | ||
'kimi-128k': { | ||
'model': 'moonshot-v1-128k', | ||
'base_url': 'https://api.moonshot.cn/v1', | ||
'prompt': PROMPT, | ||
'temperature': 0.3, | ||
'rpm': 3, | ||
} | ||
} | ||
``` | ||
将会根据你环境变量中的`MODEL`找到对应的配置。 | ||
|
||
temperature:采样温度,较高的值将使输出更加随机,而较低的值将使其更加集中和确定性。调整该参数会影响翻译的效果。 | ||
prompt:提示词,其中prompt也是影响翻译效果的一大因素。 | ||
rpm:每分钟调用的次数,模型api调用会有限速,不同的模型限制不一样。 | ||
|
||
**注意**:以上的模型api应该支持openai SDK的调用。 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
import os | ||
import sys | ||
import logging | ||
from logging.handlers import RotatingFileHandler | ||
from dotenv import load_dotenv | ||
|
||
# Logging setup: every record goes both to a size-rotated file and to stdout.
log_file = 'logs/translation.log'

# Make sure the log directory exists. exist_ok=True replaces the separate
# exists() check, so there is no window between checking and creating.
log_dir = os.path.dirname(log_file)
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

logger = logging.getLogger('translation_logger')
logger.setLevel(logging.INFO)

# File handler: rotate once the file reaches 3 MiB and keep 5 old files.
# The encoding is pinned to UTF-8 because the messages logged by this project
# contain Chinese text; without it the file encoding depends on the platform
# default and such records may fail to encode.
file_handler = RotatingFileHandler(
    log_file,
    maxBytes=3 * 1024 * 1024,
    backupCount=5,
    encoding='utf-8',
)
file_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_formatter)
logger.addHandler(file_handler)

# Console handler: same format, written to stdout.
stream_handler = logging.StreamHandler(sys.stdout)
stream_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(stream_formatter)
logger.addHandler(stream_handler)
|
||
def _env_int(name, default):
    """Read an integer setting from the environment.

    :param name: name of the environment variable
    :param default: value returned when the variable is unset or blank
    :return: the parsed integer, or ``default``
    :raises ValueError: if the variable is set to something that is not an integer
    """
    raw = os.getenv(name)
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError:
        raise ValueError(f"环境变量 {name} 必须是整数, 当前值: {raw!r}") from None


# Load the .env file; override=True lets its values win over variables that
# are already present in the process environment.
load_dotenv(override=True)

# Target language interpolated into the prompt. Falls back to the value used
# in the shipped .env template so the prompt never contains the text "None".
TARGET_LANG = os.getenv('TARGET_LANG') or '中文'

# System prompt sent with every translation request.
# NOTE(review): the prompt text looks like it contains typos ("改语言" is
# probably "该语言", and "链接和、代码" has a stray "和、"). It is left untouched
# here because it is runtime text that influences model output - confirm with
# the owner before changing it.
PROMPT = f"""
- Role: 技术文档翻译专家
- Background: 作为资深技术文档翻译专家,专注于网页文本和 AI 领域的翻译工作,需确保翻译内容的专业性和清晰度,符合技术文档标准。
- Profile: 拥有扎实的多语言基础和对技术文档结构的深刻理解,能精准翻译技术术语和专业名词。
- Skills: 能准确理解并翻译技术术语,提供流畅翻译,保持原文专业性,确保翻译真实、准确、连贯。
- Goals: 将技术文档准确、专业地翻译成{TARGET_LANG},符合改语言读者习惯,保持原文格式和链接。
- Constrains: 保留网页链接和、代码和表格格式,仅输出译文,不破坏原来格式。
- OutputFormat: 使用如下 JSON 格式输出你的结果,仅输出译文。
- Workflow:
1. 接收由Json对象的英文内容。
2. 逐条准确翻译。
3. 审核校对确保流畅连贯。
4. 使用Json输出翻译结果。
5. 输出符合要求的最终翻译。
- Initialization: 在第一次对话中,直接输出译文,不附加额外说明。
"""

# Key into MODEL_CONFIG_DICT selecting which model configuration to use.
MODEL = os.getenv('MODEL')
API_KEY = os.getenv('API_KEY')
# Numeric settings fall back to the values of the shipped .env template
# instead of crashing with int(None) when the variable is missing.
BATCH_MAX_CHARTS = _env_int('BATCH_MAX_CHARTS', 2000)
MAX_RETRIES = _env_int('MAX_RETRIES', 4)
# Source directory of the .po files and optional output directory.
FROM_DIR = os.getenv('FROM_DIR')
TO_DIR = os.getenv('TO_DIR')
|
||
def _model_entry(model, base_url, temperature, rpm):
    """Build one model configuration record.

    Every record shares the module-level PROMPT; ``rpm`` is the per-minute
    request budget for the model's API.
    """
    return {
        'model': model,
        'base_url': base_url,
        'prompt': PROMPT,
        'temperature': temperature,
        'rpm': rpm,
    }


# Built-in model configurations, keyed by the value expected in MODEL.
MODEL_CONFIG_DICT = {
    'kimi': _model_entry('moonshot-v1-auto', 'https://api.moonshot.cn/v1', 0.3, 3),
    'deepseek': _model_entry('deepseek-chat', 'https://api.deepseek.com', 1.3, 60),
    'qwen': _model_entry(
        'qwen-plus',
        'https://dashscope.aliyuncs.com/compatible-mode/v1',
        0.7,
        1200,
    ),
    'glm': _model_entry('glm-4-plus', 'https://open.bigmodel.cn/api/paas/v4/', 0.95, 50),
    # base_url is None here - presumably the client then uses its default
    # endpoint; verify against the OpenAI SDK.
    'openai': _model_entry('gpt-4o', None, 0.8, 30),
}

# Configuration selected by MODEL; None when MODEL names no known entry.
MODEL_CONFIG = MODEL_CONFIG_DICT[MODEL] if MODEL in MODEL_CONFIG_DICT else None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
import polib | ||
import json | ||
from json import JSONDecodeError | ||
from tqdm import tqdm | ||
from openai import OpenAI | ||
from config import * | ||
from rate_limiter import RateLimiter | ||
|
||
|
||
class AiTranslator:
    """Translate text by sending it to a chat-completions endpoint.

    The model name, endpoint, system prompt, temperature and request budget
    all come from the ``config`` mapping passed to the constructor.
    """

    def __init__(self, config, api_key):
        """
        :param config: mapping with 'model', 'base_url' and 'prompt' keys and
                       optional 'rpm' and 'temperature' keys
        :param api_key: API key handed to the OpenAI client
        """
        self.config = config
        endpoint = config['base_url']
        self.client = OpenAI(api_key=api_key, base_url=endpoint)
        self.prompt = config['prompt']
        # Allow config['rpm'] calls (60 when absent) per 60-second window.
        calls_per_minute = config.get('rpm', 60)
        self.rate_limiter = RateLimiter(calls_per_minute, 60)

    def _create_completion(self, msgs):
        """Issue one non-streaming chat request that asks for a JSON object."""
        return self.client.chat.completions.create(
            model=self.config['model'],
            messages=msgs,
            stream=False,
            response_format={"type": "json_object"},
            temperature=self.config.get('temperature', 1)
        )

    def translate_text(self, text):
        """Send ``text`` to the model and return the reply content.

        A reply cut off with finish_reason "length" is continued by appending
        the partial message to the conversation and asking again; the pieces
        are concatenated. Any exception restarts the whole exchange, up to
        MAX_RETRIES attempts.

        :param text: user message content to translate
        :return: the concatenated reply, or '' when every attempt failed
        """
        request_completion = self.rate_limiter(self._create_completion)
        final_attempt = MAX_RETRIES - 1

        for attempt in range(MAX_RETRIES):
            # Each attempt starts over with a fresh conversation.
            conversation = [
                {"role": "system", "content": self.prompt},
                {"role": "user", "content": text},
            ]
            pieces = []
            try:
                finish_reason = "length"
                while finish_reason == "length":
                    choice = request_completion(conversation).choices[0]
                    conversation.append(choice.message)
                    pieces.append(choice.message.content)
                    finish_reason = choice.finish_reason
                return ''.join(pieces)
            except Exception as e:
                logger.error(f"尝试 {attempt + 1} 次失败:{e}")
                if attempt >= final_attempt:
                    logger.error("已达到最大重试次数。返回空。")

        return ''
|
||
|
||
class PoWalkTranslator:
    """Walk a directory tree and translate the untranslated entries of every .po file.

    Files are written below ``dest`` with the same relative layout as under
    ``src``; when ``dest`` is empty each file is overwritten in place.
    """

    def __init__(self, translator: "AiTranslator", src: str, dest: str = None):
        """
        :param translator: object whose translate_text(str) returns the model reply
        :param src: root directory that is searched for .po files
        :param dest: root directory for translated files; falsy means overwrite
        """
        self.translator = translator
        self.src = src
        self.dest = dest

    def run(self):
        """Translate every file ending in ".po" found below ``self.src``."""
        for root, _dirs, files in os.walk(self.src):
            logger.info(f"正在翻译目录{root}")
            for file in files:
                if file.endswith(".po"):
                    file_path = os.path.join(root, file)
                    self.translate_po_file(file_path, self._destination_for(file_path))

    def _destination_for(self, file_path):
        """Return where the translation of ``file_path`` should be saved.

        The path is rebuilt from the file's location relative to ``self.src``.
        A plain string replace of src by dest would also rewrite any later
        occurrence of the src text inside the path (sub-directory names, the
        file extension) and depends on trailing separators.
        """
        if not self.dest:
            return file_path
        return os.path.join(self.dest, os.path.relpath(file_path, self.src))

    @staticmethod
    def _split_batches(entries, max_chars):
        """Group entries into batches whose summed msgid length stays within ``max_chars``.

        An entry that is longer than the limit on its own gets a batch of its
        own. No empty batch is ever produced.
        """
        batches = []
        current_batch = []
        current_length = 0
        for entry in entries:
            entry_length = len(entry.msgid)
            # Only close a batch that actually holds something; otherwise an
            # oversized first entry would emit an empty batch.
            if current_batch and current_length + entry_length > max_chars:
                batches.append(current_batch)
                current_batch = []
                current_length = 0
            current_batch.append(entry)
            current_length += entry_length
        if current_batch:
            batches.append(current_batch)
        return batches

    @staticmethod
    def _apply_translations(batch, translated_map):
        """Write translated strings into the entries of ``batch``.

        :param batch: entries in the order used to number the request
        :param translated_map: mapping of "index" -> translated text
        :return: number of items skipped because the key matches no entry of
                 the batch or the value is not a string
        """
        entries_by_key = {str(index): entry for index, entry in enumerate(batch)}
        skipped = 0
        for key, msgstr in translated_map.items():
            # Use the batch entry itself rather than searching the file by
            # msgid: that is a linear scan per item and always hits the first
            # entry when several entries share a msgid.
            entry = entries_by_key.get(key)
            if entry is None or not isinstance(msgstr, str):
                skipped += 1
                continue
            if entry.msgid_plural:
                # Fill the plural forms the entry already has, reusing its own
                # keys so that key types are never mixed. Falls back to forms
                # 0 and 1 when the entry has none.
                # NOTE(review): recent polib appears to key msgstr_plural by
                # int - confirm for the pinned version.
                for form in list(entry.msgstr_plural) or [0, 1]:
                    entry.msgstr_plural[form] = msgstr
            else:
                entry.msgstr = msgstr
        return skipped

    def translate_po_file(self, file_path, new_file_path=None):
        """Translate the untranslated entries of one .po file and save it.

        Batches that fail (empty reply or a reply that is not a JSON object)
        are logged and skipped; their entries stay untranslated.

        :param file_path: path of the .po file to read
        :param new_file_path: path to save to; falsy means overwrite ``file_path``
        """
        logger.info(f"正在翻译: {file_path}")

        po = polib.pofile(file_path)
        untranslated_entries = po.untranslated_entries()

        if not untranslated_entries:
            logger.info("没有需要翻译的条目,跳过翻译。")
            return

        batches = self._split_batches(untranslated_entries, BATCH_MAX_CHARTS)

        # Translate batch by batch.
        for batch in tqdm(batches, desc="翻译进度"):
            batch_map = {str(index): entry.msgid for index, entry in enumerate(batch)}
            content = json.dumps(batch_map)
            translated_content = self.translator.translate_text(content)

            # Check for an empty reply before parsing, so a failed request is
            # not also reported as a JSON error.
            if not translated_content:
                logger.info(f"翻译失败原文: {content}")
                logger.warning("警告: 批次翻译失败,保持原有的空翻译")
                continue

            try:
                translated_batch_map = json.loads(translated_content)
            except JSONDecodeError as e:
                logger.error(e.msg)
                logger.error("可能是翻译的条目过多,丢失该部分翻译,请尝试修改 BATCH_MAX_CHARTS 配置重新运行")
                continue

            # Valid JSON that is not an object (e.g. a list) cannot be mapped
            # back to the entries.
            if not isinstance(translated_batch_map, dict):
                logger.info(f"翻译失败原文: {content}")
                logger.warning("警告: 批次翻译失败,保持原有的空翻译")
                continue

            skipped = self._apply_translations(batch, translated_batch_map)
            if skipped:
                logger.warning(f"有 {skipped} 条翻译结果无法匹配原文, 已跳过")

        # Save the translated file.
        to_file_path = new_file_path or file_path
        self.ensure_directory_exists(to_file_path)
        po.save(to_file_path)
        logger.info(f"已保存翻译后的文件到: {to_file_path}\n")

    @staticmethod
    def ensure_directory_exists(file_path):
        """
        Create the directory part of ``file_path`` if it does not exist yet.
        :param file_path: full path of the target file
        """
        directory = os.path.dirname(file_path)
        if directory:  # the path has a directory component
            os.makedirs(directory, exist_ok=True)
|
||
|
||
if __name__ == "__main__":
    # Script entry point: translate every .po file below FROM_DIR.
    # Fail early with a readable message: an unknown MODEL leaves MODEL_CONFIG
    # as None, which would otherwise surface as a TypeError when the
    # translator reads config['base_url'].
    if MODEL_CONFIG is None:
        logger.error(f"不支持的模型: {MODEL}, 可选值: {', '.join(MODEL_CONFIG_DICT)}")
        raise SystemExit(1)
    # An empty or missing source directory would make the walk do nothing
    # without any hint to the user.
    if not FROM_DIR or not os.path.isdir(FROM_DIR):
        logger.error(f"FROM_DIR 不是有效的目录: {FROM_DIR}")
        raise SystemExit(1)

    translator = AiTranslator(MODEL_CONFIG, API_KEY)
    po_walk_translator = PoWalkTranslator(translator, FROM_DIR, TO_DIR)
    po_walk_translator.run()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import time | ||
from collections import deque | ||
|
||
|
||
class RateLimiter:
    """Sliding-window rate limiter that is used as a decorator.

    At most ``max_calls`` calls of the decorated function are started within
    any ``period`` seconds; a call that would exceed the budget sleeps until
    the oldest recorded call leaves the window. All functions decorated with
    the same instance share one budget.
    """

    def __init__(self, max_calls, period):
        """
        Initialise the limiter.
        :param max_calls: maximum number of calls allowed per period (>= 1)
        :param period: length of the window in seconds
        :raises ValueError: if max_calls is smaller than 1
        """
        if max_calls < 1:
            # With no budget at all the wrapper could never admit a call.
            raise ValueError("max_calls must be at least 1")
        self.max_calls = max_calls
        self.period = period
        # Token-bucket state. Only _refill_tokens updates it; the decorator
        # itself works from call_times.
        self.tokens = max_calls
        # The monotonic clock is used throughout: only intervals matter, and
        # they must not change when the system clock is adjusted.
        self.last_refill_time = time.monotonic()
        # Start times of the calls that are still inside the window, oldest first.
        self.call_times = deque()

    def _refill_tokens(self):
        """
        Add the tokens earned since the last refill, capped at max_calls.
        """
        current_time = time.monotonic()
        elapsed_time = current_time - self.last_refill_time
        self.last_refill_time = current_time

        # Tokens accrue linearly: max_calls tokens per period.
        new_tokens = elapsed_time / self.period * self.max_calls
        self.tokens = min(self.max_calls, self.tokens + new_tokens)

    def _remove_expired_calls(self, current_time=None):
        """
        Drop recorded calls that are at least ``period`` seconds old.
        :param current_time: reference time; defaults to the current monotonic time
        """
        if current_time is None:
            current_time = time.monotonic()
        # "<=" so that a call exactly one period old no longer counts; this is
        # what lets a caller proceed right after sleeping the computed time.
        while self.call_times and self.call_times[0] <= current_time - self.period:
            self.call_times.popleft()

    def __call__(self, func):
        """
        Decorator: limit how often ``func`` is called.
        :param func: callable to wrap
        :return: wrapper with the same signature and metadata as ``func``
        """
        import functools  # local import keeps this block self-contained

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Re-check after every sleep instead of assuming one sleep is enough.
            while True:
                current_time = time.monotonic()
                self._remove_expired_calls(current_time)
                if len(self.call_times) < self.max_calls:
                    break
                # Budget used up: wait until the oldest call leaves the window.
                time.sleep(self.call_times[0] + self.period - current_time)

            # Record this call, then run the wrapped function.
            self.call_times.append(current_time)
            return func(*args, **kwargs)

        return wrapper
|
Oops, something went wrong.