diff --git a/dataflow_agent/toolkits/image2drawio/__init__.py b/dataflow_agent/toolkits/image2drawio/__init__.py index b70cabc3..1ac55e45 100644 --- a/dataflow_agent/toolkits/image2drawio/__init__.py +++ b/dataflow_agent/toolkits/image2drawio/__init__.py @@ -9,6 +9,8 @@ save_masked_rgba, bbox_iou_px, ) +from .metric_evaluator import evaluate as metric_evaluate +from .refinement_processor import refine as refinement_refine __all__ = [ "classify_shape", @@ -18,4 +20,6 @@ "sample_fill_stroke", "save_masked_rgba", "bbox_iou_px", + "metric_evaluate", + "refinement_refine", ] diff --git a/dataflow_agent/toolkits/image2drawio/metric_evaluator.py b/dataflow_agent/toolkits/image2drawio/metric_evaluator.py new file mode 100644 index 00000000..e93f8263 --- /dev/null +++ b/dataflow_agent/toolkits/image2drawio/metric_evaluator.py @@ -0,0 +1,692 @@ +""" +metric_evaluator.py — Image2DrawIO quality evaluation module. + +Computes a content-coverage score and detects uncovered "bad regions" +that need fallback rescue. Works with Paper2Any's dict-based element +format (kind/bbox_px/image_path …). 
+ +Core idea: + score = covered_content_pixels / total_content_pixels × 100 + +Three-channel bad-region detection: + Fine — small icons / sub-figures (0.05 %–20 % of image) + Coarse — panels / large images (0.2 %–30 %) + Complex— high-variance areas without base64 (heatmaps, photos …) + +Usage: + from dataflow_agent.toolkits.image2drawio.metric_evaluator import evaluate + + result = evaluate( + image_path="input.png", + elements=[...], # from _build_elements_from_sam3 + text_blocks=[...], # from _text_node + output_dir="outputs/xx", # optional, saves debug images + ) + print(result["score"], result["bad_regions"]) +""" + +from __future__ import annotations + +import json +import os +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +import cv2 +import numpy as np +from dataflow_agent.logger import get_logger + +log = get_logger(__name__) + +# ======================== helpers ======================== + +def _bbox_area(bbox: List[int]) -> int: + return max(0, bbox[2] - bbox[0]) * max(0, bbox[3] - bbox[1]) + + +def _bbox_iou(a: List[int], b: List[int]) -> float: + xa = max(a[0], b[0]) + ya = max(a[1], b[1]) + xb = min(a[2], b[2]) + yb = min(a[3], b[3]) + inter = max(0, xb - xa) * max(0, yb - ya) + area_a = _bbox_area(a) + area_b = _bbox_area(b) + union = area_a + area_b - inter + return inter / union if union > 0 else 0.0 + + +# ======================== configuration ======================== + +DEFAULT_CONFIG: Dict[str, Any] = { + # content mask + "content_threshold": 240, # 更严格:灰度<240即为内容(原245太宽松) + "use_edge_detection": True, + "edge_low": 25, # 更敏感的边缘检测(原30) + "edge_high": 80, # 降低高阈值以捕获更多边缘(原100) + "denoise_kernel": 2, + "min_content_area": 20, # 保留更小的内容区域(原30) + + # fine channel — 检测小图标/人脸/小子图 + "fine_min_ratio": 0.0003, # 更敏感:0.03%(原0.05%) + "fine_max_ratio": 0.25, # 扩大到25%(原20%) + "fine_min_fill": 0.12, # 更宽松的填充率(原0.15) + "fine_max_aspect": 10.0, # 允许更大宽高比(原8.0) + + # coarse channel — 检测版块/大图 + "coarse_min_ratio": 0.001, # 
更敏感:0.1%(原0.2%) + "coarse_max_ratio": 0.35, # 扩大到35%(原30%) + "coarse_min_fill": 0.15, # 更宽松的填充率(原0.20) + "coarse_max_aspect": 10.0, # 允许更大宽高比(原8.0) + "coarse_kernel": 7, # 稍大的核以合并相邻内容(原5) + + # NMS / dedup + "nms_iou": 0.25, # 更严格的NMS(原0.3) + "existing_iou": 0.30, # 更积极过滤:与已有元素 IoU>30% 即跳过(原0.45) + "max_covered_ratio": 0.65, # 降低已覆盖容忍度(原0.7) + "min_missing_ratio": 0.03, # 更敏感的漏检内容检测(原0.05) + + # merge + "merge_distance_ratio": 0.08, # 更保守的合并距离(原0.10) + "small_region_threshold": 0.025, # 更小的小区域阈值(原0.03) + + # text protection + "text_pad_px": 15, # 文字 bbox 外扩像素,弥补 OCR 框偏小(原8太小) + "text_overlap_skip": 0.35, # 候选区域被文字覆盖 ≥ 35% 则跳过(原0.5太高,文字区域容易漏过) +} + +# ======================== public API ======================== + +def evaluate( + image_path: str, + elements: List[Dict[str, Any]], + text_blocks: Optional[List[Dict[str, Any]]] = None, + output_dir: Optional[str] = None, + config: Optional[Dict[str, Any]] = None, +) -> Dict[str, Any]: + """ + Evaluate how well existing elements cover the image content. + + Returns dict with keys: + score – 0..100 (100 = perfect coverage) + bad_regions – list of {bbox, area, area_ratio, channel, …} + needs_refinement – bool + metrics – detailed numbers for debugging + """ + cfg = {**DEFAULT_CONFIG, **(config or {})} + + cv2_image = cv2.imread(image_path) + if cv2_image is None: + log.error(f"[MetricEvaluator] Cannot read image: {image_path}") + return {"score": 0, "bad_regions": [], "needs_refinement": False, "metrics": {}} + + h, w = cv2_image.shape[:2] + img_area = h * w + + # 1. content mask (foreground pixels) + content_mask = _create_content_mask(cv2_image, cfg) + total_content = int(np.count_nonzero(content_mask)) + + # 2. covered mask (elements that produced actual output) + covered_mask, existing_bboxes = _create_covered_mask( + elements, text_blocks or [], h, w, cfg + ) + + # 3. build text-only mask (with padding) for text-overlap filtering + text_bboxes = _collect_text_bboxes(text_blocks or [], h, w, cfg) + + # 4. 
pixel coverage + covered_content = int(np.count_nonzero(cv2.bitwise_and(content_mask, covered_mask))) + pixel_coverage = (covered_content / total_content * 100) if total_content > 0 else 100.0 + + # 5. uncovered content + uncovered = cv2.bitwise_and(content_mask, cv2.bitwise_not(covered_mask)) + + # 6. detect bad regions (three channels) + bad_regions = _detect_bad_regions( + cv2_image, content_mask, covered_mask, uncovered, + existing_bboxes, text_bboxes, elements, img_area, cfg, + ) + + # 7. score = 100 − bad-region area ratio (de-duplicated) + bad_mask = np.zeros((h, w), dtype=np.uint8) + for r in bad_regions: + x1, y1, x2, y2 = r["bbox"] + bad_mask[max(0, y1):min(h, y2), max(0, x1):min(w, x2)] = 255 + bad_ratio = float(np.count_nonzero(bad_mask) / img_area * 100) if img_area > 0 else 0.0 + score = max(0.0, 100.0 - bad_ratio) + + needs_refinement = len(bad_regions) > 0 + + metrics = { + "score": round(score, 2), + "pixel_coverage": round(pixel_coverage, 2), + "total_content_px": total_content, + "covered_content_px": covered_content, + "image_area": img_area, + "element_count": len(elements), + "text_block_count": len(text_blocks or []), + "bad_region_count": len(bad_regions), + "bad_region_ratio": round(bad_ratio, 2), + } + + log.info( + f"[MetricEvaluator] score={score:.1f}, " + f"bad_regions={len(bad_regions)}, bad_ratio={bad_ratio:.1f}%" + ) + + # 8. 
optional visualisation / JSON dump + if output_dir: + _save_debug(cv2_image, covered_mask, uncovered, bad_regions, + metrics, needs_refinement, score, output_dir) + + return { + "score": round(score, 2), + "bad_regions": bad_regions, + "needs_refinement": needs_refinement, + "metrics": metrics, + } + + +# ======================== content mask ======================== + +def _create_content_mask(img: np.ndarray, cfg: dict) -> np.ndarray: + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + h, w = gray.shape + + # threshold + mask_gray = ((gray < cfg["content_threshold"]).astype(np.uint8)) * 255 + + # edge detection (optional) + if cfg.get("use_edge_detection", True): + edges = cv2.Canny(gray, cfg["edge_low"], cfg["edge_high"]) + edges = cv2.dilate(edges, np.ones((5, 5), np.uint8), iterations=2) + mask_gray = cv2.bitwise_or(mask_gray, edges) + + # denoise + ks = cfg.get("denoise_kernel", 2) + if ks > 0: + mask_gray = cv2.morphologyEx(mask_gray, cv2.MORPH_OPEN, np.ones((ks, ks), np.uint8)) + + # remove tiny connected components + min_cc = cfg.get("min_content_area", 20) + if min_cc > 0: + n_labels, labels, stats, _ = cv2.connectedComponentsWithStats(mask_gray, connectivity=8) + clean = np.zeros_like(mask_gray) + for i in range(1, n_labels): + if stats[i, cv2.CC_STAT_AREA] >= min_cc: + clean[labels == i] = 255 + mask_gray = clean + + return mask_gray + + +# ======================== covered mask ======================== + +def _create_covered_mask( + elements: List[Dict], + text_blocks: List[Dict], + h: int, w: int, + cfg: dict, +) -> Tuple[np.ndarray, List[List[int]]]: + """ + Build a mask of regions that already have real output. 
+ + Rules: + - shape with fill/stroke → covered (矢量已还原) + - image with existing image_path → covered + - text block with geometry → covered (带 padding 扩展) + """ + mask = np.zeros((h, w), dtype=np.uint8) + bboxes: List[List[int]] = [] + + for el in elements: + bbox = el.get("bbox_px") + if not bbox or len(bbox) != 4: + continue + kind = el.get("kind", "") + + # image kind: must have a valid file to count + if kind == "image": + ip = el.get("image_path", "") + if not ip or not os.path.exists(ip): + continue + + x1, y1, x2, y2 = [int(v) for v in bbox] + x1, y1 = max(0, x1), max(0, y1) + x2, y2 = min(w, x2), min(h, y2) + if x2 > x1 and y2 > y1: + mask[y1:y2, x1:x2] = 255 + bboxes.append([x1, y1, x2, y2]) + + # 文字 bbox 带 padding 写入 covered mask + pad = cfg.get("text_pad_px", 8) + for blk in text_blocks: + geo = blk.get("geometry", {}) + x = int(float(geo.get("x", 0))) + y = int(float(geo.get("y", 0))) + bw = int(float(geo.get("width", 0))) + bh = int(float(geo.get("height", 0))) + x1, y1 = max(0, x - pad), max(0, y - pad) + x2, y2 = min(w, x + bw + pad), min(h, y + bh + pad) + if x2 > x1 and y2 > y1: + mask[y1:y2, x1:x2] = 255 + bboxes.append([x1, y1, x2, y2]) + + return mask, bboxes + + +def _collect_text_bboxes( + text_blocks: List[Dict], h: int, w: int, cfg: dict, +) -> List[List[int]]: + """Collect padded text bboxes for text-overlap filtering.""" + pad = cfg.get("text_pad_px", 8) + bboxes: List[List[int]] = [] + for blk in text_blocks: + geo = blk.get("geometry", {}) + x = int(float(geo.get("x", 0))) + y = int(float(geo.get("y", 0))) + bw = int(float(geo.get("width", 0))) + bh = int(float(geo.get("height", 0))) + x1, y1 = max(0, x - pad), max(0, y - pad) + x2, y2 = min(w, x + bw + pad), min(h, y + bh + pad) + if x2 > x1 and y2 > y1: + bboxes.append([x1, y1, x2, y2]) + return bboxes + + +# ======================== bad-region detection ======================== + +def _detect_bad_regions( + cv2_image: np.ndarray, + content_mask: np.ndarray, + covered_mask: np.ndarray, 
+ uncovered: np.ndarray, + existing_bboxes: List[List[int]], + text_bboxes: List[List[int]], + elements: List[Dict], + img_area: int, + cfg: dict, +) -> List[Dict[str, Any]]: + h, w = cv2_image.shape[:2] + candidates: List[Tuple[List[int], str]] = [] + + # fine channel + for box in _channel_cc(uncovered, img_area, + cfg["fine_min_ratio"], cfg["fine_max_ratio"], + cfg["fine_min_fill"], cfg["fine_max_aspect"]): + candidates.append((box, "fine")) + + # coarse channel + k = cfg["coarse_kernel"] + closed = cv2.morphologyEx(uncovered, cv2.MORPH_CLOSE, + cv2.getStructuringElement(cv2.MORPH_RECT, (k, k))) + for box in _channel_cc(closed, img_area, + cfg["coarse_min_ratio"], cfg["coarse_max_ratio"], + cfg["coarse_min_fill"], cfg["coarse_max_aspect"]): + candidates.append((box, "coarse")) + + # complex channel (high-variance regions without element coverage) + for box in _detect_complex(cv2_image, elements, covered_mask, img_area): + candidates.append((box, "complex")) + + # banner channel — 横幅/标题栏: 宽度 ≥ 图片宽度 40%, 宽高比 > 10 + # 普通通道会因 max_aspect 过滤掉这类区域, 但横跨画面的标题栏应该保留 + for box in _channel_cc(uncovered, img_area, + cfg.get("banner_min_ratio", 0.003), + cfg.get("banner_max_ratio", 0.15), + cfg.get("banner_min_fill", 0.08), + max_aspect=100.0): # 放宽宽高比 + bw = box[2] - box[0] + bh = box[3] - box[1] + # 必须宽度 ≥ 图片宽度的 40%(排除普通窄条噪音) + if bw >= w * 0.4 and bh >= 10: + candidates.append((box, "banner")) + + log.info( + f"[MetricEvaluator] candidates: " + f"fine={sum(1 for _, c in candidates if c == 'fine')}, " + f"coarse={sum(1 for _, c in candidates if c == 'coarse')}, " + f"complex={sum(1 for _, c in candidates if c == 'complex')}, " + f"banner={sum(1 for _, c in candidates if c == 'banner')}" + ) + + # small-box-first NMS + candidates = _nms_small_first(candidates, cfg["nms_iou"]) + + # filter vs existing elements + text overlap + coverage check + regions = _filter_candidates( + candidates, covered_mask, existing_bboxes, text_bboxes, + uncovered, img_area, cfg, + ) + + # merge 
nearby small regions + merge_dist = min(h, w) * cfg.get("merge_distance_ratio", 0.10) + regions = _merge_nearby(regions, merge_dist, img_area, + cfg.get("small_region_threshold", 0.03)) + + regions.sort(key=lambda r: r["area"], reverse=True) + return regions + + +def _channel_cc( + mask: np.ndarray, img_area: int, + min_ratio: float, max_ratio: float, + min_fill: float, max_aspect: float, +) -> List[List[int]]: + """Connected-component channel: returns list of bboxes.""" + min_a = img_area * min_ratio + max_a = img_area * max_ratio + n, _, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8) + boxes: List[List[int]] = [] + for i in range(1, n): + x, y, rw, rh, cc_area = stats[i] + if rw <= 0 or rh <= 0: + continue + ba = rw * rh + if ba < min_a or ba > max_a: + continue + if max(rw, rh) / max(1, min(rw, rh)) > max_aspect: + continue + if cc_area / ba < min_fill: + continue + boxes.append([int(x), int(y), int(x + rw), int(y + rh)]) + return boxes + + +def _detect_complex( + cv2_image: np.ndarray, + elements: List[Dict], + covered_mask: np.ndarray, + img_area: int, +) -> List[List[int]]: + """Detect high-complexity regions not covered by any element. + + NOTE: text regions have already been painted into covered_mask + (with padding), so they are excluded from `uncovered_hi` via the + bitwise_and(hi, ~covered_mask) step. 
+ """ + h, w = cv2_image.shape[:2] + gray = cv2.cvtColor(cv2_image, cv2.COLOR_BGR2GRAY).astype(np.float32) + + ks = max(21, min(h, w) // 50) + if ks % 2 == 0: + ks += 1 + + local_mean = cv2.blur(gray, (ks, ks)) + local_var = cv2.blur(gray ** 2, (ks, ks)) - local_mean ** 2 + local_var = np.maximum(local_var, 0) + + edges = cv2.Canny(gray.astype(np.uint8), 30, 100) + edge_density = cv2.blur(edges.astype(np.float32), (ks, ks)) + + var_norm = local_var / (local_var.max() + 1e-6) + edge_norm = edge_density / (edge_density.max() + 1e-6) + complexity = var_norm * 0.6 + edge_norm * 0.4 + + thresh = np.percentile(complexity, 75) + hi = (complexity > thresh).astype(np.uint8) * 255 + hi = cv2.morphologyEx(hi, cv2.MORPH_CLOSE, np.ones((15, 15), np.uint8)) + + uncovered_hi = cv2.bitwise_and(hi, cv2.bitwise_not(covered_mask)) + uncovered_hi = cv2.morphologyEx(uncovered_hi, cv2.MORPH_OPEN, np.ones((7, 7), np.uint8)) + uncovered_hi = cv2.morphologyEx(uncovered_hi, cv2.MORPH_CLOSE, np.ones((51, 51), np.uint8)) + + min_a = img_area * 0.002 + max_a = img_area * 0.30 + n, _, stats, _ = cv2.connectedComponentsWithStats(uncovered_hi, connectivity=8) + boxes: List[List[int]] = [] + for i in range(1, n): + x, y, rw, rh, _ = stats[i] + ba = rw * rh + if ba < min_a or ba > max_a: + continue + if max(rw, rh) / max(1, min(rw, rh)) > 8: + continue + boxes.append([int(x), int(y), int(x + rw), int(y + rh)]) + return boxes + + +# ======================== NMS / filtering ======================== + +def _nms_small_first( + candidates: List[Tuple[List[int], str]], + iou_thresh: float, +) -> List[Tuple[List[int], str]]: + """Keep smaller boxes, suppress larger overlapping ones.""" + if not candidates: + return [] + items = [(b, c, _bbox_area(b)) for b, c in candidates] + items.sort(key=lambda x: x[2]) # ascending area + keep: List[Tuple[List[int], str]] = [] + suppressed = [False] * len(items) + for i, (bi, ci, _) in enumerate(items): + if suppressed[i]: + continue + keep.append((bi, ci)) + for j in 
range(i + 1, len(items)): + if not suppressed[j] and _bbox_iou(bi, items[j][0]) > iou_thresh: + suppressed[j] = True + return keep + + +def _text_overlap_ratio(candidate: List[int], text_bboxes: List[List[int]]) -> float: + """计算 candidate 区域被文字 bbox 覆盖的面积占比. + + 将所有与 candidate 相交的文字区域的交集面积累加(使用 mask 去重), + 然后除以 candidate 面积。 + """ + c_area = _bbox_area(candidate) + if c_area <= 0 or not text_bboxes: + return 0.0 + + cx1, cy1, cx2, cy2 = candidate + cw, ch = cx2 - cx1, cy2 - cy1 + + # 用小 mask 精确计算覆盖面积(避免多个文字 bbox 重叠导致重复计算) + tmask = np.zeros((ch, cw), dtype=np.uint8) + for tb in text_bboxes: + # 相交区域(相对于 candidate 的局部坐标) + lx1 = max(0, tb[0] - cx1) + ly1 = max(0, tb[1] - cy1) + lx2 = min(cw, tb[2] - cx1) + ly2 = min(ch, tb[3] - cy1) + if lx2 > lx1 and ly2 > ly1: + tmask[ly1:ly2, lx1:lx2] = 255 + + covered = int(np.count_nonzero(tmask)) + return covered / c_area + + +def _filter_candidates( + candidates: List[Tuple[List[int], str]], + covered_mask: np.ndarray, + existing_bboxes: List[List[int]], + text_bboxes: List[List[int]], + uncovered: np.ndarray, + img_area: int, + cfg: dict, +) -> List[Dict[str, Any]]: + iou_thresh = cfg["existing_iou"] + max_covered = cfg["max_covered_ratio"] + min_missing = cfg["min_missing_ratio"] + text_skip = cfg.get("text_overlap_skip", 0.5) + + regions: List[Dict[str, Any]] = [] + for box, channel in candidates: + x1, y1, x2, y2 = int(box[0]), int(box[1]), int(box[2]), int(box[3]) + area = int(_bbox_area(box)) + is_complex = channel == "complex" + + # skip if high IoU with an existing element + eff_iou = 0.8 if is_complex else iou_thresh + if any(_bbox_iou(box, eb) > eff_iou for eb in existing_bboxes): + continue + + # skip if candidate is mostly covered by union of existing element bboxes + # (handles cases where no single element has high IoU but collectively they cover it) + if not is_complex and existing_bboxes: + cw, ch = x2 - x1, y2 - y1 + if cw > 0 and ch > 0: + union_mask = np.zeros((ch, cw), dtype=np.uint8) + for eb in 
existing_bboxes: + lx1 = max(0, eb[0] - x1) + ly1 = max(0, eb[1] - y1) + lx2 = min(cw, eb[2] - x1) + ly2 = min(ch, eb[3] - y1) + if lx2 > lx1 and ly2 > ly1: + union_mask[ly1:ly2, lx1:lx2] = 255 + union_covered = float(np.count_nonzero(union_mask)) / (cw * ch) + if union_covered > 0.50: + continue + + # ★ skip if candidate is predominantly text + # 如果候选区域被文字 bbox 覆盖的面积 ≥ text_overlap_skip,则认为是文字区域,跳过 + if text_bboxes and _text_overlap_ratio([x1, y1, x2, y2], text_bboxes) >= text_skip: + continue + + # skip if mostly already covered (except complex) + if not is_complex: + roi = covered_mask[max(0, y1):min(covered_mask.shape[0], y2), + max(0, x1):min(covered_mask.shape[1], x2)] + if roi.size > 0 and float(np.mean(roi > 0)) > max_covered: + continue + + # missing content pixels + roi_unc = uncovered[max(0, y1):min(uncovered.shape[0], y2), + max(0, x1):min(uncovered.shape[1], x2)] + missing_px = int(np.count_nonzero(roi_unc)) if not is_complex else area + + if not is_complex and area > 0 and missing_px < area * min_missing: + continue + + regions.append({ + "bbox": [x1, y1, x2, y2], + "area": area, + "area_ratio": round(area / img_area, 4) if img_area > 0 else 0, + "missing_pixels": missing_px, + "channel": channel, + "reason": "complex_image" if is_complex else "uncovered_content", + "description": ( + f"({x1},{y1})-({x2},{y2}) " + f"{'complex image' if is_complex else 'uncovered'} [{channel}]" + ), + }) + return regions + + +# ======================== merge ======================== + +def _merge_nearby( + regions: List[Dict], + merge_dist: float, + img_area: int, + small_thresh: float, +) -> List[Dict]: + if len(regions) <= 1: + return regions + + large = [r for r in regions if r["area_ratio"] >= small_thresh] + small = [r for r in regions if r["area_ratio"] < small_thresh] + if len(small) <= 1: + return regions + + def _dist(a, b): + dx = max(0, max(a[0], b[0]) - min(a[2], b[2])) + dy = max(0, max(a[1], b[1]) - min(a[3], b[3])) + return max(dx, dy) + + n = len(small) 
+ parent = list(range(n)) + + def _find(x): + while parent[x] != x: + parent[x] = parent[parent[x]] + x = parent[x] + return x + + for i in range(n): + for j in range(i + 1, n): + if _dist(small[i]["bbox"], small[j]["bbox"]) < merge_dist: + pi, pj = _find(i), _find(j) + if pi != pj: + parent[pi] = pj + + groups: Dict[int, List[int]] = {} + for i in range(n): + groups.setdefault(_find(i), []).append(i) + + merged: List[Dict] = [] + for indices in groups.values(): + if len(indices) == 1: + merged.append(small[indices[0]]) + else: + bxs = [small[i]["bbox"] for i in indices] + mb = [int(min(b[0] for b in bxs)), int(min(b[1] for b in bxs)), + int(max(b[2] for b in bxs)), int(max(b[3] for b in bxs))] + ma = int(_bbox_area(mb)) + merged.append({ + "bbox": mb, + "area": ma, + "area_ratio": round(ma / img_area, 4) if img_area > 0 else 0, + "missing_pixels": sum(small[i]["missing_pixels"] for i in indices), + "channel": "merged", + "reason": "merged_regions", + "description": f"merged {len(indices)} small regions", + }) + return large + merged + + +# ======================== debug output ======================== + +def _save_debug( + cv2_image, covered_mask, uncovered, bad_regions, + metrics, needs_refinement, score, output_dir, +): + os.makedirs(output_dir, exist_ok=True) + + # visualisation + vis = cv2_image.copy() + overlay = cv2_image.copy() + h, w = vis.shape[:2] + for i, r in enumerate(bad_regions): + x1, y1, x2, y2 = r["bbox"] + x1, y1 = max(0, x1), max(0, y1) + x2, y2 = min(w, x2), min(h, y2) + cv2.rectangle(overlay, (x1, y1), (x2, y2), (0, 0, 255), -1) + cv2.rectangle(vis, (x1, y1), (x2, y2), (0, 0, 255), 3) + label = f"#{i+1} {r['channel']} ({r['area_ratio']*100:.1f}%)" + (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2) + cv2.rectangle(vis, (x1, y1 - th - 8), (x1 + tw + 6, y1), (0, 0, 255), -1) + cv2.putText(vis, label, (x1 + 3, y1 - 4), + cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2) + result_img = cv2.addWeighted(overlay, 0.25, vis, 
0.75, 0) + cv2.imwrite(str(Path(output_dir) / "metric_eval.png"), result_img) + + # JSON + def _native(o): + if isinstance(o, (np.integer,)): + return int(o) + if isinstance(o, (np.floating,)): + return float(o) + if isinstance(o, np.ndarray): + return o.tolist() + if isinstance(o, list): + return [_native(x) for x in o] + if isinstance(o, dict): + return {k: _native(v) for k, v in o.items()} + return o + + with open(str(Path(output_dir) / "metric_eval.json"), "w", encoding="utf-8") as f: + json.dump( + { + "score": round(float(score), 2), + "needs_refinement": bool(needs_refinement), + "metrics": {k: _native(v) for k, v in metrics.items()}, + "bad_regions": [{k: _native(v) for k, v in r.items()} for r in bad_regions], + }, + f, + ensure_ascii=False, + indent=2, + ) + log.info(f"[MetricEvaluator] debug saved to {output_dir}") diff --git a/dataflow_agent/toolkits/image2drawio/refinement_processor.py b/dataflow_agent/toolkits/image2drawio/refinement_processor.py new file mode 100644 index 00000000..b8663725 --- /dev/null +++ b/dataflow_agent/toolkits/image2drawio/refinement_processor.py @@ -0,0 +1,333 @@ +""" +refinement_processor.py — Fallback rescue for uncovered regions. + +Takes bad regions from metric_evaluator, crops them from the original +image, saves as PNG, and returns new element dicts compatible with +the existing Paper2Any _render_xml_node format. + +Strategy (conservative): + - Crop the region from the original image + - Save as PNG file + - Return as kind="image" element with image_path + - Skip regions that are >95% white or too small + +Usage: + from dataflow_agent.toolkits.image2drawio.refinement_processor import refine + + new_elements = refine( + image_path="input.png", + bad_regions=[...], # from metric_evaluator + existing_elements=[...], + output_dir="outputs/xx", + ) + # new_elements are dicts with kind="image", bbox_px, image_path, etc. 
+""" + +from __future__ import annotations + +import os +from pathlib import Path +from typing import Any, Dict, List, Optional + +import cv2 +import numpy as np + +from dataflow_agent.logger import get_logger + +log = get_logger(__name__) + +# ======================== configuration ======================== + +DEFAULT_CONFIG: Dict[str, Any] = { + "min_region_area": 100, # skip regions smaller than this (px) + "min_region_ratio": 0.0005, # skip regions smaller than 0.05% of image + "expand_margin": 5, # expand crop by N pixels each side + "skip_mostly_white": True, # skip regions that are almost all white + "white_threshold": 0.95, # ratio of white pixels to skip + "white_pixel_value": 245, # grayscale > this = "white" + "skip_mostly_text": True, # skip regions that are mostly thin text strokes + "text_stroke_threshold": 0.55, # ratio: if >55% of dark pixels are thin strokes → text(原0.80太严格) +} + + +# ======================== public API ======================== + +def refine( + image_path: str, + bad_regions: List[Dict[str, Any]], + existing_elements: List[Dict[str, Any]], + output_dir: str, + config: Optional[Dict[str, Any]] = None, +) -> List[Dict[str, Any]]: + """ + Process bad regions and return new image elements. + + Args: + image_path: path to the original image + bad_regions: list of dicts from metric_evaluator (each has "bbox") + existing_elements: current element list (for ID numbering) + output_dir: directory to save cropped PNGs + + Returns: + List of new element dicts (kind="image") ready for _render_xml_node. + These should be appended to existing_elements by the caller. 
+ """ + cfg = {**DEFAULT_CONFIG, **(config or {})} + + if not bad_regions: + log.info("[Refinement] No bad regions to process") + return [] + + cv2_image = cv2.imread(image_path) + if cv2_image is None: + log.error(f"[Refinement] Cannot read image: {image_path}") + return [] + + h, w = cv2_image.shape[:2] + img_area = h * w + + crop_dir = Path(output_dir) / "refinement_crops" + crop_dir.mkdir(parents=True, exist_ok=True) + + min_area = cfg["min_region_area"] + min_ratio = cfg["min_region_ratio"] + margin = cfg["expand_margin"] + + # Collect existing element bboxes for overlap checking + existing_bboxes: List[List[int]] = [] + for el in existing_elements: + bbox = el.get("bbox_px") + if bbox and len(bbox) == 4: + existing_bboxes.append([int(v) for v in bbox]) + + new_elements: List[Dict[str, Any]] = [] + skipped = 0 + + # Generate IDs that don't collide with existing elements + max_existing_id = 0 + for el in existing_elements: + eid = el.get("id", "") + if isinstance(eid, str): + # extract numeric part from "s42", "i13", etc. 
+ digits = "".join(c for c in eid if c.isdigit()) + if digits: + max_existing_id = max(max_existing_id, int(digits)) + elif isinstance(eid, int): + max_existing_id = max(max_existing_id, eid) + + next_id = max_existing_id + 1 + + for i, region in enumerate(bad_regions): + bbox = region.get("bbox") + if not bbox or len(bbox) != 4: + skipped += 1 + continue + + x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]) + area = (x2 - x1) * (y2 - y1) + + # size filter + if area < min_area or (img_area > 0 and area < img_area * min_ratio): + log.debug(f"[Refinement] Region {i} too small ({area}px), skip") + skipped += 1 + continue + + # white filter + if cfg.get("skip_mostly_white", True) and _is_mostly_white( + cv2_image, [x1, y1, x2, y2], + cfg["white_pixel_value"], cfg["white_threshold"] + ): + log.debug(f"[Refinement] Region {i} mostly white, skip") + skipped += 1 + continue + + # overlap filter: skip if region is significantly covered by existing elements + if existing_bboxes: + rw, rh = x2 - x1, y2 - y1 + if rw > 0 and rh > 0: + overlap_mask = np.zeros((rh, rw), dtype=np.uint8) + for eb in existing_bboxes: + lx1 = max(0, eb[0] - x1) + ly1 = max(0, eb[1] - y1) + lx2 = min(rw, eb[2] - x1) + ly2 = min(rh, eb[3] - y1) + if lx2 > lx1 and ly2 > ly1: + overlap_mask[ly1:ly2, lx1:lx2] = 255 + overlap_ratio = float(np.count_nonzero(overlap_mask)) / (rw * rh) + if overlap_ratio > 0.40: + log.debug(f"[Refinement] Region {i} overlaps {overlap_ratio:.0%} with existing elements, skip") + skipped += 1 + continue + + # text-stroke filter: skip if region is mostly thin text strokes + # Exception: banner channel regions are OCR-missed titles that MUST be kept + is_banner = region.get("channel") == "banner" + if not is_banner and cfg.get("skip_mostly_text", True) and _is_mostly_text( + cv2_image, [x1, y1, x2, y2], + cfg.get("text_stroke_threshold", 0.70) + ): + log.debug(f"[Refinement] Region {i} mostly text strokes, skip") + skipped += 1 + continue + + # expand margin 
+ cx1 = max(0, x1 - margin) + cy1 = max(0, y1 - margin) + cx2 = min(w, x2 + margin) + cy2 = min(h, y2 + margin) + + # crop and save + crop = cv2_image[cy1:cy2, cx1:cx2] + if crop.size == 0: + skipped += 1 + continue + + crop_path = str(crop_dir / f"refine_{i}.png") + cv2.imwrite(crop_path, crop) + + # build element dict compatible with _render_xml_node + new_elements.append({ + "id": f"r{next_id}", + "kind": "image", + "bbox_px": [cx1, cy1, cx2, cy2], + "image_path": crop_path, + "area": (cx2 - cx1) * (cy2 - cy1), + "group": "refinement", + "prompt": "fallback_crop", + "_source": "refinement", + "_channel": region.get("channel", "unknown"), + }) + next_id += 1 + + log.info( + f"[Refinement] Done: {len(new_elements)} new elements, " + f"{skipped} skipped" + ) + + # save visualisation + if new_elements: + _save_visualization(cv2_image, new_elements, existing_elements, output_dir) + + return new_elements + + +# ======================== helpers ======================== + +def _is_mostly_white( + cv2_image: np.ndarray, + bbox: List[int], + white_value: int = 245, + threshold: float = 0.95, +) -> bool: + """Check if a region is mostly white/empty.""" + x1, y1, x2, y2 = bbox + h, w = cv2_image.shape[:2] + x1, y1 = max(0, x1), max(0, y1) + x2, y2 = min(w, x2), min(h, y2) + if x2 <= x1 or y2 <= y1: + return True + + roi = cv2_image[y1:y2, x1:x2] + gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) + white_count = int(np.count_nonzero(gray > white_value)) + total = gray.size + return (white_count / total) > threshold if total > 0 else True + + +def _is_mostly_text( + cv2_image: np.ndarray, + bbox: List[int], + threshold: float = 0.70, +) -> bool: + """Check if a region is mostly text strokes (dark-on-light OR light-on-dark). + + Detects both: + - Dark text on light background (standard documents) + - Light/white text on dark background (dark-theme panels, banners) + + Heuristic: text = strokes of foreground color on uniform background. 
+ Uses adaptive kernel sizing based on region height to handle both + small body text (thin strokes) and large bold titles (thick strokes). + After morphological opening, text strokes disappear but filled shapes remain. + """ + x1, y1, x2, y2 = bbox + h, w = cv2_image.shape[:2] + x1, y1 = max(0, x1), max(0, y1) + x2, y2 = min(w, x2), min(h, y2) + if x2 <= x1 or y2 <= y1: + return False + + roi = cv2_image[y1:y2, x1:x2] + gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) + total = gray.size + if total < 100: + return False + + roi_h = y2 - y1 + + # Determine polarity: is it dark-on-light or light-on-dark? + light_ratio = float(np.count_nonzero(gray > 200)) / total + dark_ratio = float(np.count_nonzero(gray < 60)) / total + + if light_ratio >= 0.55: + # Case 1: light background, dark text strokes + fg_mask = (gray < 180).astype(np.uint8) * 255 + elif dark_ratio >= 0.55: + # Case 2: dark background, light text strokes + fg_mask = (gray > 80).astype(np.uint8) * 255 + else: + # Mixed / mid-tone → probably not a text region + return False + + fg_count = int(np.count_nonzero(fg_mask)) + if fg_count < 10: + return False # almost no foreground + + # Adaptive kernel: larger regions may have thicker text (bold titles, headers) + # Use ~15% of region height as kernel size, clamped to [3, 11] + ks = max(3, min(11, int(roi_h * 0.15))) + if ks % 2 == 0: + ks += 1 # must be odd + + kernel = np.ones((ks, ks), np.uint8) + opened = cv2.morphologyEx(fg_mask, cv2.MORPH_OPEN, kernel) + thick_count = int(np.count_nonzero(opened)) + + # thin_ratio = fraction of foreground pixels that are thin (removed by opening) + thin_ratio = 1.0 - (thick_count / fg_count) if fg_count > 0 else 0.0 + + return thin_ratio >= threshold + + +def _save_visualization( + cv2_image: np.ndarray, + new_elements: List[Dict], + existing_elements: List[Dict], + output_dir: str, +): + """Save a debug image showing original + new elements.""" + vis = cv2_image.copy() + h, w = vis.shape[:2] + + # existing elements in blue + 
for el in existing_elements: + bbox = el.get("bbox_px") + if not bbox or len(bbox) != 4: + continue + x1, y1, x2, y2 = [int(v) for v in bbox] + cv2.rectangle(vis, (x1, y1), (x2, y2), (200, 100, 0), 1) + + # new (refinement) elements in red + for i, el in enumerate(new_elements): + bbox = el.get("bbox_px") + if not bbox or len(bbox) != 4: + continue + x1, y1, x2, y2 = [int(v) for v in bbox] + cv2.rectangle(vis, (x1, y1), (x2, y2), (0, 0, 255), 2) + label = f"NEW-{i}" + cv2.putText(vis, label, (x1, y1 - 5), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1) + + out_path = str(Path(output_dir) / "refinement_result.png") + cv2.imwrite(out_path, vis) + log.info(f"[Refinement] Visualisation saved: {out_path}") diff --git a/dataflow_agent/toolkits/image2drawio/utils.py b/dataflow_agent/toolkits/image2drawio/utils.py index 22b88445..16df6881 100644 --- a/dataflow_agent/toolkits/image2drawio/utils.py +++ b/dataflow_agent/toolkits/image2drawio/utils.py @@ -107,19 +107,6 @@ def sample_fill_stroke(image_bgr: np.ndarray, mask: np.ndarray) -> Tuple[str, st if stroke_pixels.size == 0: stroke_pixels = image_bgr[mask] - # Select darkest quartile by luminance - if stroke_pixels.size > 0: - rgb = stroke_pixels[:, ::-1].astype(np.float32) - lum = 0.2126 * rgb[:, 0] + 0.7152 * rgb[:, 1] + 0.0722 * rgb[:, 2] - if lum.size > 10: - thresh = np.percentile(lum, 25) - sel = stroke_pixels[lum <= thresh] - else: - sel = stroke_pixels - stroke = tuple(np.mean(sel, axis=0).tolist()) - else: - stroke = (0, 0, 0) - # Fill: erode mask to remove border erode_k = max(1, int(min(h, w) * 0.004)) erode_k = min(erode_k, 7) @@ -134,14 +121,42 @@ def sample_fill_stroke(image_bgr: np.ndarray, mask: np.ndarray) -> Tuple[str, st else: fill = (255, 255, 255) + # Stroke: detect real border vs anti-aliased edge + if stroke_pixels.size > 0: + stroke_median = tuple(np.median(stroke_pixels, axis=0).tolist()) + # Compute luminance for stroke candidate and fill + def _lum_bgr(bgr): + return 0.0722 * bgr[0] + 0.7152 * 
bgr[1] + 0.2126 * bgr[2] + stroke_lum = _lum_bgr(stroke_median) + fill_lum = _lum_bgr(fill) + + # Check if edge pixels contain a distinctly dark border + rgb = stroke_pixels[:, ::-1].astype(np.float32) + lum = 0.2126 * rgb[:, 0] + 0.7152 * rgb[:, 1] + 0.0722 * rgb[:, 2] + dark_ratio = float(np.count_nonzero(lum < 50)) / max(1, len(lum)) + + if dark_ratio > 0.3: + # A significant portion of edge pixels are truly dark → real border + thresh = np.percentile(lum, 25) + sel = stroke_pixels[lum <= thresh] + stroke = tuple(np.mean(sel, axis=0).tolist()) + elif stroke_lum < 30 and fill_lum > 80: + # Stroke looks black but fill is colored → no real border, + # use slightly darkened fill + stroke = tuple(min(255, max(0, c * 0.7)) for c in fill) + else: + stroke = stroke_median + else: + stroke = (0, 0, 0) + return _to_hex(fill), _to_hex(stroke) def extract_text_color(image_bgr: np.ndarray, bbox_px: List[int]) -> str: x1, y1, x2, y2 = bbox_px - x1 = max(0, min(image_bgr.shape[1] - 1, int(x1))) + x1 = max(0, min(image_bgr.shape[1], int(x1))) x2 = max(0, min(image_bgr.shape[1], int(x2))) - y1 = max(0, min(image_bgr.shape[0] - 1, int(y1))) + y1 = max(0, min(image_bgr.shape[0], int(y1))) y2 = max(0, min(image_bgr.shape[0], int(y2))) if x2 <= x1 or y2 <= y1: return "#000000" diff --git a/dataflow_agent/workflow/deprecated/wf_image2drawio.py b/dataflow_agent/workflow/deprecated/wf_image2drawio.py index 571739f3..f51db93d 100644 --- a/dataflow_agent/workflow/deprecated/wf_image2drawio.py +++ b/dataflow_agent/workflow/deprecated/wf_image2drawio.py @@ -387,6 +387,7 @@ async def _ocr_node(state: Paper2FigureState) -> Paper2FigureState: log.warning(f"[image2drawio] no_text mask failed: {e}") state.no_text_path = "" + state.ocr_items = ocr_items return state diff --git a/dataflow_agent/workflow/wf_paper2drawio_sam3.py b/dataflow_agent/workflow/wf_paper2drawio_sam3.py index 29674b82..d2d4efba 100644 --- a/dataflow_agent/workflow/wf_paper2drawio_sam3.py +++ 
b/dataflow_agent/workflow/wf_paper2drawio_sam3.py @@ -55,6 +55,8 @@ save_masked_rgba, bbox_iou_px, ) +from dataflow_agent.toolkits.image2drawio.metric_evaluator import evaluate as metric_evaluate +from dataflow_agent.toolkits.image2drawio.refinement_processor import refine as refinement_refine from dataflow_agent.utils_common import robust_parse_json from dataflow_agent.workflow.sam3_segment_hint import ( dedupe_prompts, @@ -65,6 +67,7 @@ log = get_logger(__name__) # ==================== SAM3 PROMPTS (ported from Edit-Banana/prompts) ==================== +# 基本图形:覆盖主流流程图/架构图的所有几何元素 SHAPE_PROMPT = [ "rectangle", "rounded rectangle", @@ -73,6 +76,9 @@ "circle", "triangle", "hexagon", + "parallelogram", + "cylinder", + "cloud", ] ARROW_PROMPT = [ @@ -81,6 +87,7 @@ "connector", ] +# 图片类:覆盖各类非矢量化内容 IMAGE_PROMPT = [ "icon", "symbol", @@ -105,7 +112,7 @@ "blob", ] -# 泛化补召回提示词:避免与具体业务词绑定(如 planner/critic/robot) +# 泛化补召回提示词:低阈值兜底,避免与具体业务词绑定 IMAGE_PROMPT_RECALL = [ "illustration", "object", @@ -128,6 +135,8 @@ "container", "filled region", "background", + "section panel", + "title bar", ] SAM3_GROUPS = { @@ -139,22 +148,22 @@ # Thresholds aligned with Edit-Banana config defaults SAM3_GROUP_CONFIG = { - "shape": {"score_threshold": 0.5, "min_area": 200, "priority": 3}, - "arrow": {"score_threshold": 0.45, "min_area": 50, "priority": 4}, - "image": {"score_threshold": 0.5, "min_area": 100, "priority": 2}, - "background": {"score_threshold": 0.25, "min_area": 500, "priority": 1}, + "shape": {"score_threshold": 0.45, "min_area": 150, "priority": 3}, + "arrow": {"score_threshold": 0.40, "min_area": 30, "priority": 4}, + "image": {"score_threshold": 0.45, "min_area": 80, "priority": 2}, + "background": {"score_threshold": 0.20, "min_area": 400, "priority": 1}, } # 第2轮 image 召回配置(低阈值 + 动态最小面积) -SAM3_IMAGE_RECALL_SCORE_THRESHOLD = 0.38 -SAM3_IMAGE_RECALL_MIN_AREA_BASE = 40 -SAM3_IMAGE_RECALL_MIN_AREA_RATIO = 0.00003 -SAM3_IMAGE_RECALL_TRIGGER_MAX_IMAGES = 2 
+SAM3_IMAGE_RECALL_SCORE_THRESHOLD = 0.35 +SAM3_IMAGE_RECALL_MIN_AREA_BASE = 30 +SAM3_IMAGE_RECALL_MIN_AREA_RATIO = 0.00002 +SAM3_IMAGE_RECALL_TRIGGER_MAX_IMAGES = 4 # Dedup params aligned with Edit-Banana defaults -SAM3_DEDUP_IOU = 0.7 -SAM3_ARROW_DEDUP_IOU = 0.85 -SAM3_SHAPE_IMAGE_IOU = 0.6 +SAM3_DEDUP_IOU = 0.65 +SAM3_ARROW_DEDUP_IOU = 0.80 +SAM3_SHAPE_IMAGE_IOU = 0.55 MAX_DRAWIO_ELEMENTS = 800 MIN_IMAGE_AREA_RATIO = 0.00001 @@ -950,6 +959,12 @@ def _shape_style( base = "shape=triangle;" elif st in {"hexagon"}: base = "shape=hexagon;perimeter=hexagonPerimeter2;fixedSize=1;" + elif st in {"parallelogram"}: + base = "shape=parallelogram;perimeter=parallelogramPerimeter;fixedSize=1;" + elif st in {"cylinder"}: + base = "shape=cylinder3;boundedLbl=1;backgroundOutline=1;size=15;" + elif st in {"cloud"}: + base = "ellipse;shape=cloud;" elif st in {"container", "rounded rectangle", "rounded_rect", "rounded rectangle"}: base = "rounded=1;" else: @@ -1123,7 +1138,7 @@ def _shape_type_from_prompt(prompt: str) -> str: p = normalize_prompt(prompt) if p in {"rounded rectangle", "rounded_rectangle"}: return "rounded rectangle" - if p in {"rectangle", "square", "panel", "background", "filled region", "title bar", "section_panel"}: + if p in {"rectangle", "square", "panel", "background", "filled region", "title bar", "section_panel", "section panel"}: return "rectangle" if p in {"container"}: return "rounded rectangle" @@ -1135,9 +1150,182 @@ def _shape_type_from_prompt(prompt: str) -> str: return "triangle" if p in {"hexagon"}: return "hexagon" + if p in {"parallelogram"}: + return "parallelogram" + if p in {"cylinder"}: + return "cylinder" + if p in {"cloud"}: + return "cloud" return p or "rectangle" +# ==================== CV BACKGROUND PANEL DETECTION ==================== +# 当 SAM3 没有检测到任何 background 组时,使用 CV 方法补充检测大面积 +# 色块面板(典型的海报/PPT 中的深色或浅色背景面板)。 + +# 面板检测的最小/最大面积比例 +_BG_MIN_AREA_RATIO = 0.02 # ≥ 2% 画面面积 +_BG_MAX_AREA_RATIO = 0.85 # ≤ 85% +_BG_MAX_ASPECT = 12.0 # 最大宽高比 
+_BG_MAX_PANELS = 12  # keep at most 12 panels
+_BG_IOU_DEDUP = 0.3  # IoU above which two candidate panels count as duplicates
+_BG_EXISTING_IOU = 0.6  # IoU above which a panel duplicates an existing element
+_BG_MIN_CONTAINED = 2  # a panel must contain at least N existing elements to be a "container"
+_BG_SMALL_PANEL_RATIO = 0.08  # panels below 8% of the image area must pass the container check
+
+
+def _detect_background_panels_cv(
+    image_bgr: np.ndarray,
+    existing_elements: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """
+    Detect large rectangular panels with classical CV and return them as background shapes.
+
+    Pipeline: Canny edges -> contours -> large near-rectangular boxes -> NMS -> colour sampling.
+    Intended for posters/slides that are split into clearly bounded rectangular regions.
+
+    Args:
+        image_bgr: Image to analyse; it is converted with ``cv2.COLOR_BGR2GRAY`` and its
+            pixels are reshaped to 3 columns, so a 3-channel BGR array is required.
+        existing_elements: Already-detected elements; only entries with a 4-item
+            ``bbox_px`` are used (for de-duplication and the container check).
+
+    Returns:
+        Shape dicts (``kind="shape"``, ``group="background"``, ``prompt="cv_panel"``),
+        largest panels first; an empty list when nothing qualifies.
+    """
+    h, w = image_bgr.shape[:2]
+    img_area = h * w
+    panels: List[Dict[str, Any]] = []
+
+    # Collect the bounding boxes of the existing elements.
+    existing_bboxes = []
+    for el in existing_elements:
+        bbox = el.get("bbox_px")
+        if bbox and len(bbox) == 4:
+            existing_bboxes.append(bbox)
+
+    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
+
+    # Edge detection, then dilation to connect broken outline segments.
+    edges = cv2.Canny(gray, 20, 60)
+    edges = cv2.dilate(edges, np.ones((3, 3), np.uint8), iterations=2)
+
+    # [-2] is the contour list on both OpenCV 3.x (3-tuple) and 4.x (2-tuple).
+    contours = cv2.findContours(edges, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]
+
+    # Keep large, roughly rectangular contours as candidate bboxes.
+    candidates: List[List[int]] = []
+    for cnt in contours:
+        area = cv2.contourArea(cnt)
+        if area < img_area * _BG_MIN_AREA_RATIO:
+            continue
+        peri = cv2.arcLength(cnt, True)
+        approx = cv2.approxPolyDP(cnt, 0.02 * peri, True)
+        if len(approx) < 4 or len(approx) > 8:
+            continue
+        x, y, rw, rh = cv2.boundingRect(cnt)
+        ba = rw * rh
+        rect_fill = area / ba if ba > 0 else 0
+        if rect_fill < 0.5:  # contour must fill >= 50% of its bbox to count as rectangular
+            continue
+        ratio = ba / img_area
+        if ratio > _BG_MAX_AREA_RATIO:
+            continue
+        aspect = max(rw, rh) / max(1, min(rw, rh))
+        if aspect > _BG_MAX_ASPECT:
+            continue
+        candidates.append([int(x), int(y), int(x + rw), int(y + rh)])
+
+    if not candidates:
+        return []
+
+    # NMS: largest first, dropping boxes that overlap a kept one (inner/outer contour pairs).
+    candidates.sort(key=_bbox_area, reverse=True)
+    kept: List[List[int]] = []
+    for bbox in candidates:
+        if not any(bbox_iou_px(bbox, kb) > _BG_IOU_DEDUP for kb in kept):
+            kept.append(bbox)
+        if len(kept) >= _BG_MAX_PANELS:
+            break
+
+    # Drop panels that largely coincide with an element that already exists.
+    final: List[List[int]] = [
+        bbox for bbox in kept
+        if not any(bbox_iou_px(bbox, eb) > _BG_EXISTING_IOU for eb in existing_bboxes)
+    ]
+
+    # Build shape elements, sampling colours from a band just inside each panel's border.
+    # Small panels (< 8% of the image) must contain at least N existing elements to be
+    # kept as "containers"; large panels (>= 8%) are usually layout panels and kept as-is.
+    for idx, bbox in enumerate(final):
+        x1, y1, x2, y2 = bbox
+        panel_ratio = _bbox_area(bbox) / img_area
+
+        # Container check for small panels: count existing elements inside the panel.
+        if panel_ratio < _BG_SMALL_PANEL_RATIO:
+            contained = 0
+            for eb in existing_bboxes:
+                # An element counts as contained when its centre lies inside the panel.
+                ecx = (eb[0] + eb[2]) / 2
+                ecy = (eb[1] + eb[3]) / 2
+                if x1 <= ecx <= x2 and y1 <= ecy <= y2:
+                    contained += 1
+            if contained < _BG_MIN_CONTAINED:
+                log.debug(
+                    f"[paper2drawio_sam3] CV panel [{x1},{y1},{x2},{y2}] "
+                    f"ratio={panel_ratio*100:.1f}% skipped: only {contained} "
+                    f"contained elements (need ≥{_BG_MIN_CONTAINED})"
+                )
+                continue
+        roi = image_bgr[y1:y2, x1:x2]
+        if roi.size == 0:
+            continue
+        rh_roi, rw_roi = roi.shape[:2]
+        border_w = max(3, min(rw_roi, rh_roi) // 20)
+
+        # Pixels in the border band on all four sides.
+        border_pixels = np.concatenate([
+            roi[:border_w, :].reshape(-1, 3),  # top
+            roi[-border_w:, :].reshape(-1, 3),  # bottom
+            roi[:, :border_w].reshape(-1, 3),  # left
+            roi[:, -border_w:].reshape(-1, 3),  # right
+        ], axis=0)
+
+        fill_bgr = np.median(border_pixels, axis=0).astype(int)
+        fill_hex = "#{:02x}{:02x}{:02x}".format(
+            int(fill_bgr[2]), int(fill_bgr[1]), int(fill_bgr[0])
+        )
+        # Stroke colour: a slightly darker shade of the fill.
+        darker = np.clip(fill_bgr * 0.7, 0, 255).astype(int)
+        stroke_hex = "#{:02x}{:02x}{:02x}".format(
+            int(darker[2]), int(darker[1]), int(darker[0])
+        )
+
+        panels.append({
+            "id": f"bg{idx}",
+            "kind": "shape",
+            "shape_type": "rectangle",
+            "bbox_px": bbox,
+            "fill": fill_hex,
+            "stroke": stroke_hex,
+            "text": "",
+            "text_color": None,
+            "font_size": None,
+            "area": _bbox_area(bbox),
+            "group": "background",
+            "prompt": "cv_panel",
+        })
+
+    if panels:
+        log.info(
+            f"[paper2drawio_sam3] CV background panels: "
+            f"{len(panels)} detected, areas={[round(p['area']/img_area*100,1) for p in panels]}%"
+        )
+
+    return panels
+
 
 def _sam3_predict_groups(
     client: Any,
     image_path: str,
@@ -1172,6 +1360,14 @@ def _sam3_predict_groups(
         image_path=image_path,
         runs=base_runs,
     )
+
+    # Diagnostic: log per-group counts before dedup
+    _pre_dedup: Dict[str, int] = {}
+    for item in all_results:
+        g = str(item.get("group", "unknown"))
+        _pre_dedup[g] = _pre_dedup.get(g, 0) + 1
+    log.info(f"[paper2drawio_sam3] SAM3 raw results (before dedup): {json.dumps(_pre_dedup)}")
+
     all_results = dedup_sam3_results_across_groups(
         all_results,
         group_config=SAM3_GROUP_CONFIG,
@@ -1434,6 +1630,15 @@ def _refine_low_coverage_image_mask(mask: np.ndarray, bbox: List[int]) -> Tuple[
 
     shapes.sort(key=lambda s: s.get("area", 0), reverse=True)
     images.sort(key=lambda s: s.get("area", 0), reverse=True)
+
+    # ---- CV fallback: detect background panels not found by SAM3 ----
+    bg_count = sum(1 for s in shapes if s.get("group") == "background")
+    if bg_count == 0:
+        cv_bg = _detect_background_panels_cv(image_bgr, shapes + images)
+        if cv_bg:
+            log.info(f"[paper2drawio_sam3] CV background detection: added {len(cv_bg)} panels")
+            shapes = cv_bg + shapes  # backgrounds go first (rendered at back)
+
     total = len(shapes) + len(images)
     if total > MAX_DRAWIO_ELEMENTS:
         keep = max(0, MAX_DRAWIO_ELEMENTS - len(shapes))
@@ -1499,15 +1704,16 @@ async def _text_node(state: Paper2DrawioState) -> Paper2DrawioState:
         temp_state.request.chat_api_key = api_key
 
         try:
-            vlm_timeout = int(os.getenv("VLM_OCR_TIMEOUT", "120"))
+            vlm_timeout = int(os.getenv("VLM_OCR_TIMEOUT", "180"))
         except ValueError:
-            vlm_timeout = 120
+            vlm_timeout = 180
+
         agent = create_vlm_agent(
             name="ImageTextBBoxAgent",
             model_name="qwen-vl-ocr-2025-11-20",
             chat_api_url=chat_api_url,
-            max_tokens=4096,
             vlm_mode="ocr",
+            max_tokens=8192,
             additional_params={"input_image": img_path, "timeout": vlm_timeout},
         )
new_state = await agent.execute(temp_state)
@@ -1640,6 +1846,92 @@ async def _build_elements_node(state: Paper2DrawioState) -> Paper2DrawioState:
         state.temp_data["fallback_hide_text_blocks"] = fallback_hide_text_blocks
         return state
 
+    async def _evaluate_node(state: Paper2DrawioState) -> Paper2DrawioState:
+        """Score element coverage of the input image and record uncovered regions.
+
+        Writes ``bad_regions``, ``eval_score`` and ``needs_refinement`` into
+        ``state.temp_data``. Evaluation is advisory: if the input image is
+        missing or the evaluator raises, refinement stays disabled and the
+        workflow proceeds to rendering instead of failing.
+        """
+        # Reset first so every exit path leaves consistent flags behind.
+        state.temp_data["bad_regions"] = []
+        state.temp_data["needs_refinement"] = False
+
+        img_path = state.temp_data.get("input_image_path")
+        if not img_path or not os.path.exists(img_path):
+            return state
+
+        elements = state.temp_data.get("drawio_elements", []) or []
+        text_blocks = state.temp_data.get("text_blocks", []) or []
+        base_dir = str(Path(_ensure_result_path(state)))
+
+        try:
+            eval_result = metric_evaluate(
+                image_path=img_path,
+                elements=elements,
+                text_blocks=text_blocks,
+                output_dir=base_dir,
+            )
+        except Exception as e:  # noqa: BLE001 - optional step must not abort the run
+            log.warning(f"[paper2drawio_sam3] Evaluation failed, skipping refinement: {e}")
+            return state
+
+        bad_regions = eval_result.get("bad_regions", []) or []
+        score = eval_result.get("score", 100)  # one default for both the state and the log
+        needs_refinement = bool(eval_result.get("needs_refinement", False))
+
+        state.temp_data["bad_regions"] = bad_regions
+        state.temp_data["eval_score"] = score
+        state.temp_data["needs_refinement"] = needs_refinement
+
+        log.info(
+            f"[paper2drawio_sam3] Evaluation: score={score:.1f}, "
+            f"bad_regions={len(bad_regions)}, "
+            f"needs_refinement={needs_refinement}"
+        )
+        return state
+
+    async def _refine_node(state: Paper2DrawioState) -> Paper2DrawioState:
+        """Append fallback image elements cropped from uncovered bad regions.
+
+        Does nothing unless ``needs_refinement`` is set and ``bad_regions`` is
+        non-empty in ``state.temp_data``. This is a best-effort rescue: if the
+        refiner raises, the failure is logged and the elements built so far
+        are rendered unchanged.
+        """
+        if not state.temp_data.get("needs_refinement", False):
+            return state
+
+        img_path = state.temp_data.get("input_image_path")
+        if not img_path or not os.path.exists(img_path):
+            return state
+
+        bad_regions = state.temp_data.get("bad_regions", [])
+        if not bad_regions:
+            return state
+
+        elements = state.temp_data.get("drawio_elements", []) or []
+        base_dir = str(Path(_ensure_result_path(state)))
+
+        try:
+            new_elements = refinement_refine(
+                image_path=img_path,
+                bad_regions=bad_regions,
+                existing_elements=elements,
+                output_dir=base_dir,
+            )
+        except Exception as e:  # noqa: BLE001 - optional step must not abort the run
+            log.warning(f"[paper2drawio_sam3] Refinement failed, keeping existing elements: {e}")
+            return state
+
+        if new_elements:
+            elements.extend(new_elements)
+            state.temp_data["drawio_elements"] = elements
+            log.info(f"[paper2drawio_sam3] Refinement: added {len(new_elements)} fallback elements")
+
+        return state
+
     async def _render_xml_node(state: Paper2DrawioState) -> Paper2DrawioState:
         img_path = state.temp_data.get("input_image_path")
         if not img_path or not os.path.exists(img_path):
@@ -1714,6 +2006,8 @@ async def _render_xml_node(state: Paper2DrawioState) -> Paper2DrawioState:
         "segment_hint": _segment_hint_node,
         "sam3": _sam3_node,
         "build_elements": _build_elements_node,
+        "evaluate": _evaluate_node,
+        "refine": _refine_node,
         "render_xml": _render_xml_node,
         "_end_": lambda s: s,
     }
@@ -1723,7 +2017,9 @@ async def _render_xml_node(state: Paper2DrawioState) -> Paper2DrawioState:
         ("text_ocr", "segment_hint"),
         ("segment_hint", "sam3"),
         ("sam3", "build_elements"),
-        ("build_elements", "render_xml"),
+        ("build_elements", "evaluate"),
+        ("evaluate", "refine"),
+        ("refine", "render_xml"),
         ("render_xml", "_end_"),
     ]
 
diff --git a/fastapi_app/schemas.py b/fastapi_app/schemas.py
index a849e454..5b4ca2ef 100644
--- a/fastapi_app/schemas.py
+++ b/fastapi_app/schemas.py
@@ -261,7 +261,7 @@ class FrontendPPTGenerationRequest(BaseModel):
 
 
 class FrontendPPTExportRequest(BaseModel):
-    """Export frontend slides into screenshot-based PPTX/PDF."""
+    """Legacy screenshot fallback for frontend slides; Canvas PPTX is generated in the browser."""
 
     result_path: str
     slides: str
diff --git a/fastapi_app/services/paper2ppt_frontend_service.py b/fastapi_app/services/paper2ppt_frontend_service.py
index 61b6e421..1191343a 100644
--- a/fastapi_app/services/paper2ppt_frontend_service.py
+++ b/fastapi_app/services/paper2ppt_frontend_service.py
@@ -54,6 +54,70 @@
 _DEFAULT_VISUAL_KEY = "main_visual"
 _DEFAULT_VISUAL_KEYS = ("main_visual", "secondary_visual")
 _MAX_INLINE_VISUAL_ASSETS = 2
+_SLIDE_SCHEMA_VERSION = "frontend_slide_schema_v2"
+_CANVAS_SCHEMA_VERSION = "ppt_canvas_schema_v1"
+_CANVAS_VISUAL_SPEC_VERSION = "ppt_canvas_visual_spec_v1" +_CANVAS_LAYOUT_IR_VERSION = "ppt_layout_ir_v1" +_ALLOWED_BLOCK_TYPES = {"text", "list", "image", "quote", "stat", "callout", "table"} +_ALLOWED_CANVAS_NODE_TYPES = {"container", "component"} +_ALLOWED_CANVAS_COMPONENTS = { + "heading", + "text", + "bullets", + "quote", + "stat", + "callout", + "figure", + "table", + "placeholder", +} +_ALLOWED_LAYOUT_ZONES = {"header", "main", "aside", "footer", "full", "left", "right"} +_ALLOWED_WIDTH_HINTS = {"full", "wide", "half", "third", "narrow", "auto"} +_ALLOWED_SIDE_HINTS = {"left", "right", "center", "auto"} +_ALLOWED_EMPHASIS_HINTS = {"high", "medium", "low"} +_SUPPORTED_SCHEMA_TEMPLATE_KEYS = ( + "title_cover", + "section_divider", + "text_focus", + "hero_visual", + "split_media", + "visual_compare", + "insight_grid", + "metrics_dashboard", + "timeline_overview", + "stacked_cards", + "quote_focus", + "dual_list", +) +_SCHEMA_TEMPLATE_ALIASES = { + "cover": "title_cover", + "cover_slide": "title_cover", + "title_slide": "title_cover", + "divider": "section_divider", + "section": "section_divider", + "section_break": "section_divider", + "text_only": "text_focus", + "text_heavy": "text_focus", + "hero": "hero_visual", + "single_visual": "hero_visual", + "media_split": "split_media", + "split_layout": "split_media", + "compare": "visual_compare", + "comparison": "visual_compare", + "image_compare": "visual_compare", + "grid": "insight_grid", + "card_grid": "insight_grid", + "dashboard": "metrics_dashboard", + "metrics": "metrics_dashboard", + "timeline": "timeline_overview", + "process_timeline": "timeline_overview", + "cards": "stacked_cards", + "card_stack": "stacked_cards", + "quote": "quote_focus", + "quote_slide": "quote_focus", + "two_lists": "dual_list", + "dual_column_list": "dual_list", +} _PREVIEW_MAX_SIDE = 1280 _PREVIEW_SMALL_FILE_BYTES = 900 * 1024 _PREVIEW_JPEG_QUALITY = 82 @@ -130,6 +194,7 @@ async def generate_slides( ) 
self._write_slide_spec(slides_dir, generated_slide) self._sync_deck_manifest(slides_dir) + self._write_raw_ai_manifest(slides_dir, [generated_slide]) response_slide = self._externalize_slide_assets(generated_slide, request, base_dir=base_dir) return { "success": True, @@ -202,6 +267,7 @@ async def generate_slides( for slide in ordered_slides: self._write_slide_spec(slides_dir, slide) self._sync_deck_manifest(slides_dir) + self._write_raw_ai_manifest(slides_dir, ordered_slides) response_slides = [self._externalize_slide_assets(slide, request, base_dir=base_dir) for slide in ordered_slides] return { @@ -489,6 +555,7 @@ async def _generate_single_slide( theme=theme, visual_assets=visual_assets, ) + normalized["_raw_ai_payload"] = raw_payload return normalized except Exception as exc: # noqa: BLE001 log.warning( @@ -499,6 +566,7 @@ async def _generate_single_slide( fallback_slide["generation_note"] = ( f"Fallback template used because frontend code generation failed: {exc}" ) + fallback_slide["_raw_ai_payload"] = {"error": str(exc), "fallback": True} return fallback_slide async def _call_llm_json( @@ -1174,34 +1242,1768 @@ def _build_theme_messages( "section_label_template": "Slide {page_num:02d}/{slide_count:02d}" } -Requirements: -1. Theme must fit text-first academic slides on a 1600x900 canvas. -2. Use restrained, professional colors and a single coherent component language. -3. Keep typography practical. Titles should stay below 60px, body text below 28px. -4. Avoid references to images, charts, SVG, or external assets. -5. Optimize for consistency across all slides in the same deck. -6. The theme_lock must be concrete enough to prevent per-slide drift during later regeneration. -7. If style_prompt contains explicit color or material directions, translate them into the palette instead of ignoring them. -8. Do not default to cyan/teal accents unless the style_prompt clearly asks for them. -""".strip() +Requirements: +1. 
Theme must fit text-first academic slides on a 1600x900 canvas. +2. Use restrained, professional colors and a single coherent component language. +3. Keep typography practical. Titles should stay below 60px, body text below 28px. +4. Avoid references to images, charts, SVG, or external assets. +5. Optimize for consistency across all slides in the same deck. +6. The theme_lock must be concrete enough to prevent per-slide drift during later regeneration. +7. If style_prompt contains explicit color or material directions, translate them into the palette instead of ignoring them. +8. Do not default to cyan/teal accents unless the style_prompt clearly asks for them. +""".strip() + + user_payload = { + "language": language, + "style_prompt": style or "", + "outline_summary": outline_summary, + } + + return [ + {"role": "system", "content": system_prompt}, + { + "role": "user", + "content": ( + "Create one deck theme for this outline summary:\n\n" + f"{json.dumps(user_payload, ensure_ascii=False, indent=2)}" + ), + }, + ] + + def _normalize_template_key( + self, + raw_value: Any, + *, + blocks: Sequence[Dict[str, Any]], + visual_assets: Sequence[Dict[str, Any]], + ) -> str: + candidate = self._slugify(raw_value or "") + if candidate in _SUPPORTED_SCHEMA_TEMPLATE_KEYS: + return candidate + if candidate in _SCHEMA_TEMPLATE_ALIASES: + return _SCHEMA_TEMPLATE_ALIASES[candidate] + + image_count = sum(1 for block in blocks if str(block.get("type") or "") == "image") + list_count = sum(1 for block in blocks if str(block.get("type") or "") == "list") + stat_count = sum(1 for block in blocks if str(block.get("type") or "") == "stat") + quote_count = sum(1 for block in blocks if str(block.get("type") or "") == "quote") + has_visual_assets = bool(visual_assets) + block_count = len(blocks) + + if quote_count > 0: + return "quote_focus" + if image_count >= 2: + return "visual_compare" + if stat_count >= 2: + return "metrics_dashboard" + if list_count >= 2: + return "dual_list" + if 
image_count == 1 and list_count: + return "split_media" + if image_count == 1 or has_visual_assets: + return "hero_visual" + if block_count <= 3: + return "section_divider" + if block_count >= 6: + return "insight_grid" + return "text_focus" + + def _normalize_layout_mode(self, raw_value: Any) -> str: + text = str(raw_value or "").strip().lower() + if text in {"fluid", "hybrid", "fixed"}: + return text + return "fluid" + + def _normalize_block_type(self, raw_value: Any) -> str: + text = str(raw_value or "").strip().lower() + aliases = { + "textarea": "text", + "paragraph": "text", + "body": "text", + "bullet_list": "list", + "bullets": "list", + "points": "list", + "visual": "image", + "figure": "image", + "chart": "image", + "metric": "stat", + "number": "stat", + "note": "callout", + } + normalized = aliases.get(text, text or "text") + return normalized if normalized in _ALLOWED_BLOCK_TYPES else "text" + + def _normalize_table_data(self, raw_value: Any) -> Optional[Dict[str, List[Any]]]: + if not isinstance(raw_value, dict): + return None + raw_headers = raw_value.get("headers") or raw_value.get("columns") or raw_value.get("cols") or [] + raw_rows = raw_value.get("rows") or raw_value.get("data") or raw_value.get("values") or [] + headers = [str(item).strip() for item in raw_headers if str(item).strip()] if isinstance(raw_headers, list) else [] + rows: List[List[str]] = [] + if isinstance(raw_rows, list): + for raw_row in raw_rows: + if not isinstance(raw_row, list): + continue + row = [str(cell).strip() for cell in raw_row] + if row: + rows.append(row) + max_columns = max([len(headers), *[len(row) for row in rows], 0]) + if max_columns <= 0: + return None + normalized_headers = [ + headers[index] if index < len(headers) and headers[index] else f"Column {index + 1}" + for index in range(max_columns) + ] + normalized_rows = [ + [ + row[index] if index < len(row) else "" + for index in range(max_columns) + ] + for row in rows + ] or [["" for _ in 
range(max_columns)]] + return { + "headers": normalized_headers, + "rows": normalized_rows, + } + + def _default_zone_for_block( + self, + *, + block_type: str, + role: str, + has_visual_assets: bool, + ) -> str: + if role in {"eyebrow", "title"}: + return "header" + if role in {"footer"}: + return "footer" + if block_type == "image": + return "aside" if has_visual_assets else "main" + if role in {"takeaway", "stat", "callout"}: + return "aside" if has_visual_assets else "main" + return "main" + + def _default_span_for_block( + self, + *, + block_type: str, + zone: str, + has_visual_assets: bool, + ) -> int: + if zone in {"header", "footer", "full"}: + return 12 + if block_type == "image": + return 6 if has_visual_assets else 12 + if zone in {"aside", "right", "left"}: + return 5 if block_type in {"stat", "callout"} else 6 + if block_type == "list": + return 6 if has_visual_assets else 12 + return 7 if has_visual_assets else 12 + + def _normalize_layout_hint( + self, + raw_layout: Any, + *, + block_type: str, + role: str, + order: int, + has_visual_assets: bool, + ) -> Dict[str, Any]: + layout = raw_layout if isinstance(raw_layout, dict) else {} + zone = str( + layout.get("zone") + or layout.get("slot") + or layout.get("region") + or layout.get("area") + or self._default_zone_for_block( + block_type=block_type, + role=role, + has_visual_assets=has_visual_assets, + ) + ).strip().lower() + if zone not in _ALLOWED_LAYOUT_ZONES: + zone = self._default_zone_for_block( + block_type=block_type, + role=role, + has_visual_assets=has_visual_assets, + ) + + try: + span = int(layout.get("span") or layout.get("columns") or 0) + except (TypeError, ValueError): + span = 0 + if span <= 0: + span = self._default_span_for_block( + block_type=block_type, + zone=zone, + has_visual_assets=has_visual_assets, + ) + span = max(1, min(12, span)) + + try: + normalized_order = int(layout.get("order") or order) + except (TypeError, ValueError): + normalized_order = order + normalized_order = 
max(1, normalized_order) + + preferred_width = str( + layout.get("preferred_width") + or layout.get("preferredWidth") + or layout.get("width") + or "" + ).strip().lower() + if preferred_width not in _ALLOWED_WIDTH_HINTS: + if span >= 12: + preferred_width = "full" + elif span >= 8: + preferred_width = "wide" + elif span >= 6: + preferred_width = "half" + elif span >= 4: + preferred_width = "third" + else: + preferred_width = "auto" + + preferred_side = str( + layout.get("preferred_side") + or layout.get("preferredSide") + or layout.get("side") + or "" + ).strip().lower() + if preferred_side not in _ALLOWED_SIDE_HINTS: + if zone in {"left"}: + preferred_side = "left" + elif zone in {"right", "aside"}: + preferred_side = "right" + else: + preferred_side = "auto" + + emphasis = str(layout.get("emphasis") or "").strip().lower() + if emphasis not in _ALLOWED_EMPHASIS_HINTS: + emphasis = "high" if role in {"title", "main_visual"} else "medium" if role in {"summary", "key_points"} else "low" + + return { + "zone": zone, + "span": span, + "order": normalized_order, + "preferred_width": preferred_width, + "preferred_side": preferred_side, + "emphasis": emphasis, + } + + def _normalize_blocks( + self, + raw_blocks: Any, + *, + outline_item: Dict[str, Any], + visual_assets: Sequence[Dict[str, Any]], + ) -> List[Dict[str, Any]]: + if not isinstance(raw_blocks, list): + return [] + + outline_title = str(outline_item.get("title") or "").strip() + outline_points = [ + str(item).strip() + for item in (outline_item.get("key_points") or []) + if str(item).strip() + ] + layout_description = str(outline_item.get("layout_description") or "").strip() + available_asset_keys = [ + self._slugify(asset.get("key") or "") + for asset in visual_assets + if isinstance(asset, dict) and self._slugify(asset.get("key") or "") + ] + has_visual_assets = bool(available_asset_keys) + used_asset_keys: list[str] = [] + normalized: List[Dict[str, Any]] = [] + seen_ids: set[str] = set() + + def 
_pick_asset_key(preferred: str = "") -> str: + preferred_key = self._slugify(preferred or "") + if preferred_key in available_asset_keys: + if preferred_key not in used_asset_keys: + used_asset_keys.append(preferred_key) + return preferred_key + + for key in available_asset_keys: + if key not in used_asset_keys: + used_asset_keys.append(key) + return key + return available_asset_keys[0] if available_asset_keys else "" + + for index, raw_block in enumerate(raw_blocks): + if not isinstance(raw_block, dict): + continue + + raw_id = ( + raw_block.get("id") + or raw_block.get("key") + or raw_block.get("field_key") + or raw_block.get("fieldKey") + or raw_block.get("role") + or f"block_{index + 1}" + ) + block_id = self._slugify(raw_id) or f"block_{index + 1}" + if block_id in seen_ids: + block_id = f"{block_id}_{index + 1}" + seen_ids.add(block_id) + + block_type = self._normalize_block_type( + raw_block.get("type") + or raw_block.get("block_type") + or raw_block.get("blockType") + or raw_block.get("kind") + ) + role = self._slugify( + raw_block.get("role") + or raw_block.get("semantic_role") + or raw_block.get("semanticRole") + or block_id + ) or block_id + + items = [ + str(item).strip() + for item in ( + raw_block.get("items") + or raw_block.get("bullets") + or raw_block.get("points") + or [] + ) + if str(item).strip() + ] + content = str( + raw_block.get("content") + or raw_block.get("text") + or raw_block.get("value") + or raw_block.get("body") + or "" + ).strip() + + if block_type == "list" and not items and content: + items = [line.strip(" -\u2022") for line in content.splitlines() if line.strip(" -\u2022")] + if block_type != "list" and not content and items: + content = " ".join(items) + table_data = self._normalize_table_data( + raw_block.get("table_data") + or raw_block.get("tableData") + or raw_block.get("table") + or {} + ) if block_type == "table" else None + + asset_key = self._slugify( + raw_block.get("asset_key") + or raw_block.get("assetKey") + or 
raw_block.get("image_key") + or raw_block.get("imageKey") + or raw_block.get("visual_key") + or raw_block.get("visualKey") + or "" + ) + if block_type == "image": + asset_key = _pick_asset_key(asset_key) + if not asset_key: + continue + else: + asset_key = "" + + if block_type == "list" and not items: + if role in {"key_points", "bullets"} and outline_points: + items = outline_points[:4] + else: + continue + if block_type == "table" and not table_data: + continue + if block_type != "image" and block_type != "table" and not content and not items: + continue + + normalized_block = { + "id": block_id, + "type": block_type, + "role": role, + "content": content, + "items": items, + "asset_key": asset_key, + "layout": self._normalize_layout_hint( + raw_block.get("layout") or raw_block.get("layout_hint") or raw_block.get("layoutHint"), + block_type=block_type, + role=role, + order=index + 1, + has_visual_assets=has_visual_assets, + ), + } + if table_data: + normalized_block["table_data"] = table_data + normalized.append(normalized_block) + + if outline_title and not any(str(block.get("role") or "") == "title" for block in normalized): + normalized.insert( + 0, + { + "id": "title", + "type": "text", + "role": "title", + "content": outline_title, + "items": [], + "asset_key": "", + "layout": self._normalize_layout_hint( + {"zone": "header", "span": 12, "order": 1, "preferred_width": "full", "emphasis": "high"}, + block_type="text", + role="title", + order=1, + has_visual_assets=has_visual_assets, + ), + }, + ) + + if outline_points and not any(str(block.get("type") or "") == "list" for block in normalized): + normalized.append( + { + "id": "key_points", + "type": "list", + "role": "key_points", + "content": "", + "items": outline_points[:4], + "asset_key": "", + "layout": self._normalize_layout_hint( + {"zone": "main", "span": 6 if has_visual_assets else 12, "preferred_width": "wide"}, + block_type="list", + role="key_points", + order=len(normalized) + 1, + 
has_visual_assets=has_visual_assets, + ), + } + ) + + if (layout_description or outline_points) and not any( + str(block.get("role") or "") in {"summary", "body"} + for block in normalized + ): + normalized.append( + { + "id": "summary", + "type": "text", + "role": "summary", + "content": outline_points[0] if outline_points else layout_description, + "items": [], + "asset_key": "", + "layout": self._normalize_layout_hint( + {"zone": "main", "span": 7 if has_visual_assets else 12, "preferred_width": "wide"}, + block_type="text", + role="summary", + order=len(normalized) + 1, + has_visual_assets=has_visual_assets, + ), + } + ) + + if has_visual_assets and not any(str(block.get("type") or "") == "image" for block in normalized): + normalized.append( + { + "id": self._slugify(available_asset_keys[0]) or _DEFAULT_VISUAL_KEY, + "type": "image", + "role": "main_visual", + "content": "", + "items": [], + "asset_key": _pick_asset_key(available_asset_keys[0]), + "layout": self._normalize_layout_hint( + {"zone": "aside", "span": 6, "preferred_side": "right", "emphasis": "high"}, + block_type="image", + role="main_visual", + order=len(normalized) + 1, + has_visual_assets=has_visual_assets, + ), + } + ) + + normalized = sorted( + normalized[:8], + key=lambda item: int(((item.get("layout") or {}).get("order")) or 0), + ) + for index, block in enumerate(normalized, start=1): + layout = dict(block.get("layout") or {}) + layout["order"] = index + block["layout"] = layout + return normalized + + def _derive_fields_from_blocks( + self, + blocks: Sequence[Dict[str, Any]], + ) -> List[Dict[str, Any]]: + normalized: List[Dict[str, Any]] = [] + seen_keys: set[str] = set() + + preferred_keys = { + "title": "title", + "summary": "summary", + "key_points": "key_points", + "takeaway": "takeaway", + "footer": "footer", + "eyebrow": "eyebrow", + } + + for block in blocks: + if not isinstance(block, dict): + continue + block_type = str(block.get("type") or "").strip().lower() + if block_type == 
"image": + continue + + role = self._slugify(block.get("role") or "") or self._slugify(block.get("id") or "") + key = preferred_keys.get(role, self._slugify(block.get("id") or role)) + if not key or key in seen_keys: + continue + + label = str(block.get("label") or key.replace("_", " ").title()).strip() + if block_type == "table": + table_data = self._normalize_table_data(block.get("table_data") or block.get("tableData") or {}) + if not table_data: + continue + for col_index, header in enumerate(table_data["headers"]): + field_key = f"{key}_cell_h_{col_index}" + if field_key in seen_keys: + continue + normalized.append( + { + "key": field_key, + "label": f"{label} Header {col_index + 1}", + "type": "text", + "value": str(header), + "items": [], + } + ) + seen_keys.add(field_key) + for row_index, row in enumerate(table_data["rows"]): + for col_index, cell in enumerate(row): + field_key = f"{key}_cell_{row_index}_{col_index}" + if field_key in seen_keys: + continue + normalized.append( + { + "key": field_key, + "label": f"{label} R{row_index + 1}C{col_index + 1}", + "type": "text", + "value": str(cell), + "items": [], + } + ) + seen_keys.add(field_key) + continue + if block_type == "list": + items = [ + str(item).strip() + for item in (block.get("items") or []) + if str(item).strip() + ] + if not items: + continue + normalized.append( + { + "key": key, + "label": label, + "type": "list", + "value": "", + "items": items, + } + ) + else: + value = str(block.get("content") or "").strip() + if not value: + continue + field_type = "text" if role in {"title", "eyebrow", "footer"} else "textarea" if len(value) > 80 or "\n" in value else "text" + normalized.append( + { + "key": key, + "label": label, + "type": field_type, + "value": value, + "items": [], + } + ) + seen_keys.add(key) + + return normalized + + def _derive_fields_from_canvas_content( + self, + content: Dict[str, Any], + referenced_keys: Optional[set[str]] = None, + ) -> List[Dict[str, Any]]: + normalized: 
List[Dict[str, Any]] = [] + seen_keys: set[str] = set() + allowed_keys = referenced_keys or set() + + for raw_key, raw_value in content.items(): + key = self._slugify(raw_key) + if not key or key == "assets" or key in seen_keys: + continue + if allowed_keys and key not in allowed_keys: + continue + label = key.replace("_", " ").title() + + table_data = self._normalize_table_data(raw_value) + if table_data: + for col_index, header in enumerate(table_data["headers"]): + field_key = f"{key}_cell_h_{col_index}" + normalized.append( + { + "key": field_key, + "label": f"{label} Header {col_index + 1}", + "type": "text", + "value": str(header), + "items": [], + } + ) + seen_keys.add(field_key) + for row_index, row in enumerate(table_data["rows"]): + for col_index, cell in enumerate(row): + field_key = f"{key}_cell_{row_index}_{col_index}" + normalized.append( + { + "key": field_key, + "label": f"{label} R{row_index + 1}C{col_index + 1}", + "type": "text", + "value": str(cell), + "items": [], + } + ) + seen_keys.add(field_key) + seen_keys.add(key) + continue + + if isinstance(raw_value, list): + items = [ + str(item).strip() + for item in raw_value + if isinstance(item, (str, int, float)) and str(item).strip() + ] + if items: + normalized.append( + { + "key": key, + "label": label, + "type": "list", + "value": "", + "items": items, + } + ) + seen_keys.add(key) + continue + + if isinstance(raw_value, (str, int, float)): + value = str(raw_value).strip() + if not value: + continue + field_type = "text" if key in {"title", "eyebrow", "footer"} else "textarea" if len(value) > 80 or "\n" in value else "text" + normalized.append( + { + "key": key, + "label": label, + "type": field_type, + "value": value, + "items": [], + } + ) + seen_keys.add(key) + + return normalized + + def _merge_editable_fields( + self, + *, + base_fields: Sequence[Dict[str, Any]], + override_fields: Sequence[Dict[str, Any]], + ) -> List[Dict[str, Any]]: + merged: Dict[str, Dict[str, Any]] = {} + order: 
List[str] = [] + + for field_group in (base_fields, override_fields): + for raw_field in field_group: + if not isinstance(raw_field, dict): + continue + key = str(raw_field.get("key") or "").strip() + if not key: + continue + if key not in order: + order.append(key) + merged[key] = { + "key": key, + "label": str(raw_field.get("label") or key.replace("_", " ").title()).strip(), + "type": str(raw_field.get("type") or "text").strip(), + "value": str(raw_field.get("value") or "").strip(), + "items": [ + str(item).strip() + for item in (raw_field.get("items") or []) + if str(item).strip() + ], + } + + return [merged[key] for key in order if key in merged] + + def _build_canvas_content( + self, + *, + slide: Dict[str, Any], + visual_assets: Sequence[Dict[str, Any]], + ) -> Dict[str, Any]: + content: Dict[str, Any] = {} + table_groups: Dict[str, Dict[str, Any]] = {} + + def _get_table_group(owner_id: str) -> Dict[str, Any]: + table = table_groups.setdefault(owner_id, {"headers": [], "rows": []}) + if not isinstance(table.get("headers"), list): + table["headers"] = [] + if not isinstance(table.get("rows"), list): + table["rows"] = [] + return table + + def _set_table_cell(owner_id: str, row_index: Any, col_index: int, value: str) -> None: + table = _get_table_group(owner_id) + headers = table["headers"] + rows = table["rows"] + if row_index == "h": + while len(headers) <= col_index: + headers.append(f"Column {len(headers) + 1}") + headers[col_index] = value + return + if not isinstance(row_index, int) or row_index < 0: + return + while len(rows) <= row_index: + rows.append([]) + current_row = rows[row_index] + if not isinstance(current_row, list): + current_row = [] + rows[row_index] = current_row + while len(current_row) <= col_index: + current_row.append("") + current_row[col_index] = value + while len(headers) <= col_index: + headers.append(f"Column {len(headers) + 1}") + + for field in slide.get("editable_fields") or []: + if not isinstance(field, dict): + continue + 
key = self._slugify(field.get("key") or "") + if not key: + continue + match = re.match(r"^(.+)_cell_(h|\d+)_(\d+)$", key) + if match: + owner_id = match.group(1) + row_token = match.group(2) + row_index: Any = "h" if row_token == "h" else int(row_token) + col_index = int(match.group(3)) + _set_table_cell(owner_id, row_index, col_index, str(field.get("value") or "").strip()) + continue + if str(field.get("type") or "") == "list": + content[key] = [ + str(item).strip() + for item in (field.get("items") or []) + if str(item).strip() + ] + else: + content[key] = str(field.get("value") or "").strip() + + for owner_id, table_data in table_groups.items(): + content[owner_id] = { + "headers": [str(item).strip() for item in table_data.get("headers") or []], + "rows": [ + [str(cell).strip() for cell in row] + for row in (table_data.get("rows") or []) + if isinstance(row, list) + ], + } + + for block in slide.get("blocks") or []: + if not isinstance(block, dict): + continue + block_type = str(block.get("type") or "").strip().lower() + key = self._slugify(block.get("role") or block.get("id") or "") + if not key: + continue + if block_type == "table": + table_data = self._normalize_table_data(block.get("table_data") or block.get("tableData") or block.get("table") or {}) + if table_data: + content[key] = table_data + elif block_type == "list" and key not in content: + items = [ + str(item).strip() + for item in (block.get("items") or []) + if str(item).strip() + ] + if items: + content[key] = items + elif key not in content: + value = str(block.get("content") or "").strip() + if value: + content[key] = value + + assets: Dict[str, Dict[str, Any]] = {} + for asset in visual_assets: + if not isinstance(asset, dict): + continue + key = self._slugify(asset.get("key") or "") + if not key: + continue + assets[key] = { + "type": "image", + "asset_key": key, + "src": str(asset.get("src") or "").strip(), + "preview_src": str(asset.get("preview_src") or asset.get("previewSrc") or 
asset.get("src") or "").strip(), + "original_src": str(asset.get("original_src") or asset.get("originalSrc") or asset.get("storage_path") or "").strip(), + "alt": str(asset.get("alt") or asset.get("label") or key).strip(), + } + content["assets"] = assets + return content + + def _clean_canvas_visual_number( + self, + value: Any, + *, + min_value: float | None = None, + max_value: float | None = None, + ) -> float | int | None: + try: + parsed = float(value) + except Exception: # noqa: BLE001 + return None + if min_value is not None: + parsed = max(min_value, parsed) + if max_value is not None: + parsed = min(max_value, parsed) + return int(parsed) if parsed.is_integer() else parsed + + def _normalize_canvas_visual_style(self, raw_style: Any) -> Dict[str, Any]: + if not isinstance(raw_style, dict): + return {} + style: Dict[str, Any] = {} + + fill = str( + raw_style.get("fill") + or raw_style.get("background") + or raw_style.get("backgroundColor") + or raw_style.get("background_color") + or "" + ).strip() + if fill: + style["fill"] = fill + + color = str(raw_style.get("color") or raw_style.get("textColor") or raw_style.get("text_color") or "").strip() + if color: + style["color"] = color + + border_color = str(raw_style.get("borderColor") or raw_style.get("border_color") or "").strip() + if border_color: + style["border_color"] = border_color + + numeric_fields = { + "border_width": ("borderWidth", "border_width", 0, 12), + "radius": ("radius", "borderRadius", "border_radius", 0, 96), + "padding": ("padding", "padding_px", 0, 96), + "font_size": ("fontSize", "font_size", 8, 96), + "line_height": ("lineHeight", "line_height", 8, 140), + "opacity": ("opacity", "alpha", 0, 1), + } + for output_key, candidates in numeric_fields.items(): + min_value = candidates[-2] + max_value = candidates[-1] + value = None + for candidate in candidates[:-2]: + if candidate in raw_style: + value = raw_style.get(candidate) + break + cleaned = self._clean_canvas_visual_number(value, 
min_value=min_value, max_value=max_value) + if cleaned is not None: + style[output_key] = cleaned + + font_family = str(raw_style.get("fontFamily") or raw_style.get("font_family") or "").strip() + if font_family: + style["font_family"] = font_family + + if raw_style.get("fontWeight") is not None or raw_style.get("font_weight") is not None: + font_weight = raw_style.get("fontWeight", raw_style.get("font_weight")) + style["font_weight"] = str(font_weight).strip() + + font_style = str(raw_style.get("fontStyle") or raw_style.get("font_style") or "").strip().lower() + if font_style in {"normal", "italic"}: + style["font_style"] = font_style + + text_align = str(raw_style.get("textAlign") or raw_style.get("text_align") or "").strip().lower() + if text_align in {"left", "center", "right", "justify"}: + style["text_align"] = text_align + + image_fit = str(raw_style.get("imageFit") or raw_style.get("image_fit") or "").strip().lower() + if image_fit in {"contain", "cover", "fill"}: + style["image_fit"] = image_fit + + emphasis = str(raw_style.get("emphasis") or "").strip().lower() + if emphasis in _ALLOWED_EMPHASIS_HINTS: + style["emphasis"] = emphasis + + return style + + def _normalize_canvas_visual_spec(self, raw_spec: Any) -> Dict[str, Any]: + if not isinstance(raw_spec, dict): + return {} + normalized: Dict[str, Any] = {"version": _CANVAS_VISUAL_SPEC_VERSION} + + palette_source = raw_spec.get("palette") if isinstance(raw_spec.get("palette"), dict) else {} + palette: Dict[str, str] = {} + for key in ("bg", "panel", "primary", "secondary", "accent", "text", "muted"): + value = str(palette_source.get(key) or "").strip() + if value: + palette[key] = value + if palette: + normalized["palette"] = palette + + typography_source = raw_spec.get("typography") if isinstance(raw_spec.get("typography"), dict) else {} + typography: Dict[str, Any] = {} + title_font = str(typography_source.get("title_font_stack") or typography_source.get("titleFontStack") or "").strip() + body_font = 
str(typography_source.get("body_font_stack") or typography_source.get("bodyFontStack") or "").strip() + if title_font: + typography["title_font_stack"] = title_font + if body_font: + typography["body_font_stack"] = body_font + for output_key, candidates in { + "eyebrow_size": ("eyebrow_size", "eyebrowSize", 8, 32), + "title_size": ("title_size", "titleSize", 24, 78), + "summary_size": ("summary_size", "summarySize", 14, 44), + "body_size": ("body_size", "bodySize", 12, 36), + }.items(): + min_value = candidates[-2] + max_value = candidates[-1] + value = next((typography_source.get(candidate) for candidate in candidates[:-2] if candidate in typography_source), None) + cleaned = self._clean_canvas_visual_number(value, min_value=min_value, max_value=max_value) + if cleaned is not None: + typography[output_key] = cleaned + if typography: + normalized["typography"] = typography + + surface_source = raw_spec.get("surface") if isinstance(raw_spec.get("surface"), dict) else {} + surface: Dict[str, Any] = {} + for output_key, candidates in { + "background": ("background",), + "panel": ("panel",), + "primary": ("primary",), + "secondary": ("secondary",), + "accent": ("accent",), + "text": ("text",), + "muted": ("muted",), + }.items(): + value = str(next((surface_source.get(candidate) for candidate in candidates if candidate in surface_source), "") or "").strip() + if value: + surface[output_key] = value + for output_key, candidates in { + "card_radius": ("card_radius", "cardRadius", 0, 64), + "card_padding": ("card_padding", "cardPadding", 0, 72), + "section_gap": ("section_gap", "sectionGap", 0, 72), + }.items(): + min_value = candidates[-2] + max_value = candidates[-1] + value = next((surface_source.get(candidate) for candidate in candidates[:-2] if candidate in surface_source), None) + cleaned = self._clean_canvas_visual_number(value, min_value=min_value, max_value=max_value) + if cleaned is not None: + surface[output_key] = cleaned + if surface: + normalized["surface"] = 
surface + + layout_source = raw_spec.get("layout") if isinstance(raw_spec.get("layout"), dict) else {} + layout: Dict[str, Any] = {} + for output_key, candidates in { + "safe_margin": ("safe_margin", "safeMargin", 0, 120), + "section_gap": ("section_gap", "sectionGap", 0, 72), + "content_gap": ("content_gap", "contentGap", 0, 72), + "max_columns": ("max_columns", "maxColumns", 1, 4), + }.items(): + min_value = candidates[-2] + max_value = candidates[-1] + value = next((layout_source.get(candidate) for candidate in candidates[:-2] if candidate in layout_source), None) + cleaned = self._clean_canvas_visual_number(value, min_value=min_value, max_value=max_value) + if cleaned is not None: + layout[output_key] = cleaned + if layout: + normalized["layout"] = layout + + node_styles_source = raw_spec.get("node_styles") or raw_spec.get("nodeStyles") + if isinstance(node_styles_source, dict): + node_styles: Dict[str, Any] = {} + for raw_key, raw_style in node_styles_source.items(): + key = self._slugify(raw_key) + style = self._normalize_canvas_visual_style(raw_style) + if key and style: + node_styles[key] = style + if node_styles: + normalized["node_styles"] = node_styles + + component_styles_source = raw_spec.get("component_styles") or raw_spec.get("componentStyles") + if isinstance(component_styles_source, dict): + component_styles: Dict[str, Any] = {} + for raw_key, raw_style in component_styles_source.items(): + component = self._normalize_canvas_component_name(raw_key) + style = self._normalize_canvas_visual_style(raw_style) + if component and style: + component_styles[component] = style + if component_styles: + normalized["component_styles"] = component_styles + + return normalized if len(normalized) > 1 else {} + + def _build_canvas_visual_spec(self, *, theme: Dict[str, Any], has_visual_assets: bool = False) -> Dict[str, Any]: + fallback_theme = self._build_fallback_theme(language="zh", style="") + palette = theme.get("palette") if isinstance(theme.get("palette"), 
dict) else fallback_theme["palette"] + typography = theme.get("typography") if isinstance(theme.get("typography"), dict) else fallback_theme["typography"] + raw_spec = { + "version": _CANVAS_VISUAL_SPEC_VERSION, + "palette": { + "bg": palette.get("bg"), + "panel": palette.get("panel"), + "primary": palette.get("primary"), + "secondary": palette.get("secondary"), + "accent": palette.get("accent"), + "text": palette.get("text"), + "muted": palette.get("muted"), + }, + "typography": { + "title_font_stack": typography.get("title_font_stack"), + "body_font_stack": typography.get("body_font_stack"), + "eyebrow_size": typography.get("eyebrow_size"), + "title_size": typography.get("title_size"), + "summary_size": typography.get("summary_size"), + "body_size": typography.get("body_size"), + }, + "surface": { + "background": palette.get("bg"), + "panel": palette.get("panel"), + "primary": palette.get("primary"), + "secondary": palette.get("secondary"), + "accent": palette.get("accent"), + "text": palette.get("text"), + "muted": palette.get("muted"), + "card_radius": 22 if has_visual_assets else 24, + "card_padding": 22, + "section_gap": 22, + }, + "layout": { + "safe_margin": 62, + "section_gap": 22, + "content_gap": 18, + "max_columns": 2, + }, + "component_styles": { + "heading": { + "font_family": typography.get("title_font_stack"), + "font_size": typography.get("title_size"), + "font_weight": 700, + "color": palette.get("text"), + }, + "text": { + "font_family": typography.get("body_font_stack"), + "font_size": typography.get("body_size"), + "color": palette.get("text"), + }, + "bullets": { + "font_family": typography.get("body_font_stack"), + "font_size": typography.get("body_size"), + "color": palette.get("text"), + }, + "callout": { + "fill": palette.get("panel"), + "border_color": palette.get("accent"), + "font_size": typography.get("body_size"), + "color": palette.get("text"), + }, + "figure": { + "fill": palette.get("panel"), + "border_color": 
palette.get("primary"), + "image_fit": "contain", + }, + "table": { + "fill": palette.get("panel"), + "border_color": palette.get("primary"), + "font_size": max(14, int(typography.get("body_size") or 24) - 6), + "color": palette.get("text"), + }, + }, + } + return self._normalize_canvas_visual_spec(raw_spec) + + def _normalize_canvas_component_name(self, raw_value: Any) -> str: + text = self._slugify(raw_value or "") + aliases = { + "h1": "heading", + "h2": "heading", + "title": "heading", + "subtitle": "text", + "paragraph": "text", + "body": "text", + "body_text": "text", + "bullet_list": "bullets", + "bullet_points": "bullets", + "key_points": "bullets", + "list": "bullets", + "points": "bullets", + "image": "figure", + "visual": "figure", + "chart": "figure", + "diagram": "figure", + "table_card": "table", + "data_table": "table", + "metric": "stat", + "number": "stat", + "kpi": "stat", + "card": "callout", + "note": "callout", + "insight": "callout", + "timeline": "bullets", + "timeline_item": "text", + } + normalized = aliases.get(text, text or "placeholder") + return normalized if normalized in _ALLOWED_CANVAS_COMPONENTS else "text" + + def _normalize_canvas_node_tree(self, node: Any) -> Any: + if not isinstance(node, dict): + return node + normalized = dict(node) + node_type = str(normalized.get("type") or "").strip().lower() + if node_type == "component": + props = normalized.get("props") if isinstance(normalized.get("props"), dict) else {} + normalized["component"] = self._normalize_canvas_component_name( + normalized.get("component") or props.get("component") or props.get("kind") + ) + children = normalized.get("children") + if isinstance(children, list): + normalized["children"] = [self._normalize_canvas_node_tree(child) for child in children if isinstance(child, dict)] + return normalized + + def _component_for_block(self, block: Dict[str, Any]) -> str: + block_type = str(block.get("type") or "").strip().lower() + role = str(block.get("role") or 
"").strip().lower() + if role == "title": + return "heading" + if block_type == "list": + return "bullets" + if block_type == "image": + return "figure" + if block_type == "quote": + return "quote" + if block_type == "stat": + return "stat" + if block_type == "callout": + return "callout" + if block_type == "table": + return "table" + return "text" + + def _props_for_canvas_block(self, block: Dict[str, Any]) -> Dict[str, Any]: + component = self._component_for_block(block) + role = self._slugify(block.get("role") or block.get("id") or "") + block_id = self._slugify(block.get("id") or role) or role + ref = role or block_id + if component == "bullets": + if role in {"key_points", "points", "bullets", "main_points", "takeaways"} or block_id in {"key_points", "points", "bullets"}: + ref = "key_points" + return {"items_ref": ref} + if component == "figure": + asset_key = self._slugify(block.get("asset_key") or block.get("assetKey") or block_id) + return {"asset_ref": asset_key, "asset_key": asset_key, "fit": "contain"} + if component == "stat": + return {"value_ref": ref, "label": str(block.get("label") or role.replace("_", " ").title()).strip()} + if component == "table": + return {"table_ref": ref} + return {"text_ref": ref} + + def _build_canvas_root_from_blocks( + self, + *, + blocks: Sequence[Dict[str, Any]], + template_key: str, + ) -> Dict[str, Any]: + header: List[Dict[str, Any]] = [] + main_left: List[Dict[str, Any]] = [] + main_right: List[Dict[str, Any]] = [] + footer: List[Dict[str, Any]] = [] + + for block in blocks: + if not isinstance(block, dict): + continue + block_id = self._slugify(block.get("id") or block.get("role") or "") or "block" + layout = block.get("layout") if isinstance(block.get("layout"), dict) else {} + zone = str(layout.get("zone") or "main").strip().lower() + side = str(layout.get("preferred_side") or layout.get("preferredSide") or "").strip().lower() + component_node = { + "type": "component", + "id": block_id, + "component": 
self._component_for_block(block), + "props": self._props_for_canvas_block(block), + "style": {"emphasis": str(layout.get("emphasis") or "medium")}, + } + if zone == "header": + header.append(component_node) + elif zone == "footer": + footer.append(component_node) + elif zone in {"aside", "right"} or side == "right": + main_right.append(component_node) + else: + main_left.append(component_node) + + main_children: List[Dict[str, Any]] = [] + if main_left: + main_children.append( + { + "type": "container", + "id": "main_left", + "style": {"direction": "column", "gap": 18, "weight": 1, "align": "stretch"}, + "children": main_left, + } + ) + if main_right: + main_children.append( + { + "type": "container", + "id": "main_right", + "style": {"direction": "column", "gap": 18, "weight": 1, "align": "stretch"}, + "children": main_right, + } + ) + + children: List[Dict[str, Any]] = [] + if header: + children.append( + { + "type": "container", + "id": "header", + "style": {"direction": "column", "gap": 12, "align": "stretch"}, + "children": header, + } + ) + children.append( + { + "type": "container", + "id": "main", + "style": { + "direction": "row" if len(main_children) > 1 else "column", + "gap": 24, + "weight": 1, + "align": "stretch", + }, + "children": main_children or [ + { + "type": "component", + "id": "empty_main", + "component": "placeholder", + "props": {"text": "No content"}, + } + ], + } + ) + if footer: + children.append( + { + "type": "container", + "id": "footer", + "style": {"direction": "row", "gap": 16, "align": "end", "justify": "between"}, + "children": footer, + } + ) + + return { + "type": "container", + "id": "root", + "style": { + "direction": "column", + "gap": 24, + "padding": 0, + "align": "stretch", + "justify": "start", + }, + "children": children, + } + + def _collect_canvas_refs(self, node: Dict[str, Any], refs: List[Dict[str, str]], node_ids: set[str], issues: List[Dict[str, Any]]) -> None: + if not isinstance(node, dict): + return + node_id = 
self._slugify(node.get("id") or "") + if not node_id: + issues.append({"severity": "repairable", "code": "missing_node_id", "message": "Canvas node is missing id."}) + elif node_id in node_ids: + issues.append({"severity": "repairable", "code": "duplicate_node_id", "node_id": node_id, "message": f"Duplicate canvas node id: {node_id}"}) + else: + node_ids.add(node_id) + + if str(node.get("type") or "") == "component": + props = node.get("props") if isinstance(node.get("props"), dict) else {} + for key, value in props.items(): + if key.endswith("_ref") or key.endswith("Ref"): + ref = str(value or "").strip() + if ref: + refs.append({"node_id": node_id, "prop": key, "ref": ref}) + + children = node.get("children") + if isinstance(children, list): + for child in children: + self._collect_canvas_refs(child, refs, node_ids, issues) + + def _collect_canvas_referenced_keys(self, node: Dict[str, Any], keys: set[str]) -> None: + if not isinstance(node, dict): + return + if str(node.get("type") or "") == "component": + props = node.get("props") if isinstance(node.get("props"), dict) else {} + for key, value in props.items(): + if not (key.endswith("_ref") or key.endswith("Ref")): + continue + ref = self._slugify(value) + if ref: + keys.add(ref) + children = node.get("children") + if isinstance(children, list): + for child in children: + self._collect_canvas_referenced_keys(child, keys) + + def _collect_canvas_component_refs(self, node: Dict[str, Any], refs: set[str]) -> None: + if not isinstance(node, dict): + return + if str(node.get("type") or "") == "component": + component = self._normalize_canvas_component_name(node.get("component")) + props = node.get("props") if isinstance(node.get("props"), dict) else {} + if component == "stat": + value_ref = self._slugify( + props.get("value_ref") + or props.get("valueRef") + or props.get("ref") + or props.get("text_ref") + or props.get("textRef") + ) + label_ref = self._slugify(props.get("label_ref") or props.get("labelRef")) + if 
value_ref: + refs.add(value_ref) + if label_ref: + refs.add(label_ref) + return + if component in {"heading", "text", "quote", "callout"}: + text_ref = self._slugify(props.get("text_ref") or props.get("textRef") or props.get("ref")) + if text_ref: + refs.add(text_ref) + return + if component == "bullets": + items_ref = self._slugify(props.get("items_ref") or props.get("itemsRef") or props.get("ref")) + if items_ref: + refs.add(items_ref) + return + if component == "table": + table_ref = self._slugify(props.get("table_ref") or props.get("tableRef") or props.get("ref")) + if table_ref: + refs.add(table_ref) + return + if component == "figure": + asset_ref = self._slugify( + props.get("asset_ref") + or props.get("assetRef") + or props.get("asset_key") + or props.get("assetKey") + or props.get("ref") + ) + if asset_ref: + refs.add(asset_ref) + return + children = node.get("children") + if isinstance(children, list): + for child in children: + self._collect_canvas_component_refs(child, refs) + + def _normalize_canvas_schema( + self, + *, + slide: Dict[str, Any], + visual_assets: Sequence[Dict[str, Any]], + ) -> Dict[str, Any]: + normalized = dict(slide) + derived_content = self._build_canvas_content(slide=normalized, visual_assets=visual_assets) + content = normalized.get("content") if isinstance(normalized.get("content"), dict) else None + if content is None: + content = derived_content + else: + content = dict(content) + for key in list(content.keys()): + if key != "assets" and re.match(r"^(.+)_cell_(h|\d+)_(\d+)$", self._slugify(key)): + content.pop(key, None) + for key, value in derived_content.items(): + if key == "assets": + continue + if key not in content or content.get(key) in ("", None, []): + content[key] = value + content["assets"] = { + **(derived_content.get("assets") if isinstance(derived_content.get("assets"), dict) else {}), + **(content.get("assets") if isinstance(content.get("assets"), dict) else {}), + } + + blocks = normalized.get("blocks") or [] + 
if not isinstance(normalized.get("root"), dict): + normalized["root"] = self._build_canvas_root_from_blocks( + blocks=blocks if isinstance(blocks, list) else [], + template_key=str(normalized.get("template_key") or ""), + ) + elif isinstance(normalized.get("root"), dict): + normalized["root"] = self._normalize_canvas_node_tree(normalized["root"]) + + visual_spec = self._normalize_canvas_visual_spec( + normalized.get("visual_spec") or normalized.get("visualSpec") + ) + if visual_spec: + normalized["visual_spec"] = visual_spec + normalized.pop("visualSpec", None) + + normalized["schema_version"] = _CANVAS_SCHEMA_VERSION + render_engine = str(normalized.get("render_engine") or normalized.get("renderEngine") or "canvas").strip().lower() + normalized["render_engine"] = "blocks" if render_engine == "blocks" else "canvas" + normalized["content"] = content + if isinstance(normalized.get("root"), dict): + self._repair_canvas_refs(normalized["root"], content=content) + component_refs: set[str] = set() + self._collect_canvas_component_refs(normalized["root"], component_refs) + if component_refs: + normalized["editable_fields"] = self._derive_fields_from_canvas_content(content, component_refs) + normalized.setdefault("layout_family", str(normalized.get("template_key") or "custom")) + normalized.setdefault("constraints", {"min_font_size": 18, "max_font_size": 56, "allow_overflow": False, "fit_mode": "browser_measure"}) + normalized.setdefault("editable_map", {}) + + normalized["canvas_validation"] = self._validate_canvas_schema(normalized) + return normalized - user_payload = { - "language": language, - "style_prompt": style or "", - "outline_summary": outline_summary, + def _validate_canvas_schema(self, slide: Dict[str, Any]) -> Dict[str, Any]: + content = slide.get("content") if isinstance(slide.get("content"), dict) else {} + assets = content.get("assets") if isinstance(content.get("assets"), dict) else {} + defined = { + self._slugify(key) + for key in content.keys() + if 
key != "assets" and self._slugify(key) } + defined.update(f"assets.{self._slugify(key)}" for key in assets.keys() if self._slugify(key)) + + refs: List[Dict[str, str]] = [] + issues: List[Dict[str, Any]] = [] + node_ids: set[str] = set() + root = slide.get("root") + if not isinstance(root, dict): + issues.append({"severity": "error", "code": "missing_root", "message": "Canvas schema root is missing."}) + else: + self._collect_canvas_refs(root, refs, node_ids, issues) + + used_refs: List[str] = [] + missing_refs: List[str] = [] + for ref_item in refs: + ref = ref_item["ref"] + normalized_ref = self._slugify(ref) + defined_key = f"assets.{normalized_ref}" if ref_item["prop"] in {"asset_ref", "assetRef"} else normalized_ref + used_refs.append(ref) + if defined_key not in defined: + suggested = "" + if ref_item["prop"] in {"items_ref", "itemsRef"} and "key_points" in defined: + suggested = "key_points" + elif ref_item["prop"] in {"text_ref", "textRef"} and "title" in defined: + suggested = "title" + elif ref_item["prop"] in {"asset_ref", "assetRef"} and assets: + suggested = next(iter(assets.keys())) + missing_refs.append(ref) + issues.append( + { + "severity": "repairable", + "code": "missing_ref", + "node_id": ref_item["node_id"], + "ref": ref, + "suggested_ref": suggested, + "message": f"Reference '{ref}' does not exist in slide content.", + } + ) - return [ - {"role": "system", "content": system_prompt}, + used_normalized = {self._slugify(ref) for ref in used_refs} + orphan = sorted( + key + for key in defined + if not key.startswith("assets.") and key not in used_normalized + ) + return { + "ok": not any(issue.get("severity") == "error" for issue in issues), + "used_refs": used_refs, + "defined_content_keys": sorted(defined), + "missing_refs": missing_refs, + "orphan_content_keys": orphan, + "empty_components": [], + "issues": issues, + } + + def _repair_canvas_refs(self, node: Dict[str, Any], *, content: Dict[str, Any]) -> None: + if not isinstance(node, dict): + 
return + props = node.get("props") if isinstance(node.get("props"), dict) else None + assets = content.get("assets") if isinstance(content.get("assets"), dict) else {} + content_keys = {self._slugify(key): key for key in content.keys() if key != "assets"} + asset_keys = {self._slugify(key): key for key in assets.keys()} + + if props is not None: + for prop_name in list(props.keys()): + if not (prop_name.endswith("_ref") or prop_name.endswith("Ref")): + continue + raw_ref = str(props.get(prop_name) or "").strip() + normalized_ref = self._slugify(raw_ref) + if prop_name in {"asset_ref", "assetRef"}: + if normalized_ref not in asset_keys and asset_keys: + props[prop_name] = next(iter(asset_keys.values())) + continue + if normalized_ref in content_keys: + props[prop_name] = content_keys[normalized_ref] + continue + if prop_name in {"items_ref", "itemsRef"} and "key_points" in content_keys: + props[prop_name] = content_keys["key_points"] + elif prop_name in {"text_ref", "textRef"} and "title" in content_keys: + props[prop_name] = content_keys["title"] + + children = node.get("children") + if isinstance(children, list): + for child in children: + self._repair_canvas_refs(child, content=content) + + def _build_fallback_blocks( + self, + *, + outline_item: Dict[str, Any], + slide_index: int, + slide_count: int, + theme: Dict[str, Any], + visual_assets: Optional[Sequence[Dict[str, Any]]] = None, + ) -> List[Dict[str, Any]]: + visual_assets = list(visual_assets or [])[:_MAX_INLINE_VISUAL_ASSETS] + has_visual = bool(visual_assets) + key_points = [ + str(item).strip() + for item in (outline_item.get("key_points") or []) + if str(item).strip() + ][:4] + summary = key_points[0] if key_points else str(outline_item.get("layout_description") or "").strip() + takeaway = key_points[-1] if key_points else "Refine the narrative in the editor" + section_template = str(theme.get("section_label_template") or "Slide {page_num:02d}/{slide_count:02d}") + try: + eyebrow = 
section_template.format(page_num=slide_index + 1, slide_count=slide_count) + except Exception: # noqa: BLE001 + eyebrow = f"Slide {slide_index + 1:02d}/{slide_count:02d}" + + blocks: List[Dict[str, Any]] = [ { - "role": "user", - "content": ( - "Create one deck theme for this outline summary:\n\n" - f"{json.dumps(user_payload, ensure_ascii=False, indent=2)}" + "id": "eyebrow", + "type": "text", + "role": "eyebrow", + "content": eyebrow, + "items": [], + "asset_key": "", + "layout": self._normalize_layout_hint( + {"zone": "header", "span": 12, "preferred_width": "full"}, + block_type="text", + role="eyebrow", + order=1, + has_visual_assets=has_visual, + ), + }, + { + "id": "title", + "type": "text", + "role": "title", + "content": str(outline_item.get("title") or f"Slide {slide_index + 1}"), + "items": [], + "asset_key": "", + "layout": self._normalize_layout_hint( + {"zone": "header", "span": 12, "preferred_width": "full", "emphasis": "high"}, + block_type="text", + role="title", + order=2, + has_visual_assets=has_visual, + ), + }, + { + "id": "summary", + "type": "text", + "role": "summary", + "content": summary, + "items": [], + "asset_key": "", + "layout": self._normalize_layout_hint( + {"zone": "main", "span": 7 if has_visual else 12, "preferred_width": "wide"}, + block_type="text", + role="summary", + order=3, + has_visual_assets=has_visual, ), }, ] + if key_points: + blocks.append( + { + "id": "key_points", + "type": "list", + "role": "key_points", + "content": "", + "items": key_points, + "asset_key": "", + "layout": self._normalize_layout_hint( + {"zone": "main", "span": 6 if has_visual else 12, "preferred_width": "wide"}, + block_type="list", + role="key_points", + order=len(blocks) + 1, + has_visual_assets=has_visual, + ), + } + ) + + for asset_index, asset in enumerate(visual_assets): + asset_key = self._slugify(asset.get("key") or "") or self._build_visual_asset_key(asset_index) + blocks.append( + { + "id": f"{asset_key}_{asset_index + 1}", + "type": 
"image", + "role": "main_visual" if asset_index == 0 else "supporting_visual", + "content": "", + "items": [], + "asset_key": asset_key, + "layout": self._normalize_layout_hint( + { + "zone": "aside" if asset_index == 0 else "right", + "span": 6, + "preferred_side": "right", + "preferred_width": "half", + "emphasis": "high" if asset_index == 0 else "medium", + }, + block_type="image", + role="main_visual" if asset_index == 0 else "supporting_visual", + order=len(blocks) + 1, + has_visual_assets=has_visual, + ), + } + ) + + blocks.extend( + [ + { + "id": "takeaway", + "type": "text", + "role": "takeaway", + "content": takeaway, + "items": [], + "asset_key": "", + "layout": self._normalize_layout_hint( + {"zone": "footer", "span": 8, "preferred_width": "wide"}, + block_type="text", + role="takeaway", + order=len(blocks) + 1, + has_visual_assets=has_visual, + ), + }, + { + "id": "footer", + "type": "text", + "role": "footer", + "content": str(theme.get("footer_text") or "Paper2Any Frontend PPT"), + "items": [], + "asset_key": "", + "layout": self._normalize_layout_hint( + {"zone": "footer", "span": 4, "preferred_side": "right", "preferred_width": "third"}, + block_type="text", + role="footer", + order=len(blocks) + 2, + has_visual_assets=has_visual, + ), + }, + ] + ) + return blocks + + def _normalize_legacy_slide_payload( + self, + *, + payload: Dict[str, Any], + outline_item: Dict[str, Any], + slide_index: int, + slide_count: int, + theme: Dict[str, Any], + visual_assets: List[Dict[str, Any]], + fallback_slide: Dict[str, Any], + ) -> Dict[str, Any]: + html_template = payload.get("html_template") or payload.get("html") or "" + css_code = payload.get("css_code") or payload.get("css") or "" + if not isinstance(html_template, str) or not isinstance(css_code, str): + return fallback_slide + if len(html_template) > 16000 or len(css_code) > 20000: + return fallback_slide + if _FORBIDDEN_HTML_RE.search(html_template) or _FORBIDDEN_CSS_RE.search(css_code): + return 
fallback_slide + + normalized_html = self._sanitize_html_template(html_template) + normalized_css = self._sanitize_css(css_code, theme=theme) + editable_fields = self._normalize_fields( + payload.get("editable_fields"), + outline_item=outline_item, + slide_index=slide_index, + ) + if not editable_fields: + return fallback_slide + + normalized_html, attribute_warnings = self._sanitize_attribute_placeholders( + normalized_html, + editable_fields, + ) + if attribute_warnings: + log.warning( + "[Paper2PPTFrontendService] Sanitized attribute placeholders for page %s: %s", + slide_index + 1, + ", ".join(attribute_warnings), + ) + + field_keys = {field["key"] for field in editable_fields} + placeholders = set(_FIELD_PLACEHOLDER_RE.findall(normalized_html)) + image_placeholders = set(_IMAGE_PLACEHOLDER_RE.findall(normalized_html)) + asset_keys = {str(asset.get("key") or "").strip() for asset in visual_assets if str(asset.get("key") or "").strip()} + if not placeholders: + return fallback_slide + if not placeholders.issubset(field_keys): + return fallback_slide + if image_placeholders and not image_placeholders.issubset(asset_keys): + return fallback_slide + if visual_assets and not image_placeholders: + return fallback_slide + + title_value = ( + self._find_field_value(editable_fields, "title") + or outline_item.get("title") + or f"Slide {slide_index + 1}" + ) + blocks = self._build_fallback_blocks( + outline_item=outline_item, + slide_index=slide_index, + slide_count=slide_count, + theme=theme, + visual_assets=visual_assets, + ) + slide = { + "slide_id": str(payload.get("slide_id") or slide_index + 1), + "page_num": slide_index + 1, + "title": str(payload.get("title") or title_value), + "schema_version": _SLIDE_SCHEMA_VERSION, + "layout_mode": "fixed", + "template_key": self._normalize_template_key( + payload.get("template_key") or payload.get("template") or "", + blocks=blocks, + visual_assets=visual_assets, + ), + "blocks": blocks, + "html_template": normalized_html, + 
"css_code": normalized_css, + "editable_fields": self._merge_editable_fields( + base_fields=fallback_slide.get("editable_fields") or [], + override_fields=editable_fields, + ), + "visual_assets": visual_assets, + "visual_spec": self._build_canvas_visual_spec(theme=theme, has_visual_assets=bool(visual_assets)), + "generation_note": str(payload.get("generation_note") or "Normalized from legacy html/css slide payload."), + "status": "done", + } + return self._normalize_canvas_schema(slide=slide, visual_assets=visual_assets) + def _build_messages( self, *, @@ -1218,39 +3020,62 @@ def _build_messages( visual_assets: List[Dict[str, Any]], ) -> List[Dict[str, Any]]: system_prompt = """ -You are an expert academic slide frontend engineer. -Generate a single 16:9 presentation slide as HTML/CSS for a browser-based PPT editor. +You are an expert academic slide information architect. +Generate a single 16:9 presentation slide as Canvas schema JSON for a browser-based PPT editor. Hard requirements: 1. Return JSON only. No markdown fences. No explanation. 2. 
Output schema: { "title": "short string", - "html_template": "HTML string", - "css_code": "CSS string", - "editable_fields": [ - {"key": "title", "label": "Title", "type": "text", "value": "..."}, - {"key": "summary", "label": "Summary", "type": "textarea", "value": "..."}, - {"key": "key_points", "label": "Key Points", "type": "list", "items": ["...", "..."]} - ], + "render_engine": "canvas", + "layout_family": "two_column | text_focus | visual_compare | grid | timeline | custom", + "root": { + "type": "container", + "id": "root", + "style": {"direction": "column", "gap": 24, "align": "stretch"}, + "children": [ + { + "type": "container", + "id": "main", + "style": {"direction": "row", "gap": 24, "weight": 1}, + "children": [ + {"type": "component", "id": "title", "component": "heading", "props": {"text_ref": "title"}} + ] + } + ] + }, + "content": { + "title": "same title text", + "summary": "short summary text", + "key_points": ["same bullet texts"], + "table_1": {"headers": ["Column A", "Column B"], "rows": [["A1", "B1"]]}, + "assets": {"main_visual": {"type": "image", "asset_key": "main_visual"}} + }, + "visual_spec": { + "palette": {"bg": "#0b1020", "panel": "rgba(15,23,42,0.92)", "primary": "#7dd3fc", "secondary": "#38bdf8", "accent": "#f59e0b", "text": "#e2e8f0", "muted": "#94a3b8"}, + "typography": {"title_font_stack": "Georgia, serif", "body_font_stack": "Segoe UI, sans-serif", "title_size": 56, "body_size": 24}, + "surface": {"card_radius": 24, "card_padding": 22, "section_gap": 22}, + "layout": {"safe_margin": 62, "section_gap": 22, "content_gap": 18, "max_columns": 2}, + "component_styles": {"heading": {"font_size": 56, "font_weight": 700}, "figure": {"image_fit": "contain"}} + }, + "constraints": {"min_font_size": 18, "max_font_size": 56, "allow_overflow": false, "fit_mode": "browser_measure"}, "generation_note": "one short sentence" } -3. 
Every visible text in html_template must come from placeholders only: - - text/textarea fields: {{field:key}} - - list fields: {{list:key}} - - controlled images, when required: {{image:key}} -4. css_code must only target .slide-root and its descendants. -5. Do not use external assets, remote fonts, raw image URLs, svg, canvas, script, iframe, video or img tags. -6. The slide must fit inside a 1600x900 canvas with safe margins and no overflow. -7. Use the supplied deck theme so every page looks like the same presentation family. -8. Treat theme_lock as non-negotiable. Do not invent a new palette family, component language, or typography system. -9. Keep titles within 2 lines, with title font 42-60px and body text 18-28px. -10. Prefer grid/flex layouts over brittle absolute positioning. -11. If visual_assets are supplied, reserve layout space and place them using {{image:key}} placeholders. Never write a raw tag yourself. -12. If visual_assets are empty, build a text-first slide using editable text blocks and CSS decoration only. -13. The HTML must contain a single .slide-root root element. -14. If reference deck slides are provided, preserve their shared component grammar, spacing rhythm, and card treatment. -15. Never put {{field:...}}, {{list:...}}, or {{image:...}} placeholders inside HTML attributes like aria-label, title, alt, data-*, href, or style. Placeholders may only appear in element content. +3. Do not output blocks, elements, content_blocks, template_key, html_template, css_code, CSS, SVG, pixel coordinates, percentages, or absolute positions. +4. Every meaningful visible text must appear in content, then root components reference it. +5. Supported node types: container, component. Supported components: heading, text, bullets, quote, stat, callout, figure, table, placeholder. +6. Component refs: heading/text/quote/callout use text_ref; bullets use items_ref; table uses table_ref; figure uses asset_ref. +7. 
Every *_ref in root.props must exactly match a key in content, or assets. for asset_ref. Do not invent refs. +8. For figures, use only supplied visual asset keys. Never invent image URLs or asset ids. If visual_assets are empty, do not create figure components. +9. root.style may express layout intent only: direction, gap, weight, columns, padding, align, justify. Put color, typography, radius, card padding, and image fit in visual_spec. +10. Keep the slide inside a practical 1600x900 presentation canvas using fluid layout intent, not fixed coordinates. +11. Use the supplied deck theme so every page belongs to the same presentation family. +12. Treat theme_lock as non-negotiable. Do not invent a new palette family, typography system, or unrelated component language. +13. Prefer 4-8 meaningful visible components. Avoid over-fragmentation and repeated content. +14. Keep titles within 2 lines, and keep body content concise enough for a readable academic slide. +15. If reference deck slides are provided, preserve their Canvas component grammar, layout family, and visual_spec language. +16. Browser layout measurement will produce layout_ir later; do not output layout_ir. """.strip() outline_payload = { @@ -1280,9 +3105,10 @@ def _build_messages( "Deck identity summary that must stay stable across the whole deck:", json.dumps(deck_identity, ensure_ascii=False, indent=2), ( - "Keep the slide text-editable and visually consistent with the shared deck theme. " - "If visual_assets exist, include them as controlled image slots. " - "If space is tight, simplify layout and tighten spacing instead of enlarging the canvas." + "Produce only Canvas root/content/visual_spec JSON. " + "The frontend will map Canvas nodes into a fluid layout engine, so focus on semantic grouping, hierarchy, " + "layout intent, and explicit visual tokens instead of code-level rendering. " + "If space is tight, merge related ideas into fewer components instead of inventing extra decorative nodes." 
), ] @@ -1297,11 +3123,11 @@ def _build_messages( if current_slide: user_sections.extend( [ - "Current slide JSON for reference:", - json.dumps(current_slide, ensure_ascii=False, indent=2), + "Current slide schema for reference:", + json.dumps(self._summarize_reference_slide(current_slide), ensure_ascii=False, indent=2), ( - "Preserve the same deck component grammar, accent usage, title rhythm, and spacing language " - "from the current slide unless the revision request explicitly changes structure." + "Preserve the same layout family, node naming style, and reading flow from the current slide " + "unless the revision request explicitly changes structure." ), ] ) @@ -1309,7 +3135,7 @@ def _build_messages( user_sections.append(f"Revision request: {edit_prompt}") user_sections.append( - "Ensure the editable_fields fully cover all meaningful visible text shown on the slide." + "Ensure the root tree fully covers all meaningful visible content on the slide, and keep node ids/content keys stable for downstream editing." 
) return [ @@ -1434,65 +3260,144 @@ def _normalize_slide_payload( theme=theme, visual_assets=visual_assets, ) - html_template = payload.get("html_template") or payload.get("html") or "" - css_code = payload.get("css_code") or payload.get("css") or "" - if not isinstance(html_template, str) or not isinstance(css_code, str): - return fallback_slide - if len(html_template) > 16000 or len(css_code) > 20000: - return fallback_slide - if _FORBIDDEN_HTML_RE.search(html_template) or _FORBIDDEN_CSS_RE.search(css_code): - return fallback_slide + raw_root = payload.get("root") + raw_content = payload.get("content") + raw_blocks = payload.get("blocks") + if raw_blocks is None: + raw_blocks = payload.get("elements") + if raw_blocks is None: + raw_blocks = payload.get("content_blocks") or payload.get("contentBlocks") + + if isinstance(raw_root, dict) and isinstance(raw_content, dict): + derived_fields = self._derive_fields_from_canvas_content(raw_content) + if not derived_fields: + derived_fields = self._normalize_fields( + payload.get("editable_fields"), + outline_item=outline_item, + slide_index=slide_index, + ) + editable_fields = self._merge_editable_fields( + base_fields=[], + override_fields=derived_fields, + ) + title_value = ( + self._find_field_value(editable_fields, "title") + or str(raw_content.get("title") or "").strip() + or outline_item.get("title") + or f"Slide {slide_index + 1}" + ) + visual_spec = ( + self._normalize_canvas_visual_spec(payload.get("visual_spec") or payload.get("visualSpec")) + or self._build_canvas_visual_spec(theme=theme, has_visual_assets=bool(visual_assets)) + ) + slide = { + "slide_id": str(payload.get("slide_id") or slide_index + 1), + "page_num": slide_index + 1, + "title": str(payload.get("title") or title_value), + "schema_version": _CANVAS_SCHEMA_VERSION, + "render_engine": "canvas", + "layout_mode": self._normalize_layout_mode(payload.get("layout_mode") or payload.get("layoutMode")), + "template_key": self._normalize_template_key( + 
payload.get("template_key") or payload.get("template") or payload.get("layout_template") or payload.get("layoutTemplate"), + blocks=[], + visual_assets=visual_assets, + ), + "layout_family": str(payload.get("layout_family") or payload.get("layoutFamily") or "custom").strip() or "custom", + "blocks": [], + "html_template": str(fallback_slide.get("html_template") or ""), + "css_code": str(fallback_slide.get("css_code") or ""), + "editable_fields": editable_fields, + "visual_assets": visual_assets, + "root": raw_root, + "content": raw_content, + "visual_spec": visual_spec, + "generation_note": str(payload.get("generation_note") or "Canvas-only slide payload."), + "status": "done", + } + if isinstance(payload.get("constraints"), dict): + slide["constraints"] = payload["constraints"] + if isinstance(payload.get("editable_map"), dict): + slide["editable_map"] = payload["editable_map"] + return self._normalize_canvas_schema(slide=slide, visual_assets=visual_assets) + + if raw_blocks is None: + return self._normalize_legacy_slide_payload( + payload=payload, + outline_item=outline_item, + slide_index=slide_index, + slide_count=slide_count, + theme=theme, + visual_assets=visual_assets, + fallback_slide=fallback_slide, + ) - normalized_html = self._sanitize_html_template(html_template) - normalized_css = self._sanitize_css(css_code, theme=theme) - editable_fields = self._normalize_fields( - payload.get("editable_fields"), + normalized_blocks = self._normalize_blocks( + raw_blocks, outline_item=outline_item, - slide_index=slide_index, + visual_assets=visual_assets, ) - if not editable_fields: + if not normalized_blocks: return fallback_slide - normalized_html, attribute_warnings = self._sanitize_attribute_placeholders( - normalized_html, - editable_fields, - ) - if attribute_warnings: - log.warning( - "[Paper2PPTFrontendService] Sanitized attribute placeholders for page %s: %s", - slide_index + 1, - ", ".join(attribute_warnings), + derived_fields = 
self._derive_fields_from_blocks(normalized_blocks) + if not derived_fields: + derived_fields = self._normalize_fields( + payload.get("editable_fields"), + outline_item=outline_item, + slide_index=slide_index, ) - field_keys = {field["key"] for field in editable_fields} - placeholders = set(_FIELD_PLACEHOLDER_RE.findall(normalized_html)) - image_placeholders = set(_IMAGE_PLACEHOLDER_RE.findall(normalized_html)) - asset_keys = {str(asset.get("key") or "").strip() for asset in visual_assets if str(asset.get("key") or "").strip()} - if not placeholders: - return fallback_slide - if not placeholders.issubset(field_keys): - return fallback_slide - if image_placeholders and not image_placeholders.issubset(asset_keys): - return fallback_slide - if visual_assets and not image_placeholders: - return fallback_slide + editable_fields = self._merge_editable_fields( + base_fields=fallback_slide.get("editable_fields") or [], + override_fields=derived_fields, + ) title_value = ( self._find_field_value(editable_fields, "title") + or next( + ( + str(block.get("content") or "").strip() + for block in normalized_blocks + if str(block.get("role") or "") == "title" and str(block.get("content") or "").strip() + ), + "", + ) or outline_item.get("title") or f"Slide {slide_index + 1}" ) - return { + visual_spec = ( + self._normalize_canvas_visual_spec(payload.get("visual_spec") or payload.get("visualSpec")) + or self._build_canvas_visual_spec(theme=theme, has_visual_assets=bool(visual_assets)) + ) + slide = { "slide_id": str(payload.get("slide_id") or slide_index + 1), "page_num": slide_index + 1, "title": str(payload.get("title") or title_value), - "html_template": normalized_html, - "css_code": normalized_css, + "schema_version": _SLIDE_SCHEMA_VERSION, + "layout_mode": self._normalize_layout_mode(payload.get("layout_mode") or payload.get("layoutMode")), + "template_key": self._normalize_template_key( + payload.get("template_key") or payload.get("template") or 
payload.get("layout_template") or payload.get("layoutTemplate"), + blocks=normalized_blocks, + visual_assets=visual_assets, + ), + "blocks": normalized_blocks, + "html_template": str(fallback_slide.get("html_template") or ""), + "css_code": str(fallback_slide.get("css_code") or ""), "editable_fields": editable_fields, "visual_assets": visual_assets, - "generation_note": str(payload.get("generation_note") or ""), + "visual_spec": visual_spec, + "generation_note": str(payload.get("generation_note") or "Schema-driven slide payload."), "status": "done", } + if isinstance(payload.get("root"), dict): + slide["root"] = payload["root"] + if isinstance(payload.get("content"), dict): + slide["content"] = payload["content"] + if isinstance(payload.get("constraints"), dict): + slide["constraints"] = payload["constraints"] + if isinstance(payload.get("editable_map"), dict): + slide["editable_map"] = payload["editable_map"] + return self._normalize_canvas_schema(slide=slide, visual_assets=visual_assets) def _normalize_review_payload( self, @@ -2146,6 +4051,37 @@ def _load_reference_slides( return [self._summarize_reference_slide(slide) for slide in references] def _summarize_reference_slide(self, slide: Dict[str, Any]) -> Dict[str, Any]: + blocks = slide.get("blocks") or slide.get("elements") or [] + if isinstance(blocks, list) and blocks: + block_outline: List[Dict[str, Any]] = [] + for raw_block in blocks[:8]: + if not isinstance(raw_block, dict): + continue + layout = raw_block.get("layout") or {} + block_outline.append( + { + "id": str(raw_block.get("id") or "").strip(), + "type": str(raw_block.get("type") or "").strip(), + "role": str(raw_block.get("role") or "").strip(), + "zone": str(layout.get("zone") or "").strip(), + "span": layout.get("span"), + "asset_key": str(raw_block.get("asset_key") or raw_block.get("assetKey") or "").strip(), + } + ) + + return { + "page_num": int(slide.get("page_num") or 0), + "title": str(slide.get("title") or "").strip(), + "template_key": 
str(slide.get("template_key") or slide.get("templateKey") or "").strip(), + "layout_mode": str(slide.get("layout_mode") or slide.get("layoutMode") or "").strip(), + "field_keys": [ + str(field.get("key") or "").strip() + for field in (slide.get("editable_fields") or []) + if isinstance(field, dict) and str(field.get("key") or "").strip() + ][:10], + "block_outline": block_outline, + } + html_template = str(slide.get("html_template") or "") css_code = str(slide.get("css_code") or "") editable_fields = slide.get("editable_fields") or [] @@ -2523,17 +4459,35 @@ def _build_fallback_slide( }, ) - return { + blocks = self._build_fallback_blocks( + outline_item=outline_item, + slide_index=slide_index, + slide_count=slide_count, + theme=theme, + visual_assets=visual_assets, + ) + template_key = self._normalize_template_key( + "split_media" if has_visual else "text_focus", + blocks=blocks, + visual_assets=visual_assets, + ) + slide = { "slide_id": str(slide_index + 1), "page_num": slide_index + 1, "title": str(outline_item.get("title") or f"Slide {slide_index + 1}"), + "schema_version": _SLIDE_SCHEMA_VERSION, + "layout_mode": "fluid", + "template_key": template_key, + "blocks": blocks, "html_template": html_template, "css_code": css_code, "editable_fields": editable_fields, "visual_assets": visual_assets, + "visual_spec": self._build_canvas_visual_spec(theme=theme, has_visual_assets=has_visual), "generation_note": "Built-in fallback template", "status": "done", } + return self._normalize_canvas_schema(slide=slide, visual_assets=visual_assets) def _sanitize_html_template(self, html_template: str) -> str: cleaned = re.sub(r"<\s*/?\s*(html|head|body)\b[^>]*>", "", html_template, flags=re.IGNORECASE) @@ -2658,6 +4612,14 @@ def _write_slide_spec(self, slides_dir: Path, slide: Dict[str, Any]) -> None: encoding="utf-8", ) + raw_payload = slide.get("_raw_ai_payload") + if raw_payload is not None: + raw_path = slides_dir / f"page_{page_num - 1:03d}.raw_ai.json" + 
raw_path.write_text( + json.dumps(raw_payload, ensure_ascii=False, indent=2), + encoding="utf-8", + ) + def _sync_deck_manifest(self, slides_dir: Path) -> None: slides: List[Dict[str, Any]] = [] for path in sorted(slides_dir.glob("page_*.json")): @@ -2674,6 +4636,23 @@ def _sync_deck_manifest(self, slides_dir: Path) -> None: encoding="utf-8", ) + def _write_raw_ai_manifest(self, slides_dir: Path, slides: Sequence[Dict[str, Any]]) -> None: + raw_entries = [ + { + "page_num": int(slide.get("page_num") or 0), + "title": str(slide.get("title") or ""), + "raw_ai_payload": slide.get("_raw_ai_payload"), + } + for slide in slides + if slide.get("_raw_ai_payload") is not None + ] + if not raw_entries: + return + (slides_dir / "frontend_raw_ai.json").write_text( + json.dumps(raw_entries, ensure_ascii=False, indent=2), + encoding="utf-8", + ) + def _parse_json_text(self, raw_text: Optional[str], field_name: str) -> Optional[Dict[str, Any]]: if raw_text is None or not raw_text.strip(): return None diff --git a/frontend-workflow/package-lock.json b/frontend-workflow/package-lock.json index 2589b264..c0bbbd27 100644 --- a/frontend-workflow/package-lock.json +++ b/frontend-workflow/package-lock.json @@ -15,6 +15,7 @@ "i18next-browser-languagedetector": "^8.2.0", "lucide-react": "^0.294.0", "mermaid": "^10.9.5", + "pptxgenjs": "^4.0.1", "react": "^18.2.0", "react-dom": "^18.2.0", "react-drawio": "^1.0.7", @@ -2262,6 +2263,12 @@ "dev": true, "license": "MIT" }, + "node_modules/core-util-is": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz", + "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==", + "license": "MIT" + }, "node_modules/cose-base": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/cose-base/-/cose-base-1.0.3.tgz", @@ -3089,6 +3096,12 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/https": { + "version": "1.0.0", + "resolved": 
"https://registry.npmjs.org/https/-/https-1.0.0.tgz", + "integrity": "sha512-4EC57ddXrkaF0x83Oj8sM6SLQHAWXw90Skqu2M4AEWENZ3F02dFJE/GARA8igO79tcgYqGrD7ae4f5L3um2lgg==", + "license": "ISC" + }, "node_modules/i18next": { "version": "25.7.4", "resolved": "https://registry.npmjs.org/i18next/-/i18next-25.7.4.tgz", @@ -3150,6 +3163,33 @@ "node": ">=0.10.0" } }, + "node_modules/image-size": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/image-size/-/image-size-1.2.1.tgz", + "integrity": "sha512-rH+46sQJ2dlwfjfhCyNx5thzrv+dtmBIhPHk0zgRUukHzZ/kRueTJXoYYsclBaKcSMBWuGbOFXtioLpzTb5euw==", + "license": "MIT", + "dependencies": { + "queue": "6.0.2" + }, + "bin": { + "image-size": "bin/image-size.js" + }, + "engines": { + "node": ">=16.x" + } + }, + "node_modules/immediate": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/immediate/-/immediate-3.0.6.tgz", + "integrity": "sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==", + "license": "MIT" + }, + "node_modules/inherits": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", + "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", + "license": "ISC" + }, "node_modules/inline-style-parser": { "version": "0.2.7", "resolved": "https://registry.npmjs.org/inline-style-parser/-/inline-style-parser-0.2.7.tgz", @@ -3273,6 +3313,12 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/isarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", + "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==", + "license": "MIT" + }, "node_modules/jiti": { "version": "1.21.7", "dev": true, @@ -3307,6 +3353,18 @@ "node": ">=6" } }, + "node_modules/jszip": { + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz", + 
"integrity": "sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==", + "license": "(MIT OR GPL-3.0-or-later)", + "dependencies": { + "lie": "~3.3.0", + "pako": "~1.0.2", + "readable-stream": "~2.3.6", + "setimmediate": "^1.0.5" + } + }, "node_modules/katex": { "version": "0.16.28", "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.28.tgz", @@ -3352,6 +3410,15 @@ "integrity": "sha512-8h2oVEZNktL4BH2JCOI90iD1yXwL6iNW7KcCKT2QZgQJR2vbqDsldCTPRU9NifTCqHZci57XvQQ15YTu+sTYPg==", "license": "MIT" }, + "node_modules/lie": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/lie/-/lie-3.3.0.tgz", + "integrity": "sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==", + "license": "MIT", + "dependencies": { + "immediate": "~3.0.5" + } + }, "node_modules/lilconfig": { "version": "3.1.3", "dev": true, @@ -5905,6 +5972,12 @@ "node": ">= 6" } }, + "node_modules/pako": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz", + "integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==", + "license": "(MIT AND Zlib)" + }, "node_modules/parse-entities": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/parse-entities/-/parse-entities-4.0.2.tgz", @@ -6110,6 +6183,39 @@ "dev": true, "license": "MIT" }, + "node_modules/pptxgenjs": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/pptxgenjs/-/pptxgenjs-4.0.1.tgz", + "integrity": "sha512-TeJISr8wouAuXw4C1F/mC33xbZs/FuEG6nH9FG1Zj+nuPcGMP5YRHl6X+j3HSUnS1f3at6k75ZZXPMZlA5Lj9A==", + "license": "MIT", + "dependencies": { + "@types/node": "^22.8.1", + "https": "^1.0.0", + "image-size": "^1.2.1", + "jszip": "^3.10.1" + } + }, + "node_modules/pptxgenjs/node_modules/@types/node": { + "version": "22.19.17", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.19.17.tgz", + "integrity": 
"sha512-wGdMcf+vPYM6jikpS/qhg6WiqSV/OhG+jeeHT/KlVqxYfD40iYJf9/AE1uQxVWFvU7MipKRkRv8NSHiCGgPr8Q==", + "license": "MIT", + "dependencies": { + "undici-types": "~6.21.0" + } + }, + "node_modules/pptxgenjs/node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "license": "MIT" + }, + "node_modules/process-nextick-args": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", + "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==", + "license": "MIT" + }, "node_modules/property-information": { "version": "7.1.0", "resolved": "https://registry.npmjs.org/property-information/-/property-information-7.1.0.tgz", @@ -6120,6 +6226,15 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/queue": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/queue/-/queue-6.0.2.tgz", + "integrity": "sha512-iHZWu+q3IdFZFX36ro/lKBkSvfkztY5Y7HMiPlOUjhupPcG2JMfst2KKEpu5XndviX/3UhFbRngUPNKtgvtZiA==", + "license": "MIT", + "dependencies": { + "inherits": "~2.0.3" + } + }, "node_modules/queue-microtask": { "version": "1.2.3", "dev": true, @@ -6264,6 +6379,21 @@ "pify": "^2.3.0" } }, + "node_modules/readable-stream": { + "version": "2.3.8", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz", + "integrity": "sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==", + "license": "MIT", + "dependencies": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, "node_modules/readdirp": { "version": "3.6.0", "dev": true, @@ -6938,6 
+7068,12 @@ "node": ">=6" } }, + "node_modules/safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", + "license": "MIT" + }, "node_modules/safer-buffer": { "version": "2.1.2", "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", @@ -6959,6 +7095,12 @@ "semver": "bin/semver.js" } }, + "node_modules/setimmediate": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/setimmediate/-/setimmediate-1.0.5.tgz", + "integrity": "sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==", + "license": "MIT" + }, "node_modules/source-map-js": { "version": "1.2.1", "dev": true, @@ -6977,6 +7119,15 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/string_decoder": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", + "license": "MIT", + "dependencies": { + "safe-buffer": "~5.1.0" + } + }, "node_modules/stringify-entities": { "version": "4.0.4", "resolved": "https://registry.npmjs.org/stringify-entities/-/stringify-entities-4.0.4.tgz", @@ -7368,7 +7519,6 @@ }, "node_modules/util-deprecate": { "version": "1.0.2", - "dev": true, "license": "MIT" }, "node_modules/uuid": { diff --git a/frontend-workflow/package.json b/frontend-workflow/package.json index a1b65fce..4fdb81e5 100644 --- a/frontend-workflow/package.json +++ b/frontend-workflow/package.json @@ -15,6 +15,7 @@ "i18next-browser-languagedetector": "^8.2.0", "lucide-react": "^0.294.0", "mermaid": "^10.9.5", + "pptxgenjs": "^4.0.1", "react": "^18.2.0", "react-dom": "^18.2.0", "react-drawio": "^1.0.7", diff --git a/frontend-workflow/src/components/paper2ppt/FrontendCompleteStep.tsx 
b/frontend-workflow/src/components/paper2ppt/FrontendCompleteStep.tsx index 3bc6e8c2..211286ef 100644 --- a/frontend-workflow/src/components/paper2ppt/FrontendCompleteStep.tsx +++ b/frontend-workflow/src/components/paper2ppt/FrontendCompleteStep.tsx @@ -7,11 +7,12 @@ import { RotateCcw, Sparkles, } from 'lucide-react'; -import { FrontendSlide } from './types'; +import { FrontendDeckTheme, FrontendSlide } from './types'; import FrontendSlidePreview from './FrontendSlidePreview'; interface FrontendCompleteStepProps { slides: FrontendSlide[]; + deckTheme?: FrontendDeckTheme | null; downloadUrl: string | null; pdfPreviewUrl: string | null; isGeneratingFinal: boolean; @@ -25,6 +26,7 @@ interface FrontendCompleteStepProps { const FrontendCompleteStep: React.FC = ({ slides, + deckTheme = null, downloadUrl, pdfPreviewUrl, isGeneratingFinal, @@ -52,7 +54,7 @@ const FrontendCompleteStep: React.FC = ({
{slides.map((slide) => (
- +

第 {slide.pageNum} 页 · {slide.title}

@@ -70,7 +72,7 @@ const FrontendCompleteStep: React.FC = ({ > {isGeneratingFinal ? ( <> - 正在截图并导出... + 正在导出... ) : ( <> @@ -79,7 +81,7 @@ const FrontendCompleteStep: React.FC = ({ )}

- 导出会将每一页前端渲染结果截图,再打包成整页图片版 PPTX / PDF。 + 默认导出为可编辑 PPTX;仅在缺少 Canvas 布局信息时回退为截图版。

) : ( diff --git a/frontend-workflow/src/components/paper2ppt/FrontendGenerateStep.tsx b/frontend-workflow/src/components/paper2ppt/FrontendGenerateStep.tsx index 2a2e6bb9..e4219bf0 100644 --- a/frontend-workflow/src/components/paper2ppt/FrontendGenerateStep.tsx +++ b/frontend-workflow/src/components/paper2ppt/FrontendGenerateStep.tsx @@ -1,10 +1,11 @@ -import React, { useEffect, useState } from 'react'; +import React, { useEffect, useRef, useState } from 'react'; import { AlertCircle, ArrowLeft, CheckCircle2, Code2, FileText, + ImagePlus, Loader2, MonitorSmartphone, Plus, @@ -12,10 +13,13 @@ import { RotateCcw, ScanSearch, ShieldCheck, + Table2, Trash2, } from 'lucide-react'; import { FrontendDeckTheme, FrontendSlide, SlideOutline, Step } from './types'; +import { parseFrontendInsertZoneTarget } from './types'; import FrontendSlidePreview from './FrontendSlidePreview'; +import { buildFrontendSlideMarkup, isSchemaDrivenSlide } from './frontendSlideUtils'; interface FrontendGenerateStepProps { outlineData: SlideOutline[]; @@ -41,6 +45,11 @@ interface FrontendGenerateStepProps { addListItem: (slideIndex: number, fieldKey: string) => void; removeListItem: (slideIndex: number, fieldKey: string, itemIndex: number) => void; replaceVisualAsset: (slideIndex: number, imageKey: string, file: File) => Promise; + insertTextBlock: (slideIndex: number, targetBlockId?: string) => void; + insertCalloutBlock: (slideIndex: number, targetBlockId?: string) => void; + insertTableBlock: (slideIndex: number, targetBlockId?: string) => void; + insertImageBlock: (slideIndex: number, file: File, targetBlockId?: string) => Promise; + updateLayoutIr: (slideIndex: number, layoutIr: FrontendSlide['layoutIr']) => void; } const FrontendGenerateStep: React.FC = ({ @@ -67,12 +76,40 @@ const FrontendGenerateStep: React.FC = ({ addListItem, removeListItem, replaceVisualAsset, + insertTextBlock, + insertCalloutBlock, + insertTableBlock, + insertImageBlock, + updateLayoutIr, }) => { + const 
insertImageInputRef = useRef(null); const [panelMode, setPanelMode] = useState<'preview' | 'code'>('preview'); const [draftHtml, setDraftHtml] = useState(''); const [draftCss, setDraftCss] = useState(''); const [codeStatus, setCodeStatus] = useState(null); + const [selectedBlockId, setSelectedBlockId] = useState(null); + const [hoveredBlockId, setHoveredBlockId] = useState(null); + const [lastInsertionBlockId, setLastInsertionBlockId] = useState(null); const currentSlide = frontendSlides[currentSlideIndex]; + const currentSlideIsSchema = currentSlide ? isSchemaDrivenSlide(currentSlide) : false; + const currentCanvasValidation = currentSlide?.canvasValidation; + const currentOverflowIssues = currentSlide?.layoutIr?.overflowIssues || []; + const activeInsertionBlockId = selectedBlockId || hoveredBlockId || lastInsertionBlockId || null; + const activeInsertionZone = parseFrontendInsertZoneTarget(activeInsertionBlockId); + const describeInsertTarget = (target: string | null) => { + const zone = parseFrontendInsertZoneTarget(target); + if (zone) { + const zoneLabel = zone === 'left' + ? '左侧空白' + : zone === 'right' + ? '右侧空白' + : zone === 'main' + ? '主区域空白' + : `${zone} 空白`; + return `新增到${zoneLabel}`; + } + return target ? `插入到 ${target}` : '默认插入到主区域'; + }; const outlineSlide = outlineData[currentSlideIndex]; const isCodeDirty = draftHtml !== (currentSlide?.htmlTemplate || '') || draftCss !== (currentSlide?.cssCode || ''); const busyMessage = taskMessage || (currentSlide?.status === 'processing' ? '当前页仍在生成中,请稍候。' : '后台任务仍在处理中,请稍候。'); @@ -95,13 +132,40 @@ const FrontendGenerateStep: React.FC = ({ : currentSlide.status !== 'done' ? '当前页尚未完成生成' : ''; + const renderedHtmlValue = currentSlide ? 
buildFrontendSlideMarkup(currentSlide, deckTheme) : ''; useEffect(() => { setDraftHtml(currentSlide?.htmlTemplate || ''); setDraftCss(currentSlide?.cssCode || ''); setCodeStatus(null); + setSelectedBlockId(null); + setHoveredBlockId(null); + setLastInsertionBlockId(null); }, [currentSlide?.slideId, currentSlide?.htmlTemplate, currentSlide?.cssCode]); + const handleSelectBlock = (blockId: string | null) => { + setSelectedBlockId(blockId); + if (blockId) { + setLastInsertionBlockId(blockId); + } + }; + + const handleHoverBlock = (blockId: string | null) => { + setHoveredBlockId(blockId); + if (blockId) { + setLastInsertionBlockId(blockId); + } + }; + + const handleInsertImageChange = async (event: React.ChangeEvent) => { + const file = event.target.files?.[0]; + event.target.value = ''; + if (!file || !currentSlide) { + return; + } + await insertImageBlock(currentSlideIndex, file, activeInsertionBlockId || undefined); + }; + return (
@@ -177,7 +241,7 @@ const FrontendGenerateStep: React.FC = ({ : 'bg-white/5 text-gray-300 hover:bg-white/10' }`} > - 代码 + {currentSlideIsSchema ? '模板' : '代码'}
@@ -203,15 +267,19 @@ const FrontendGenerateStep: React.FC = ({ ) : isGenerating && currentSlide?.status === 'processing' ? (
-

正在生成这一页的前端代码...

+

正在生成这一页的结构化页面...

- {taskMessage || '大模型正在编排 HTML/CSS 模板'} + {taskMessage || '大模型正在规划模板选择、blocks 和图片槽位'}

) : currentSlide ? ( updateFieldValue(currentSlideIndex, fieldKey, value) } @@ -224,12 +292,79 @@ const FrontendGenerateStep: React.FC = ({ onReplaceImage={(imageKey, file) => replaceVisualAsset(currentSlideIndex, imageKey, file) } + onLayoutIrChange={(layoutIr) => updateLayoutIr(currentSlideIndex, layoutIr)} /> ) : (
等待生成
) + ) : currentSlideIsSchema ? ( +
+
+
+ Schema-Driven Template +
+
+
+
Render Engine
+
+ {currentSlide?.renderEngine || 'canvas'} +
+
+
+
Template Key
+
{currentSlide?.templateKey || 'auto'}
+
+
+
Canvas Valid
+
+ {currentCanvasValidation ? String(currentCanvasValidation.ok) : 'n/a'} +
+
+
+
Layout Nodes
+
{currentSlide?.layoutIr?.nodes?.length || 0}
+
+
+
Overflow
+
0 ? 'text-rose-200' : 'text-emerald-200'}`}> + {currentOverflowIssues.length} +
+
+
+
+
+
Missing Refs
+
+ {currentCanvasValidation?.missingRefs?.length + ? currentCanvasValidation.missingRefs.join(', ') + : 'none'} +
+
+
+
Orphan Content
+
+ {currentCanvasValidation?.orphanContentKeys?.length + ? currentCanvasValidation.orphanContentKeys.join(', ') + : 'none'} +
+
+
+
+ 当前页默认由 Canvas 引擎负责排版,并在后台采集浏览器真实 layout_ir。 +
+
+
+
Converted HTML
+