From 40136af71275cdd468a7f9242c779f5358b96d9f Mon Sep 17 00:00:00 2001
From: cxbsoft <cxbsoft@bsot.cn>
Date: Tue, 16 Jun 2026 22:24:33 +0800
Subject: [PATCH] Add image upload parsing

---
 etc/config.production.yaml            |   8 ++
 etc/config.yaml                       |   8 ++
 models/knowledge/industryKnowledge.go |   6 +-
 models/knowledge/knowledge.go         |   6 +-
 models/knowledge/orgKnowledge.go      |   6 +-
 service/task_executor/agents.py       |  25 +++++-
 service/task_executor/parse_doc.py    | 110 +++++++++++++++++++++++++-
 service/task_executor/prompts.py      |  38 ++++++++-
 8 files changed, 192 insertions(+), 15 deletions(-)

diff --git a/etc/config.production.yaml b/etc/config.production.yaml
index 50dab65..a10c971 100644
--- a/etc/config.production.yaml
+++ b/etc/config.production.yaml
@@ -191,6 +191,7 @@ KnowledgeBase:
       - image/jpg
       - image/gif
       - image/bmp
+      - image/webp
       - image/vnd.dwg
       - image/vnd.dxf
       - video/mp4
@@ -216,6 +217,7 @@ KnowledgeBase:
       - png
       - gif
       - bmp
+      - webp
       - dwg
       - mp4
       - mkv
@@ -232,6 +234,12 @@ KnowledgeBase:
       - xlsx
       - xls
       - ppt
+      - jpg
+      - jpeg
+      - png
+      - gif
+      - bmp
+      - webp
     
 YunwoToolbox:
   BaseURL: https://tools.yunwoai.com
diff --git a/etc/config.yaml b/etc/config.yaml
index 7a34f04..bce9947 100644
--- a/etc/config.yaml
+++ b/etc/config.yaml
@@ -193,6 +193,7 @@ KnowledgeBase:
       - image/jpg
       - image/gif
       - image/bmp
+      - image/webp
       - image/vnd.dwg
       - image/vnd.dxf
       - video/mp4
@@ -218,6 +219,7 @@ KnowledgeBase:
       - png
       - gif
       - bmp
+      - webp
       - dwg
       - mp4
       - mkv
@@ -234,6 +236,12 @@ KnowledgeBase:
       - xlsx
       - xls
       - ppt
+      - jpg
+      - jpeg
+      - png
+      - gif
+      - bmp
+      - webp
 
 YunwoToolbox:
   BaseURL: https://tools.yunwoai.com
diff --git a/models/knowledge/industryKnowledge.go b/models/knowledge/industryKnowledge.go
index e9765e0..cbbc5b8 100644
--- a/models/knowledge/industryKnowledge.go
+++ b/models/knowledge/industryKnowledge.go
@@ -674,7 +674,7 @@ func (industryKnowledgeModel *IndustryKnowledgeModel) getFolderTrace(industryID
 
 func (industryKnowledgeModel *IndustryKnowledgeModel) UploadFile(userID uint, industry_id uint, file multipart.File, info *multipart.FileHeader, folder_id string, autoParse string, extractPicture string, describePicture string, llmDescribe string) (map[string]interface{}, error) {
 	fileName := info.Filename
-	fileExtension := fileName[strings.LastIndex(fileName, ".")+1:]
+	fileExtension := strings.ToLower(fileName[strings.LastIndex(fileName, ".")+1:])
 	isAllowedFileType := false
 	isAllowedFileExtension := false
 	permission, _ := industryKnowledgeModel.GetIndustryPermission(userID, industry_id)
@@ -703,7 +703,7 @@ func (industryKnowledgeModel *IndustryKnowledgeModel) UploadFile(userID uint, in
 		}
 	}
 	for _, allowedFileExtension := range AllowedFileExtensions {
-		if fileExtension == allowedFileExtension {
+		if fileExtension == strings.ToLower(allowedFileExtension) {
 			isAllowedFileExtension = true
 			break
 		}
@@ -771,7 +771,7 @@ func (industryKnowledgeModel *IndustryKnowledgeModel) UploadFile(userID uint, in
 	if autoParse == "true" {
 		isParsableExtension := false
 		for _, parsableFileExtension := range ParsableExtensions {
-			if fileExtension == parsableFileExtension {
+			if fileExtension == strings.ToLower(parsableFileExtension) {
 				isParsableExtension = true
 				break
 			}
diff --git a/models/knowledge/knowledge.go b/models/knowledge/knowledge.go
index fbc619b..eed3c00 100644
--- a/models/knowledge/knowledge.go
+++ b/models/knowledge/knowledge.go
@@ -400,7 +400,7 @@ func (knowledgeModel *KnowledgeModel) getFolderTrace(userID uint, folderID uint)
 
 func (knowledgeModel *KnowledgeModel) UploadFile(userID uint, file multipart.File, info *multipart.FileHeader, folder_id string, autoParse string, extractPicture string, describePicture string, llm_describe string) (map[string]interface{}, error) {
 	fileName := info.Filename
-	fileExtension := fileName[strings.LastIndex(fileName, ".")+1:]
+	fileExtension := strings.ToLower(fileName[strings.LastIndex(fileName, ".")+1:])
 	isAllowedFileType := false
 	isAllowedFileExtension := false
 	for _, allowedFileType := range AllowedFileTypes {
@@ -410,7 +410,7 @@ func (knowledgeModel *KnowledgeModel) UploadFile(userID uint, file multipart.Fil
 		}
 	}
 	for _, allowedFileExtension := range AllowedFileExtensions {
-		if fileExtension == allowedFileExtension {
+		if fileExtension == strings.ToLower(allowedFileExtension) {
 			isAllowedFileExtension = true
 			break
 		}
@@ -493,7 +493,7 @@ func (knowledgeModel *KnowledgeModel) UploadFile(userID uint, file multipart.Fil
 	if autoParse == "true" {
 		isParsableExtension := false
 		for _, parsableFileExtension := range ParsableExtensions {
-			if fileExtension == parsableFileExtension {
+			if fileExtension == strings.ToLower(parsableFileExtension) {
 				isParsableExtension = true
 				break
 			}
diff --git a/models/knowledge/orgKnowledge.go b/models/knowledge/orgKnowledge.go
index 667fd6c..1b0c4ee 100644
--- a/models/knowledge/orgKnowledge.go
+++ b/models/knowledge/orgKnowledge.go
@@ -474,7 +474,7 @@ func (orgKnowledgeModel *OrgKnowledgeModel) getFolderTrace(orgID uint, folderID
 
 func (orgKnowledgeModel *OrgKnowledgeModel) UploadFile(userID uint, file multipart.File, info *multipart.FileHeader, folder_id string, autoParse string, extractPicture string, describePicture string, llmDescribe string) (map[string]interface{}, error) {
 	fileName := info.Filename
-	fileExtension := fileName[strings.LastIndex(fileName, ".")+1:]
+	fileExtension := strings.ToLower(fileName[strings.LastIndex(fileName, ".")+1:])
 	isAllowedFileType := false
 	isAllowedFileExtension := false
 	permission, _ := orgKnowledgeModel.GetOrgPermission(userID)
@@ -501,7 +501,7 @@ func (orgKnowledgeModel *OrgKnowledgeModel) UploadFile(userID uint, file multipa
 		}
 	}
 	for _, allowedFileExtension := range AllowedFileExtensions {
-		if fileExtension == allowedFileExtension {
+		if fileExtension == strings.ToLower(allowedFileExtension) {
 			isAllowedFileExtension = true
 			break
 		}
@@ -569,7 +569,7 @@ func (orgKnowledgeModel *OrgKnowledgeModel) UploadFile(userID uint, file multipa
 	if autoParse == "true" {
 		isParsableExtension := false
 		for _, parsableFileExtension := range ParsableExtensions {
-			if fileExtension == parsableFileExtension {
+			if fileExtension == strings.ToLower(parsableFileExtension) {
 				isParsableExtension = true
 				break
 			}
diff --git a/service/task_executor/agents.py b/service/task_executor/agents.py
index d3e48fd..44f9cb9 100644
--- a/service/task_executor/agents.py
+++ b/service/task_executor/agents.py
@@ -89,10 +89,17 @@ def paragraph_summary(document_content,doc_name=None):
         return [],""
     return list(set(response.get("key_words",[]))), response.get("summary", "")
 
-def image_summary(image_url,context = "",local=False):
-    history = prompts.build_prompts_image(prompts.image_summary_prompt, image_url,context)
+def image_summary(image_url,context = "",local=False, prompt=None):
+    history = prompts.build_prompts_image(prompt or prompts.image_summary_prompt, image_url,context, local=local)
     response = llm_clients.get_model_response_json(
-        image_summary_model, history, 0.7, streamMode=True,continueing_prompt=False)
+        image_summary_model,
+        history,
+        0.7,
+        streamMode=True,
+        continueing_prompt=False,
+        max_completion_tokens=2048,
+        max_retries=3,
+    )
     # print("Response:", response)
     
     # 检查响应是否为None或不是字典类型
@@ -104,6 +111,18 @@ def image_summary(image_url,context = "",local=False):
         return image_url, ""
     return response.get("title", ""), response.get("description", "")
 
+def uploaded_image_summary(image_data_url, file_name="", image_info=None):
+    """Summarize a directly uploaded image using the uploaded-image prompt."""
+    # File name and basic metadata reach the model as a JSON context string.
+    metadata = {"file_name": file_name, "image_info": image_info or {}}
+    context = json.dumps(metadata, ensure_ascii=False)
+    return image_summary(
+        image_data_url,
+        context,
+        local=False,
+        prompt=prompts.uploaded_image_summary_prompt,
+    )
+
 def doc_metadata_extractor(filename, content, file_path=None):
     history = prompts.build_prompts(prompts.doc_metadata_extractor_prompt)
     history.append({
diff --git a/service/task_executor/parse_doc.py b/service/task_executor/parse_doc.py
index 5d0111f..3c47063 100644
--- a/service/task_executor/parse_doc.py
+++ b/service/task_executor/parse_doc.py
@@ -20,7 +20,7 @@ import textract
 import shutil  # 添加导入
 from pptx import Presentation  # 添加导入用于处理pptx文件
 import fitz  # PyMuPDF，用于PDF图片截取
-from PIL import Image  # 用于图片处理
+from PIL import Image, ImageOps  # 用于图片处理
 import io
 from html.parser import HTMLParser
 import database
@@ -34,6 +34,7 @@ from xml.etree import ElementTree as ET
 import agents
 from libreoffice_pool import libreoffice_pool, convert_with_retry
 import chardet
+from typing import Optional
 
 os.environ.setdefault("HTTPX_FORCE_HTTP1", "1")
 ocr_balancer = mistral_ocr_balancer.get_shared_balancer()
@@ -64,6 +65,111 @@ def _env_bool(name: str, default: bool = False) -> bool:
     return value.strip().lower() in {"1", "true", "yes", "y", "on"}
 
 
+# Extensions (lower-case, with leading dot) routed to the vision-model image parser.
+IMAGE_PARSE_EXTENSIONS = {
+    ".jpg", ".jpeg",
+    ".png",
+    ".gif",
+    ".bmp",
+    ".webp",
+    ".tif", ".tiff",
+}
+
+
+def _is_image_file(file_path: str) -> bool:
+    """Return True if file_path ends with a known image extension (any case)."""
+    extension = os.path.splitext(file_path or "")[1]
+    return extension.lower() in IMAGE_PARSE_EXTENSIONS
+
+
+def _normalize_image_file(file_path: str, *, max_side: int = 2048, quality: int = 88):
+    """Convert an uploaded image into a JPEG data URL for the vision model.
+
+    Returns:
+        (data_url, info_dict): base64 JPEG data URL plus original/normalized metadata.
+    """
+    with Image.open(file_path) as source:
+        # exif_transpose() returns a new image with format=None; read the format first.
+        source_format = (source.format or "").upper()
+        img = ImageOps.exif_transpose(source)
+        info = {
+            "original_mode": img.mode,
+            "width": img.width,
+            "height": img.height,
+            "format": source_format,
+        }
+
+        if img.mode not in ("RGB", "RGBA", "L"):
+            img = img.convert("RGBA" if "A" in img.getbands() else "RGB")
+
+        if img.mode == "RGBA":
+            # JPEG cannot store alpha: composite onto a white background.
+            background = Image.new("RGB", img.size, (255, 255, 255))
+            background.paste(img, mask=img.getchannel("A"))
+            img = background
+        elif img.mode == "L":
+            img = img.convert("RGB")
+
+        if max(img.width, img.height) > max_side:
+            # Resolve LANCZOS on both new (Image.Resampling) and old Pillow APIs.
+            resampling = getattr(Image, "Resampling", Image)
+            resample = getattr(resampling, "LANCZOS", getattr(Image, "LANCZOS", 1))
+            img.thumbnail((max_side, max_side), resample)
+
+        buffer = io.BytesIO()
+        img.save(buffer, format="JPEG", quality=quality, optimize=True)
+        jpeg_bytes = buffer.getvalue()
+        image_data_url = "data:image/jpeg;base64," + base64.b64encode(jpeg_bytes).decode("utf-8")
+        info.update({
+            "normalized_format": "JPEG",
+            "normalized_width": img.width,
+            "normalized_height": img.height,
+            "normalized_size": len(jpeg_bytes),
+        })
+        return image_data_url, info
+
+
+def _image_content_prompt(file_name: str, image_info: Optional[dict] = None) -> str:
+    """Build a newline-joined text block of file name, format and dimensions."""
+    meta = image_info or {}
+    norm_w, norm_h = meta.get("normalized_width"), meta.get("normalized_height")
+    lines = [
+        f"文件名: {file_name or '未知图片'}",
+        f"原始格式: {meta.get('format') or 'unknown'}",
+        f"原始尺寸: {meta.get('width') or 'unknown'}x{meta.get('height') or 'unknown'}",
+    ]
+    if norm_w and norm_h:  # only reported when both normalized dimensions are present
+        lines.append(f"标准化尺寸: {norm_w}x{norm_h}")
+    return "\n".join(lines)
+
+
+def _extract_image_paragraphs(file_path: str):
+    """Describe an uploaded image via the vision model; return its text as two paragraph lists."""
+    image_data_url, image_info = _normalize_image_file(file_path)
+    file_name = os.path.basename(file_path)
+    raw_title, raw_description = agents.uploaded_image_summary(
+        image_data_url, file_name=file_name, image_info=image_info
+    )
+
+    title = (raw_title or "").strip()
+    description = (raw_description or "").strip()
+    # An empty description is reported as a retryable parse failure.
+    if not description:
+        raise TransientParseError(f"图片视觉模型概括失败（可重试）: {file_path}")
+    # Fall back to the file stem when the title is missing, a data URL, or overly long.
+    if not title or title.startswith("data:") or len(title) > 200:
+        title = os.path.splitext(file_name)[0] or file_name or "image"
+
+    image_text = "\n".join([
+        f"图片文件名: {file_name}",
+        f"图片标题: {title}",
+        f"图片描述: {description}",
+        f"图片格式: JPEG(标准化自 {image_info.get('format') or 'unknown'})",
+        f"图片尺寸: {image_info.get('normalized_width')}x{image_info.get('normalized_height')}",
+    ])
+    return [[image_text]], [[image_text]]
+
+
 def _is_local_ocr_provider(provider: str) -> bool:
     return provider in {"ppocr_vl", "paddleocr_vl"}
 
@@ -2533,6 +2639,8 @@ def get_paragraph_router(file_path, extract_images=True, image_output_dir=None,u
     temp_files_to_cleanup = []
     
     try:
+        if _is_image_file(file_path):
+            return _extract_image_paragraphs(file_path)
         if ext == ".pdf":
             if extract_images:
                 result = perform_pdf_ocr(file_path, save_images=True, output_dir=image_output_dir, user_id=user_id)
diff --git a/service/task_executor/prompts.py b/service/task_executor/prompts.py
index 9fdda41..60c29dd 100644
--- a/service/task_executor/prompts.py
+++ b/service/task_executor/prompts.py
@@ -56,6 +56,28 @@ image_summary_prompt = """
 直接返回一段`json`给我即可
 """
 
+uploaded_image_summary_prompt = """
+你是一个图片解析器，用于把用户直接上传到知识库的图片转成可检索的文本。你会收到一张图片，以及图片文件名和基础元数据。请详细概括图片内容，方便后续RAG检索命中。
+
+#### 要求
+
+1. 优先描述图片中的主体、场景、人物/物体关系、颜色、布局、显著细节。
+2. 如果图片中有文字、表格、图表、截图、流程图、票据或证件信息，请尽量完整转写关键文字，并说明结构和含义。
+3. 如果图片像产品图、设备图、建筑图、报告截图或设计图，请描述可见部件、标注、用途线索和可能的业务语义。
+4. 不要编造看不见的信息；不确定时说明“无法确认”。
+
+#### 返回格式
+
+```json
+{
+  "title": "图片标题",
+  "description": "图片详细内容概括"
+}
+```
+
+直接返回一段`json`给我即可。
+"""
+
 doc_metadata_extractor_prompt = """
 你是一个文档元数据提取器,我会给你一个文档的前几页内容和文档的文件名,你需要帮我提取出文档的一些元信息,包括文档标题、作者、文档类型、文档摘要、与文档关联的日期,你需要直接给我返回`json`格式的内容.
 
@@ -154,6 +176,18 @@ def _should_use_direct_image_url(image_url: str) -> bool:
         return True
     return False
 
+
+def _short_image_log(value: str, max_len: int = 300) -> str:
+    """Return a log-safe form of an image URL; base64 payloads are elided."""
+    if not isinstance(value, str):
+        return str(type(value))
+    header, comma, payload = value.partition(",")
+    if comma and value.startswith("data:"):  # well-formed data URL: keep only the header
+        return f"{header},<base64:{len(payload)} chars>"
+    # Plain URLs and malformed (comma-less) data URLs are capped at max_len.
+    return value if len(value) <= max_len else value[:max_len] + "...<truncated>"
+
+
 def build_prompts_image(prompt, image_url, custom_input="", local=False):
     # 如果 local=True，在 URL 前添加本地服务器前缀
     if local:
@@ -161,7 +195,7 @@ def build_prompts_image(prompt, image_url, custom_input="", local=False):
             # 确保路径以 / 开头
             image_url = f"http://localhost:8081{image_url if image_url.startswith('/') else '/' + image_url}"
             print(image_url)
-    print(f"[build_prompts_image] image_url={image_url}")
+    print(f"[build_prompts_image] image_url={_short_image_log(image_url)}")
 
     if _should_use_direct_image_url(image_url):
         image_payload = image_url
@@ -170,7 +204,7 @@ def build_prompts_image(prompt, image_url, custom_input="", local=False):
         if not image_payload:
             logger.warning("Falling back to raw image URL for %s", image_url)
             image_payload = image_url
-    print(f"[build_prompts_image] image_payload={image_payload}")
+    print(f"[build_prompts_image] image_payload={_short_image_log(image_payload)}")
     return [
         {
             "role": "user",
-- 
2.48.1