Initial commit
This commit is contained in:
116
backend/planner/planning_agent/input_pipeline.py
Normal file
116
backend/planner/planning_agent/input_pipeline.py
Normal file
@@ -0,0 +1,116 @@
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
from planner.mineru_client import MinerUClient
|
||||
import dotenv
|
||||
|
||||
# Location of the ".env" file: two directory levels above the folder that
# contains this module.
_ENV_PATH = Path(__file__).resolve().parents[2] / ".env"

# Load variables from that file into the process environment at import time.
# override=False: variables already present in the environment are not replaced.
dotenv.load_dotenv(dotenv_path=_ENV_PATH, override=False)
|
||||
|
||||
# File extensions (lower case, leading dot included) grouped by category.

# Plain-text formats.
TEXT_EXTS = {
    ".txt",
    ".md",
    ".markdown",
}

# Office / document formats.
DOC_EXTS = {
    ".pdf",
    ".docx",
    ".doc",
    ".xlsx",
    ".xls",
    ".pptx",
}

# Raster image formats.
IMAGE_EXTS = {
    ".png",
    ".jpg",
    ".jpeg",
    ".bmp",
    ".tif",
    ".tiff",
    ".webp",
}
|
||||
|
||||
|
||||
def _get_mineru_token() -> str:
    """Return the MinerU API token read from the environment.

    ``MinerU_API`` is consulted first, then ``MinerU_API_KEY``; the first
    non-empty value wins. When neither is set but ``MinerU_URL`` is, an empty
    string is returned instead of failing.

    Returns:
        The token, or ``""`` when only ``MinerU_URL`` is configured.

    Raises:
        ValueError: If no token variable and no ``MinerU_URL`` is set.
    """
    for var_name in ("MinerU_API", "MinerU_API_KEY"):
        value = os.getenv(var_name)
        if value:
            return value

    if not os.getenv("MinerU_URL"):
        raise ValueError("MinerU_API 环境变量未设置")
    return ""
|
||||
|
||||
|
||||
def _read_text_file(file_path: Path) -> str:
    """Read a UTF-8 text file and return its contents without outer whitespace.

    The file is decoded with ``utf-8-sig`` so that a leading byte-order mark
    is dropped. ``str.strip()`` does not treat U+FEFF as whitespace, so with
    plain ``utf-8`` a BOM would stay glued to the first character and a
    leading ``#`` heading would no longer be recognised. Files without a BOM
    decode exactly as they do with ``utf-8``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text with leading and trailing whitespace removed.
    """
    with open(file_path, "r", encoding="utf-8-sig") as f:
        return f.read().strip()
|
||||
|
||||
|
||||
def _extract_title_and_content(text: str, fallback_title: str) -> Dict[str, str]:
    """Split *text* into a title and the remaining content.

    The first non-blank line supplies the title:

    * If it starts with ``#``, the title is that line without the leading
      ``#`` characters (or *fallback_title* if nothing is left), and the
      content is everything after that line, stripped, with its inner
      formatting preserved. If nothing follows the heading, the whole
      original *text* is returned as content.
    * Otherwise the line itself is the title and the content is the remaining
      non-blank lines, each stripped, joined with ``"\\n"``.

    Args:
        text: Source text; may be empty.
        fallback_title: Title used when *text* provides none.

    Returns:
        A dict with the keys ``"title"`` and ``"content"``.
    """
    if not text:
        return {"title": fallback_title, "content": ""}

    raw_lines = text.splitlines(keepends=True)
    first_idx = next((i for i, ln in enumerate(raw_lines) if ln.strip()), None)
    if first_idx is None:
        # Whitespace only: there is no title line; hand the text back untouched.
        return {"title": fallback_title, "content": text}

    first = raw_lines[first_idx].strip()
    following = raw_lines[first_idx + 1:]

    if first.startswith("#"):
        title = first.lstrip("#").strip() or fallback_title
        # Slice by line position rather than searching for the next line's
        # text: a substring search can match inside the heading itself
        # (e.g. "# Hello World" followed by "World").
        content = "".join(following).strip()
        return {"title": title, "content": content or text}

    content = "\n".join(ln.strip() for ln in following if ln.strip())
    return {"title": first, "content": content}
|
||||
|
||||
|
||||
def _ensure_markdown(text: str, title: str) -> str:
    """Return *text* with a leading ``# title`` heading when it lacks one.

    Args:
        text: Text to check; may be empty.
        title: Heading text to insert when needed.

    Returns:
        * ``"# <title>\\n"`` when *text* is empty;
        * *text* unchanged when its first non-whitespace character is ``#``;
        * otherwise the heading, a blank line, and then *text*.
    """
    heading = f"# {title}\n"
    if not text:
        return heading

    already_has_heading = text.lstrip().startswith("#")
    return text if already_has_heading else f"{heading}\n{text}"
|
||||
|
||||
|
||||
def parse_intent_file(input_path: str, save_dir: str = "planner\\mineru_result") -> Dict[str, Any]:
    """Parse an imported intent file into a title/content record.

    * Extensions in ``TEXT_EXTS`` are read directly from disk.
    * ``.pdf`` and extensions in ``IMAGE_EXTS`` are sent to MinerU. When the
      ``MinerU_URL`` environment variable is set, the Markdown is taken from
      the parse response; otherwise the result is downloaded below *save_dir*
      and read from there.

    Args:
        input_path: Path of the file to parse.
        save_dir: Directory under which downloaded MinerU results are stored,
            in a sub-folder named after the input file's stem.
            NOTE(review): the default uses a Windows-style backslash; on POSIX
            it is a single directory name containing a backslash. Confirm
            this is intended.

    Returns:
        A dict with the keys ``"source"`` (the input path as a string),
        ``"type"`` (``"text"`` or ``"mineru"``), ``"title"``, ``"content"``
        and ``"raw_result"`` (``None`` for text files, otherwise the value
        returned by ``MinerUClient.parse_file``).

    Raises:
        FileNotFoundError: If *input_path* is not an existing regular file.
        ValueError: If the extension is unsupported, or if no MinerU
            token/URL is configured.
    """
    path = Path(input_path)
    if not path.is_file():  # also False when the path does not exist
        raise FileNotFoundError(f"输入文件不存在: {input_path}")

    suffix = path.suffix.lower()
    file_title = path.stem

    if suffix in TEXT_EXTS:
        parsed = _extract_title_and_content(_read_text_file(path), file_title)
        return {"source": str(path), "type": "text", **parsed, "raw_result": None}

    if suffix not in IMAGE_EXTS and suffix != ".pdf":
        raise ValueError(f"不支持的文件类型: {path.name}")

    client = MinerUClient(token=_get_mineru_token())
    result_json = client.parse_file(str(path))

    if os.getenv("MinerU_URL"):
        # Results are keyed by the file stem. Tolerate "results" or the
        # per-file entry being absent or null instead of raising
        # AttributeError.
        results = (result_json or {}).get("results") or {}
        entry = results.get(file_title) or {}
        content = entry.get("md_content") or ""
    else:
        folder = client.download_result(save_dir=os.path.join(save_dir, file_title))
        content = _load_downloaded_text(Path(folder)) or ""

    if suffix == ".pdf":
        # Only PDFs get a heading forced onto them; image results are passed
        # through as returned.
        content = _ensure_markdown(content, file_title)

    parsed = _extract_title_and_content(content, file_title)
    return {
        "source": str(path),
        "type": "mineru",
        **parsed,
        "raw_result": result_json,
    }


def _load_downloaded_text(folder: Path) -> Optional[str]:
    """Return the text of the best candidate file in a downloaded result folder.

    ``full.md`` is preferred, then ``merged_text.txt``. Failing both, the
    first ``.md``/``.txt`` file found anywhere below *folder* is used, taken
    in sorted path order so the choice is reproducible. Returns ``None`` when
    no candidate exists.
    """
    for name in ("full.md", "merged_text.txt"):
        candidate = folder / name
        if candidate.exists():
            return _read_text_file(candidate)

    fallbacks = sorted(
        p
        for p in folder.rglob("*.*")
        if p.is_file() and p.suffix.lower() in (".md", ".txt")
    )
    return _read_text_file(fallbacks[0]) if fallbacks else None
|
||||
Reference in New Issue
Block a user