import argparse
import os
import sys
import time

import agents
import database
import milvusdb


def build_full_content(doc_path, max_chars=20000):
    """Return the document's text as one newline-joined string, capped in length.

    The document is parsed with ``parse_doc.get_paragraph_router`` (image
    extraction disabled). Only the second element of its result, a collection
    of text groups, is used; the first element is ignored.

    Args:
        doc_path: Path of the document handed to the parser.
        max_chars: Maximum number of characters kept before truncation.

    Returns:
        The non-blank text pieces joined with ``"\\n"``. When the joined text
        is longer than ``max_chars`` it is cut to ``max_chars`` characters and
        ``"..."`` is appended.
    """
    # Imported at call time rather than at module import, as in the original.
    import parse_doc

    _, text_groups = parse_doc.get_paragraph_router(doc_path, extract_images=False)

    # Falsy groups and empty / whitespace-only pieces are skipped; kept pieces
    # are not stripped.
    pieces = [
        piece
        for group in (text_groups or [])
        for piece in (group or [])
        if piece and piece.strip()
    ]
    joined = "\n".join(pieces)
    if len(joined) <= max_chars:
        return joined
    return joined[:max_chars] + "..."


def backfill_doc(doc_id: int, sleep_sec: float = 0.0) -> tuple[bool, str]:
    """Extract metadata for one document and upload it with ``milvusdb.add_doc_metadata``.

    Steps: look up the document record, check that its file exists on disk,
    build the text content, run ``agents.doc_metadata_extractor`` on it, and
    upload the extracted fields together with ownership/folder information
    taken from the record.

    Args:
        doc_id: Id passed to ``database.get_doc_info``.
        sleep_sec: Seconds to sleep after a successful upload; values ``<= 0``
            disable the sleep.

    Returns:
        ``(True, message)`` when the upload result is a dict whose
        ``"success"`` value is ``True``; otherwise ``(False, message)`` where
        the message names the step that failed.

    Raises:
        Exceptions raised by the parser, the extractor or the upload call are
        not caught here and propagate to the caller.
    """
    doc_info = database.get_doc_info(doc_id)
    if not doc_info:
        return False, f"doc {doc_id} info not found"

    # A missing or empty path is reported like a missing file instead of
    # raising KeyError / TypeError (os.path.exists rejects None).
    file_path = doc_info.get("file_path")
    if not file_path or not os.path.exists(file_path):
        return False, f"doc {doc_id} file not found: {file_path}"

    # int(None) would raise at upload time; report it as a failure, and do so
    # before the extractor is invoked so no work is wasted.
    folder_id = doc_info.get("folder_id")
    if folder_id is None:
        return False, f"doc {doc_id} has no folder_id"

    fallback_name = f"doc-{doc_id}"
    full_content = build_full_content(file_path)
    metadata = agents.doc_metadata_extractor(
        doc_info.get("file_name") or doc_info.get("doc_name") or fallback_name,
        full_content,
        file_path=file_path,
    )
    if not metadata:
        return False, f"doc {doc_id} metadata extractor returned empty result"

    # Best effort: the size is informational, so an unreadable file is
    # recorded with size 0 rather than failing the backfill.
    try:
        file_size = os.path.getsize(file_path)
    except OSError:
        file_size = 0

    # ``or`` fallbacks also cover keys that are present but None, matching how
    # owner_id and the name are already handled.
    result = milvusdb.add_doc_metadata(
        doc_id=int(doc_info["id"]),
        name=metadata.get("doc_name") or doc_info.get("doc_name") or doc_info.get("file_name") or fallback_name,
        authors=metadata.get("authors") or [],
        doc_type=metadata.get("doc_type") or "",
        size=int(file_size),
        related_timestamp=metadata.get("related_timestamp") or 0,
        abstract=metadata.get("abstract") or "",
        owner_type=doc_info.get("type") or "个人",
        owner_id=int(doc_info.get("owner_id", 0) or 0),
        folder_ids=[int(folder_id)],
    )
    if not isinstance(result, dict) or result.get("success") is not True:
        return False, f"doc {doc_id} metadata upload failed: {result}"

    if sleep_sec > 0:
        time.sleep(sleep_sec)
    return True, f"doc {doc_id} metadata backfilled"


def main(argv=None):
    """Command-line entry point: backfill metadata for each ``--doc-id`` given.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behaviour of calling
            ``main()`` with no arguments.

    Returns:
        ``2`` when no doc id was supplied, ``1`` when at least one document
        failed, ``0`` when every document was backfilled.
    """
    parser = argparse.ArgumentParser(description="Backfill missing document metadata into Milvus")
    parser.add_argument("--doc-id", type=int, action="append", dest="doc_ids", help="Specific doc id(s) to backfill")
    parser.add_argument("--sleep-sec", type=float, default=0.0, help="Sleep between docs")
    args = parser.parse_args(argv)

    if not args.doc_ids:
        print("No doc ids provided", file=sys.stderr)
        return 2

    failed = 0
    for doc_id in args.doc_ids:
        # One broken document must not abort the rest of the batch, so an
        # unexpected error is reported and counted like any other failure.
        # KeyboardInterrupt / SystemExit are not Exception subclasses and
        # still stop the run.
        try:
            ok, message = backfill_doc(doc_id, sleep_sec=args.sleep_sec)
        except Exception as exc:
            ok = False
            message = f"doc {doc_id} backfill raised {type(exc).__name__}: {exc}"
        print(message)
        if not ok:
            failed += 1
    return 1 if failed else 0


if __name__ == "__main__":
    # Run only when executed as a script; main()'s integer result becomes the
    # process exit status.
    sys.exit(main())
