import base64
import concurrent.futures
import json
import os
import statistics
import sys
import threading
import time
import urllib.error
import urllib.request

# Endpoint every chunk is POSTed to.
URL = "http://yunwo-ppocr-vl-gpu.tailb014e0.ts.net:5061/v1/ocr"

# One entry per PDF chunk: its index, its page count (used for the
# pages-per-minute figure) and the local file that gets uploaded.
CHUNKS = [
    {"chunk": index, "pages": 5, "path": pdf_path}
    for index, pdf_path in enumerate(
        (
            "/tmp/ppocr_vl_bench_chunks/large_p001_005.pdf",
            "/tmp/ppocr_vl_bench_chunks/large_p006_010.pdf",
            "/tmp/ppocr_vl_bench_chunks/large_p011_015.pdf",
            "/tmp/ppocr_vl_bench_chunks/large_p016_020.pdf",
        )
    )
]

# Size of the client-side thread pool that issues the requests.
CLIENT_WORKERS = 4

# Label copied into every emitted record; taken from the first command-line
# argument and falling back to "unknown" when none is given.
ROUND = (sys.argv[1:2] or ["unknown"])[0]


# emit() is called from the pool's worker threads as well as the main thread.
# Text streams are not thread-safe, so writes are serialised to keep each
# JSON record on its own, unbroken line.
_EMIT_LOCK = threading.Lock()


def emit(payload):
    """Print ``payload`` to stdout as a single JSON line and flush.

    A ``"ts"`` key holding the current local time (``%Y-%m-%dT%H:%M:%S%z``)
    is added when the payload does not already carry one. The dict is
    modified in place, so the caller's object gains the timestamp too.

    Args:
        payload: JSON-serialisable dict describing one event.
    """
    payload.setdefault("ts", time.strftime("%Y-%m-%dT%H:%M:%S%z"))
    # Serialise outside the lock so only the actual write is exclusive.
    line = json.dumps(payload, ensure_ascii=False)
    with _EMIT_LOCK:
        print(line, flush=True)


def ocr_chunk(item):
    """Upload one PDF chunk to the OCR endpoint and record stage timings.

    The file is read, base64-encoded, wrapped in a JSON body and POSTed to
    ``URL``. A ``chunk_start`` record is emitted first, followed by either a
    ``chunk_done`` or a ``chunk_error`` record.

    Every failure after ``item`` has been unpacked, including a missing or
    unreadable chunk file, is reported as a ``chunk_error`` record instead of
    being raised, so one bad chunk cannot abort the whole benchmark run.

    Args:
        item: dict with the keys ``"chunk"``, ``"pages"`` and ``"path"``.

    Returns:
        The dict that was emitted as the final record. ``"ok"`` is ``True``
        for ``chunk_done`` and ``False`` for ``chunk_error``. Successful
        records carry ``encode_sec``, ``json_sec``, ``request_sec`` and
        ``total_sec``; error records carry ``total_sec`` and ``error``, plus
        ``http_status`` (HTTP errors) or ``error_type`` (everything else).
    """
    path = item["path"]
    # Provisional start time, so failures that happen before the timed
    # section begins can still report a ``total_sec``.
    t0 = time.perf_counter()
    try:
        size = os.path.getsize(path)
        emit(
            {
                "event": "chunk_start",
                "round": ROUND,
                "chunk": item["chunk"],
                "pages": item["pages"],
                "size_bytes": size,
                "size_mb": round(size / 1024 / 1024, 3),
                "path": path,
            }
        )
        # The measured window starts only after the start record is written.
        t0 = time.perf_counter()
        with open(path, "rb") as pdf_file:
            encoded = base64.b64encode(pdf_file.read()).decode("ascii")
        t1 = time.perf_counter()
        payload = {
            "filename": os.path.basename(path),
            "file_base64": encoded,
            "restructure": True,
            "timeout_sec": 3600,
        }
        body = json.dumps(payload).encode("utf-8")
        t2 = time.perf_counter()
        req = urllib.request.Request(
            URL,
            data=body,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=3600) as response:
            raw = response.read()
            status = response.status
        t3 = time.perf_counter()
        data = json.loads(raw.decode("utf-8"))
        pages_out = data.get("pages") or []
        # Non-dict page entries are skipped; a missing or null "markdown"
        # counts as zero characters.
        markdown_chars = sum(
            len(page.get("markdown") or "")
            for page in pages_out
            if isinstance(page, dict)
        )
        result = {
            "event": "chunk_done",
            "round": ROUND,
            "chunk": item["chunk"],
            "pages": item["pages"],
            "http_status": status,
            "ok": True,
            "encode_sec": round(t1 - t0, 3),
            "json_sec": round(t2 - t1, 3),
            "request_sec": round(t3 - t2, 3),
            "total_sec": round(t3 - t0, 3),
            "server_elapsed_sec": data.get("elapsed_sec"),
            "usage_elapsed_sec": (data.get("usage_info") or {}).get("elapsed_sec"),
            "pages_returned": len(pages_out),
            "markdown_chars": markdown_chars,
            "model": data.get("model"),
        }
        emit(result)
        return result
    except urllib.error.HTTPError as exc:
        t3 = time.perf_counter()
        # Keep at most the first 2000 characters of the error body.
        err = exc.read().decode("utf-8", errors="ignore")[:2000]
        result = {
            "event": "chunk_error",
            "round": ROUND,
            "chunk": item["chunk"],
            "pages": item["pages"],
            "ok": False,
            "http_status": exc.code,
            "total_sec": round(t3 - t0, 3),
            "error": err,
        }
        emit(result)
        return result
    except Exception as exc:
        # Worker boundary: anything else (I/O, network, bad JSON, ...) is
        # turned into an error record rather than propagated.
        t3 = time.perf_counter()
        result = {
            "event": "chunk_error",
            "round": ROUND,
            "chunk": item["chunk"],
            "pages": item["pages"],
            "ok": False,
            "total_sec": round(t3 - t0, 3),
            "error_type": type(exc).__name__,
            "error": str(exc)[:2000],
        }
        emit(result)
        return result


# Benchmark driver: send all chunks through the thread pool, then emit one
# summary record and exit with status 2 if any chunk failed (0 otherwise).
started = time.perf_counter()
emit(
    {
        "event": "bench_start",
        "round": ROUND,
        "client_workers": CLIENT_WORKERS,
        "chunks": len(CHUNKS),
        "total_pages": sum(item["pages"] for item in CHUNKS),
    }
)
with concurrent.futures.ThreadPoolExecutor(max_workers=CLIENT_WORKERS) as pool:
    pending = [pool.submit(ocr_chunk, chunk_item) for chunk_item in CHUNKS]
    # Gather outcomes in completion order, not submission order.
    results = [done.result() for done in concurrent.futures.as_completed(pending)]
wall_sec = time.perf_counter() - started

# Split the outcomes on their "ok" flag in a single pass.
success, failures = [], []
for outcome in results:
    (success if outcome.get("ok") else failures).append(outcome)

request_times = [outcome["request_sec"] for outcome in success]
# Only pages from successful chunks count towards throughput.
total_pages = sum(outcome.get("pages", 0) for outcome in success)

if request_times:
    avg_request = round(statistics.mean(request_times), 3)
    max_request = round(max(request_times), 3)
    min_request = round(min(request_times), 3)
else:
    # No successful request means there is nothing to aggregate.
    avg_request = max_request = min_request = None

emit(
    {
        "event": "bench_done",
        "round": ROUND,
        "wall_sec": round(wall_sec, 3),
        "success_count": len(success),
        "failure_count": len(failures),
        "total_pages": total_pages,
        "pages_per_min": round(total_pages / wall_sec * 60, 3) if wall_sec else None,
        "avg_request_sec": avg_request,
        "max_request_sec": max_request,
        "min_request_sec": min_request,
    }
)
sys.exit(2 if failures else 0)
