import json
import pathlib
import re
import subprocess
import sys
from collections import Counter
# Diagnostic script: list failed "文档解析" (document parsing) tasks updated at
# or after WINDOW, look up log context for each one, and sort every failure
# into a coarse cause bucket based on marker strings found in that context.
#
# Usage: script.py [WINDOW [LOG_PATH]]
#   WINDOW   - lower bound for tasks.updated_at, passed to `timestamptz '...'`.
#   LOG_PATH - executor log file to search; defaults to the 2026-06-13 log.
#              NOTE(review): the default log file is tied to one date; when
#              WINDOW covers another day a matching LOG_PATH is presumably
#              needed - confirm against the log rotation scheme.
WINDOW = sys.argv[1] if len(sys.argv) > 1 else "2026-06-13 19:40:00+08"
LOG_PATH = (
    sys.argv[2]
    if len(sys.argv) > 2
    else '/root/knowledge/service/task_executor/logs/task_2026-06-13.log'
)

# psql is asked to use these control characters as field / record separators
# so that free-text columns can be split apart again afterwards.
FIELD_SEP = '\x1f'
RECORD_SEP = '\x1e'

# Captures the rest of the line following one of the two "retryable OCR
# failure" prefixes in task_result.
RELPATH_RE = re.compile(
    r'(?:PDF OCR失败（可重试）|PPTX转PDF后OCR失败（可重试）):\s*([^\n\r]+)'
)

# Ordered (bucket, markers) table: the first bucket with any marker present in
# the collected log context wins, so the order here is the priority order.
CONTEXT_BUCKETS = (
    ('源PDF打不开/损坏/切分失败',
     ('Failed to open file', 'FileDataError', 'Failed to cut the PDF file')),
    ('旧fallback切块OCR失败',
     ('OCR processing failed after', '个PDF块处理失败')),
    ('DeepSeek限流相关',
     ('TPM limit reached', 'DeepSeek-OCR rate limited', '429 Too Many Requests')),
    ('网络/超时相关',
     ('Network is unreachable', 'timed out', 'timeout', 'network error')),
)


def build_query(window):
    """Return the SQL selecting failed parsing tasks updated since *window*.

    Single quotes in *window* are doubled so a stray quote in the command-line
    argument cannot terminate the SQL string literal. (psql ``-c`` does not
    interpolate psql variables, so quoting has to happen here.)
    NOTE(review): this relies on standard_conforming_strings being on, which
    is the PostgreSQL default - confirm for this server.
    """
    safe_window = window.replace("'", "''")
    return f"""
select id, task_arguments, coalesce(task_result,'')
from tasks
where task_type='文档解析'
  and deleted_at is null
  and task_status='failed'
  and updated_at >= timestamptz '{safe_window}'
order by updated_at;
"""


query = build_query(WINDOW)


def fetch_failed_tasks(sql):
    """Run *sql* through psql inside the ``postgres_db`` container.

    Returns the raw unaligned, tuples-only output as text. Raises
    ``subprocess.CalledProcessError`` when the command exits non-zero.
    NOTE(review): decoding uses the locale default encoding with undecodable
    bytes dropped; confirm the environment is UTF-8.
    """
    return subprocess.check_output([
        'docker', 'exec', 'postgres_db', 'psql', '-U', 'knowledge', '-d', 'yunwoai',
        '-F', FIELD_SEP, '-R', RECORD_SEP, '-Atc', sql
    ], text=True, errors='ignore')


def parse_rows(raw):
    """Split raw psql output into ``(id, task_arguments, task_result)`` tuples.

    Blank records and records that do not split into exactly three fields are
    skipped. Only the first two field separators are honoured so that
    task_result may itself contain the separator character.
    """
    rows = []
    for record in raw.split(RECORD_SEP):
        if not record.strip():
            continue
        fields = record.split(FIELD_SEP, 2)
        if len(fields) != 3:
            continue
        rows.append(tuple(fields))
    return rows


def extract_filename(args):
    """Return ``file_name`` from the JSON text *args* as a string.

    Returns ``''`` when *args* is not valid JSON, is not a JSON object, or has
    a missing / null ``file_name``. Non-string values are converted with
    ``str`` so the caller can always slice the result.
    """
    try:
        payload = json.loads(args)
    except (ValueError, RecursionError):
        return ''
    if not isinstance(payload, dict):
        return ''
    name = payload.get('file_name')
    if name is None:
        return ''
    return name if isinstance(name, str) else str(name)


def extract_relpath(result):
    """Return the path reported after an OCR-failure prefix, or ``''``."""
    found = RELPATH_RE.search(result)
    return found.group(1).strip() if found else ''


def collect_context(log, tid, base):
    """Gather log excerpts relevant to one task, joined with newlines.

    Takes a window (5000 chars before, 3000 after) around each of the first
    four occurrences of the file basename *base*, plus a window (10000 before,
    2000 after) around the last ``[任务失败] 任务 <tid>`` marker.
    NOTE(review): the marker search is a plain substring match, so an id that
    is a prefix of another id could match the wrong entry - verify id format.
    """
    snippets = []
    if base:
        for hit in re.finditer(re.escape(base), log):
            snippets.append(log[max(0, hit.start() - 5000): hit.end() + 3000])
            if len(snippets) >= 4:
                break
    marker_at = log.rfind(f'[任务失败] 任务 {tid}')
    if marker_at >= 0:
        snippets.append(log[max(0, marker_at - 10000): marker_at + 2000])
    return '\n'.join(snippets)


def classify(ctx, result):
    """Map log context *ctx* and task result text *result* to a bucket name.

    Log-context markers are checked first, in table order; the PPTX bucket is
    decided from *result* only when no context marker matched.
    """
    for bucket, markers in CONTEXT_BUCKETS:
        if any(marker in ctx for marker in markers):
            return bucket
    if 'PPTX转PDF后OCR失败' in result:
        return 'PPTX转PDF后OCR失败'
    return '未知/日志不足'


def analyze(rows, log):
    """Classify every row against *log*.

    Returns ``(counts, items)`` where *counts* is a ``Counter`` of bucket
    names and *items* is a list of ``(id, bucket, filename, relpath)`` tuples
    in the same order as *rows*.
    """
    counts = Counter()
    items = []
    for tid, args, result in rows:
        filename = extract_filename(args)
        relpath = extract_relpath(result)
        base = relpath.rsplit('/', 1)[-1] if relpath else ''
        bucket = classify(collect_context(log, tid, base), result)
        counts[bucket] += 1
        items.append((tid, bucket, filename, relpath))
    return counts, items


def main():
    """Fetch the failed tasks, classify them and print the report to stdout."""
    rows = parse_rows(fetch_failed_tasks(query))
    log = pathlib.Path(LOG_PATH).read_text(errors='ignore')
    counts, items = analyze(rows, log)
    print('WINDOW', WINDOW)
    print('TOTAL', len(rows))
    # Most frequent bucket first; ties are broken by bucket name.
    for bucket, total in sorted(counts.items(), key=lambda kv: (-kv[1], kv[0])):
        print('COUNT', bucket, total)
    print('ITEMS')
    for tid, bucket, filename, relpath in items:
        print(f'{tid}|{bucket}|{filename[:80]}|{relpath}')


if __name__ == '__main__':
    main()
