o
    -j                     @  sP  d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlZddlmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZmZmZ ddlZddlZddlmZm Z  G d	d
 d
e!Z"eG dd dZ#eG dd dZ$eG dd dZ%eG dd dZ&g dZ'ddddZ(dddZ)dd d!Z*dd$d%Z+dd&d'Z,dd+d,Z-ddd/d0Z.dd4d5Z/d6Z0d7Z1d8Z2h d9Z3dd;d<Z4dd@dAZ5ddCdDZ6ddIdJZ7ddOdPZ8ddSdTZ9ddVdWZ:ddZd[Z;dd\d]Z<dd^d_Z=dd`daZ>ddbdcZ?ddfdgZ@ddidjZAddldmZBddndoZCddqdrZDddudvZEddwdxZFddd|d}ZGdddZHdddZIdddZJdddZKdddddZLdddZMdddZNdddZOdddZPdddZQdddZRdddZSdddddZTeUdejVZWdddZXdddZYdddńZZG ddǄ dǃZ[G ddɄ dɃZ\ddddddʜddd҄Z]dS (   am  
High-throughput SiliconFlow DeepSeek-OCR SDK.

The public shape intentionally mirrors the Mistral OCR SDK enough for the task
executor:

    client = SiliconFlowDeepSeekOCRClient(api_key="sk-...", base_url="https://api.siliconflow.cn/v1")
    response = client.ocr.process(
        model="deepseek-ai/DeepSeek-OCR",
        document={"type": "local_file", "file_path": "/path/to.pdf"},
    )
    for page in response.pages:
        print(page.index, page.markdown)

Unlike Mistral OCR, this client reads local PDFs directly, renders pages to
compressed JPEG, calls SiliconFlow per page, and merges page results locally.
    )annotationsN)ThreadPoolExecutoras_completed)	dataclassfield)Path)AnyCallableDictListOptionalTuple)Image	ImageStatc                   @  s   e Zd ZdS )SiliconFlowDeepSeekOCRErrorN)__name__
__module____qualname__ r   r   ./tmp/siliconflow_deepseek_ocr_sdk_candidate.pyr   *   s    r   c                   @  s&   e Zd ZU ded< ded< ded< dS )OCRPageDimensionsintwidthheightdpiN)r   r   r   __annotations__r   r   r   r   r   .   s   
 r   c                   @  sf   e Zd ZU ded< ded< ded< ded< ded< dZd	ed
< dZded< dZded< dZded< dS )OCRImagestridfloat
top_left_x
top_left_ybottom_right_xbottom_right_yNOptional[str]image_base64imagetypepdfsourceTboolsave)r   r   r   r   r%   r'   r)   r+   r   r   r   r   r   5   s   
 r   c                   @  sf   e Zd ZU ded< ded< eedZded< dZd	ed
< eedZded< dZ	ded< dZ
ded< dS )OCRPager   indexr   markdowndefault_factoryz	List[Any]imagesNzOptional[OCRPageDimensions]
dimensionsList[Dict[str, Any]]blocks raw_markdownOptional[Dict[str, Any]]usage)r   r   r   r   r   listr1   r2   r4   r6   r8   r   r   r   r   r,   B   s   
 r,   c                   @  s0   e Zd ZU ded< ded< eedZded< dS )	OCRResponsezList[OCRPage]pagesr   modelr/   Dict[str, Any]
usage_infoN)r   r   r   r   r   dictr>   r   r   r   r   r:   M   s   
 r:   )
$DEFAULT_DEEPSEEK_OCR_FALLBACK_PROMPTDEFAULT_DEEPSEEK_OCR_PROMPT&DEFAULT_DEEPSEEK_OCR_TRANSCRIBE_PROMPTr,   r   r   r:   SiliconFlowDeepSeekOCRClientr    run_siliconflow_deepseek_pdf_ocrFnamer   defaultr*   returnc                 C  s&   t | }|d u r|S |  dv S )N>   1ZyesytrueZon)osgetenvstriplowerrE   rF   valuer   r   r   	_env_boolb   s   
rQ   r   c                 C  D   t | }|d u s| dkr|S zt|W S  ty!   | Y S w Nr5   )rK   rL   rM   r   
ValueErrorrO   r   r   r   _env_inti      

rU   r   c                 C  rR   rS   )rK   rL   rM   r   rT   rO   r   r   r   
_env_floats   rV   rW   status_codebodyc                 C  s4   |pd  }| dkpd|v pd|v pd|v pd|v S )Nr5   i  z
rate limitztoo manyz	tpm limitz	rpm limit)rN   )rX   rY   Z
body_lowerr   r   r   _is_rate_limit_error}   s   rZ   c                 C  s   t | |rdS | dv S )NT>   i  i  i  i
  i  i  i  i  i  i  i  )rZ   )rX   rY   r   r   r   _is_retryable_http_error   s   
r[   headershttpx.HeadersOptional[float]c                 C  s<   |  d}|s	d S z
tdt| W S  ty   Y d S w )Nzretry-after        )getmaxr   rM   rT   )r\   retry_afterr   r   r   _retry_after_seconds   s   
rc   explicitr$   c                 C  sH   | r
|   r
|   S dD ]}t|}|r|  r|    S qtd)N)Z SILICONFLOW_DEEPSEEK_OCR_API_KEYZSILICONFLOW_OCR_API_KEYZSILICONFLOW_API_KEYzDSILICONFLOW_API_KEY or SILICONFLOW_DEEPSEEK_OCR_API_KEY is required.)rM   rK   rL   r   )rd   rE   rP   r   r   r   _api_key   s   
re   rawbytesr   c                 C  s   t jddd\}}z%t|d}||  W d    n1 s w   Y  W t|S W t|S  tyZ   zt| W n	 tyE   Y nw z
t|j	dd W   tyY   Y  w w )NZdeepseek_ocr_z.pdf)prefixsuffixwbTZ
missing_ok)
tempfileZmkstemprK   fdopenwrite	ExceptioncloseOSErrorr   unlink)rf   fdrE   fpr   r   r   _write_temp_pdf   s,   ru   z=Convert this document page to markdown. Return markdown only.zR<image>
<|grounding|>Convert this document page to markdown. Return markdown only.zb<image>
Transcribe all visible text in this image exactly. Preserve line breaks. Return text only.>	   ZfigureZgraphZdiagramZphotoZlogoZpicturer&   ZimgZchartpdf_pathc                 C  sb   d }z$t t| }|jr|dstd|  t|W |d ur&|  S S |d ur0|  w w )Nr5   PDF requires a password: )fitzopenr   
needs_passauthenticater   lenrp   )rv   docr   r   r   _page_count   s   


r~   r&   Image.Imagemax_edgec                 C  s   |dkr| S t | j| j}||kr| S |t| }t dt| j| t dt| j| f}ttdt}| ||j}| 	  |S )Nr      
Resampling)
ra   r   r   r   r   getattrr   ZresizeLANCZOSrp   )r&   r   Zlargest_edgeZscaleZnew_size
resamplingZresizedr   r   r   _resize_to_edge   s   (r   qualityc                 C  s"   t  }| j|d|dd | S )NJPEGTformatr   optimize)ioBytesIOr+   getvalue)r&   r   bufferr   r   r   
_save_jpeg   s   r   
page_indexr   	max_bytes/Tuple[bytes, OCRPageDimensions, Dict[str, Any]]c                C  s  d }d }d }zt t| }|jr|dstd|  || }	|	jt |d |d dd}tt	
|dd}t||}t||}
|}t|
|kr|dks[t|jd	kr|dkrd|d
8 }nt|td	tt|jd }t||}
t|
|kr|dks[t|jd	ks[t|j|j|d}|t|
|d}|
||fW |d urz|  W n	 ty   Y nw d }|d ur|  S S |d urz|  W n	 ty   Y nw d }|d ur|  w w )Nr5   rw         R@F)matrixalphapngRGB7   i     333333?)r   r   r   )jpeg_qualityZ	byte_sizer   )rx   ry   r   rz   r{   r   
get_pixmapMatrixr   r   r   tobytesconvertr   r   r|   ra   sizer   r   r   r   rp   ro   )rv   r   r   r   r   r   r}   pixr&   pagejpegZcurrent_qualityr2   metar   r   r   _render_page_jpeg   sZ   	

"

"


r   rect	fitz.Rect	page_rectOptional[fitz.Rect]c                 C  sT   t t|j| jt|j| jt|j| jt|j| j}|jdks&|j	dkr(d S |S )Nr   )
rx   Rectra   x0y0minx1y1r   r   )r   r   clippedr   r   r   _clip_rect_to_page"  s   r   r2   List[float]c              	   C  s   |j t|j d }|jt|jd }tdt|j | j|j | tdt|j| j|j | tdt|j | j|j | tdt|j| j|j | gS )Nư>r_   )r   ra   r   r   r   r   r   r   )r   r   r2   width_scaleheight_scaler   r   r   _scale_rect_to_rendered.  s   r   bboxc           
      C  s&  t | dk rd S dd | d d D \}}}}||k r ||}}||k r)||}}tt|t|t|t|dkr`|jdksD|jdkr`||j d }||j d }||j d }||j d }|jt|jd }|jt|jd }t|j||  |j||  |j||  |j||  }	t	|	|S )N   c                 S     g | ]}t |qS r   )r   ).0vr   r   r   
<listcomp>@      z%_bbox_to_pdf_rect.<locals>.<listcomp>  g     @@r   )
r|   ra   absr   r   rx   r   r   r   r   )
r   r   r2   r   r   r   r   r   r   r   r   r   r   _bbox_to_pdf_rect9  s,   

"
r   abc                 C  s   t | j|j}t | j|j}t| j|j}t| j|j}||ks$||kr&dS || ||  }t | j| j |j|j  | d}|| S )Nr_   r   )ra   r   r   r   r   r   r   r   )r   r   r   r   r   r   Zinterunionr   r   r   	_rect_iou]  s   "r   c                 C  s   t d| jt d| j S Nr_   )ra   r   r   r   r   r   r   
_rect_areai  s   r   c                 C  s\   t | j|j}t | j|j}t| j|j}t| j|j}||ks$||kr&dS || ||  S r   )ra   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   _rect_intersection_aream  s   r   c                 C  s,   t t| t|}|dkrdS t| || S )Nr   r_   )r   r   r   )r   r   Zsmallerr   r   r   _rect_smaller_overlap_ratiow  s   r   c                 C  s8   t t| j|jt| j|jt| j|jt| j|jS N)rx   r   r   r   r   ra   r   r   )r   r   r   r   r   _rect_union~  s   r   source_asource_bc                 C  sV   g }| |fD ]}t |pddD ]}| }|r"||vr"|| qqd|p*dS )Nr5   +merged)r   splitrM   appendjoin)r   r   partsr)   partr   r   r   _merge_sources  s   
r   /Tuple[float, float, float, float, float, float]c              	   C  s8   t | ||\}}}}||||td|| td|| fS r   )r   ra   )r   r   r2   r   r   r   r   r   r   r   _rendered_rect_values  s   $r   =Tuple[float, float, float, float, float, float, float, float]c                 C  s   t | ||\}}}}}}	t |||\}
}}}}}tdt|
| || }tdt|| || }tdt||t||
 }tdt||t|| }||||||	||fS r   )r   ra   r   )r   r   r   r2   Zax0Zay0Zax1Zay1awahZbx0Zby0Zbx1Zby1bwbhhorizontal_gapvertical_gap	overlap_x	overlap_yr   r   r   _rendered_gap_and_overlap  s   r   c                 C  s   t | ||\}}}}}}tdt|j|j }	|| |	 }
|tdt|j }|tdt|j }tdd}|
|koA|dkoA|dkS )N      ?Z3SILICONFLOW_DEEPSEEK_OCR_FULL_PAGE_IMAGE_AREA_RATIOg=
ףp=?g(\?)r   ra   r   r   r   rW   )r   r   r2   _x0_y0_x1_y1r   r   	page_area
area_ratiowidth_ratioZheight_ratioZ	thresholdr   r   r   _is_near_full_page_rect  s   
r   gap_limit_pxc                 C  s  t | ||}t |||}t| |}	||kr|	dkrdS |	dkr!dS t| |dkr*dS t| |||\}
}}}}}}}tdt||}tdt||}|| }|| }|
|kr[|dkr[dS ||kre|dkredS d|v smd|v r|
|kr||kr|d	ks}|d	krdS dS )
Ng?Fgףp=
?TgQ?r   gzG?deepseek_blockg
ףp=
?)r   r   r   r   ra   r   )r   r   r   r   r   r2   r   Za_fullZb_fullZsmaller_overlapr   r   r   r   r   r   r   r   Z	min_widthZ
min_heightZvertical_overlap_ratioZhorizontal_overlap_ratior   r   r   _should_merge_image_rects  s4   	
r   
candidatesList[Tuple[fitz.Rect, str]]c              
   C  s   t | dkr| S tdtdd}t| }d}|rxd}g }dgt | }t|D ]L\}\}	}
|| r2q'|	}|
}d||< t|d t |D ](}|| rJqC|| \}}t|||||||s[qCt||}t||}d||< d}qC|	||f q'|}|st
|dd d	S )
Nr   r_   Z+SILICONFLOW_DEEPSEEK_OCR_IMAGE_MERGE_GAP_PX      (@TFc                 S  s$   | d j | d j| d j| d jfS )Nr   r   r   r   r   )itemr   r   r   <lambda>  s   $ z._merge_image_rect_candidates.<locals>.<lambda>key)r|   ra   rW   r9   	enumerateranger   r   r   r   sorted)r   r   r2   r   r   Zchangedresultusedr-   r   r)   Zcurrent_rectZcurrent_sourceZother_indexZ
other_rectZother_sourcer   r   r   _merge_image_rect_candidates  sJ   	

#r   c                 C  s   t dtdd}|dkr| S t| ||\}}}}}}	|dks"|	dkr$| S t|t dt||	d }
|
|j t dt|j }|
|j t dt|j }t| j	| | j
| | j| | j| }t||pe| S )Nr_   Z.SILICONFLOW_DEEPSEEK_OCR_IMAGE_CROP_PADDING_PXr   r   g       @g?r   )ra   rW   r   r   r   r   r   rx   r   r   r   r   r   r   )r   r   r2   Z
padding_pxr   r   r   r   r   r   Zlimited_padding_pxZ	x_paddingZ	y_paddingZexpandedr   r   r   _expand_rect_for_crop  s    r   TrP   r   c                 C  sR   | d u r|S t | tr| S t | ttfrt| S t | tr%|   dvS t| S )N>   Zoff0Zfalsenon)
isinstancer*   r   r   r   rM   rN   )rP   rF   r   r   r   _coerce_bool3  s   

r  rectsList[fitz.Rect]c                   sN   g }t | dd ddD ] t fdd|D rq|  qt |dd dS )	Nc                 S  s   | j | j S r   )r   r   rr   r   r   r   A  s    z_dedupe_rects.<locals>.<lambda>T)r   reversec                 3  s    | ]
}t  |d kV  qdS )r   N)r   )r   existingr   r   r   	<genexpr>B  s    z _dedupe_rects.<locals>.<genexpr>c                 S  s   | j | j| j| jfS r   r   r  r   r   r   r   E  r   r   )r   anyr   )r  r   r   r   r   _dedupe_rects?  s   r  image_entryTuple[Any, ...]Tuple[int, int]c                 C  s8   zt | d pdt | d pdfW S  ty   Y dS w )N   r      )r   r   )r   ro   )r  r   r   r   _image_entry_pixel_sizeH  s
   "r  r}   fitz.Documentxrefcache%Dict[int, Optional[Dict[str, float]]]Optional[Dict[str, float]]c                 C  s
  ||v r|| S d }d }zz|  |}|d}|s=d ||< W W ||fD ]}|d ur:z|  W q$ ty9   Y q$w q$d S tt|d}|	 }t
tdt}|d|j |d}	|	 }
tdt|
}t|
dd  | }t|
d d | }|d	 d }t|jd
 }t|t|t|d}|||< |W W ||fD ]}|d urz|  W q ty   Y qw qS  ty   d ||< Y W ||fD ]}|d urz|  W q ty   Y qw qd S w ||fD ]}|d urz|  W q ty   Y qw qw )Nr&   r   r   )   r  Lr      d   ZHSVr   )white_ratio
dark_ratiosaturation_mean)Zextract_imager`   rp   ro   r   ry   r   r   r   copyr   Z	thumbnailr   Z	histogramra   sumr   r   ZStatZmeanr   )r}   r  r  	pil_imageZthumb	extractedimage_bytesr&   r   ZgrayZhisttotalr  r  Zsatr  profiler   r   r   _pdf_image_content_profileO  s   



r%  r$  c                 C  s8   | sdS |  dddko|  dddko|  dd	d
kS )NFr  r_   皙?r  r   gRQ?r  g     o@g      8@)r`   )r$  r   r   r   _is_text_raster_profile~  s   r'  text_raster_strip_countprofile_cacher)  c                C  s  t |||\}}}	}
td|	| }td|
| }|dks |dkr"dS || }tdt|j|j }|| }|tdt|j }t|t|d |t|d }t|\}}t||}t||}|dk sf|dk rhdS |r||r|t||dk r|t||d	k r|dS |d
krdS |dkr|dkr|dk rdS t|d }t| ||}t	|}|r|dkr|dkr|dkrdS |dkr|dkr|dkrdS |dk r|dkrdS |dkr|dkr|dk rdS dS )Nr_   r   Fr   r      i     P   g       @g?g      @g)\(?r  r&  ?g
ףp=
?g?g{Gz?g@x   g      @gQ?T)
r   ra   r   r   r   r  r   r   r%  r'  )r}   r  r   r   r2   r*  r)  r   r   r   r   display_widthdisplay_heightdisplay_arear   r   r   aspectZpixel_widthZpixel_heightZmin_display_sideZmax_display_sider  r$  Zlooks_like_text_rasterr   r   r   _should_keep_pdf_image_rect  sJ   


$r4  r4   r3   List[OCRImage]c           &      C  s`  d }zzqt t| }|jr|dstd|  || }|j}g }i }g }	|D ]/}
t|
dp4d 	 }|
d}|t
vsHt|tsIq+t|||}|d urZ||df q+|jddD ]/}|sfqa|d }z||}W n	 tyz   Y qaw |D ]}t||}|d ur|	||f q}qad}|	D ]h\}}t|||\}}}}td	|| }td	|| }|| }td
t|j|j }|| }|td
t|j }t|t|d |t|d }t|d }t|||}t|r|dkr|dkr|dkr|d7 }q|	D ]\}}t|||||||dr||df q g }t|||} tdd}!t| D ]>\}"\}}#t|||}t|||\}}}}||ksJ||krLq*|!pUt ||| }$|t!d|" |||||#|$d q*|W W |d urv|"  S S  ty }% z!t#d|d  d|% dd g W  Y d }%~%W |d ur|"  S S d }%~%ww |d ur|"  w w )Nr5   rw   r'   r   r   T)Zfullr   r_   r   r   r&  r.  g?r   r(  r(   Z.SILICONFLOW_DEEPSEEK_OCR_SAVE_FULL_PAGE_IMAGESFimg-)r   r    r!   r"   r#   r)   r+   z5DeepSeek-OCR image region extraction failed for page : flush)$rx   ry   r   rz   r{   r   r   r`   rM   rN   _IMAGE_BLOCK_TYPESr  r9   r   r   Z
get_imagesZget_image_rectsro   r   r   ra   r   r   r   r   r%  r'  r4  r   rQ   r   r   r   r   rp   print)&rv   r   r2   r4   r}   r   r   r  r*  Zpdf_image_candidatesblockZ
block_typer   r   r  r  Zimage_rectsr   r)  r   r   r   r   r0  r1  r2  r   r   r   r3  r$  r1   Zmerged_rectsZsave_full_page_imagesidxr)   Z
save_imageexcr   r   r   _extract_pdf_image_rects  s   

	






r?  r=   c              
   C  s*   | j | j| j| j| j| j| j| j| jd	S )N	r   r    r!   r"   r#   r%   r'   r)   r+   r@  )r&   r   r   r   _image_to_dict+  s   rA  image_idurlc                 C  s   d|  d| dS )Nz![z]()r   )rB  rC  r   r   r   _image_markdown_url9  s   rE  r.   r1   max_markdown_charsc                C  s   |dkst | pd |krdS tdt|j|j }|D ]'}tdt|jt|j }tdt|jt|j	 }|| | dkrD dS qdS )Nr   r5   Fr   r_   g
ףp=
?T)
r|   rM   ra   r   r   r   r"   r    r#   r!   )r.   r1   r2   rF  r   r&   r   r   r   r   r   _needs_full_page_transcription=  s   rG  c                  C  sF   t dd} zdd l}t|d| } W n	 ty   Y nw t| dS )NZTASK_EXECUTOR_STORAGE_URLz)https://dev.knowledge.yunwoai.com/storager   Z
storageUrl/)rK   rL   rP   r   ro   r   rstrip)rP   Z
task_valuer   r   r   _storage_urlO  s   rJ  ATuple[Optional[Callable[..., Any]], Optional[Callable[..., Any]]]c                  C  s8   zdd l } t| dd t| dd fW S  ty   Y dS w )Nr   get_file_by_obj_keyinsert_file)NN)databaser   ro   )rN  r   r   r   _db_file_helpersZ  s   rO  page_numberoutput_pathpage_dimensionsr7   c                 C  sr  d }d }d }zzKt | }|jr;|ds;W W |d ur-z|  W n	 ty,   Y nw d }|d ur9|  dS dS |d }|dk sI|t|krmW W |d ur_z|  W n	 ty^   Y nw d }|d urk|  dS dS || }	|	j}
t|pxi 	dp~|
j
}t|pi 	dp|
j}t|pi 	dpd}|
j
t|d	 }|
jt|d	 }t |
j|j|  |
j|j|  |
j|j|  |
j|j|  }t||
}|d u rW W |d urz|  W n	 ty   Y nw d }|d ur|  dS dS t td
|d td
|d }|	j||dd}tt|dd}tjtj |dd |j!|dddd W W |d urFz|  W n
 tyE   Y nw d }|d urS|  dS dS  ty } z4t"d| dd W Y d }~W |d urz|  W n
 ty   Y nw d }|d ur|  dS dS d }~ww |d urz|  W n
 ty   Y nw d }|d ur|  w w )Nr5   Fr   r   r   r   r   i,  r   H   r   )r   Zclipr   r   r   Texist_okr   \   r   z DeepSeek-OCR image crop failed: r8  )#rx   ry   rz   r{   rp   ro   r|   r   r   r`   r   r   r   ra   r   r   r    r   r!   r"   r#   r   r   r   r   r   r   r   r   rK   makedirspathdirnamer+   r;  )rv   rP  r&   rQ  rR  r}   r   r   r   r   r   Z	ocr_widthZ
ocr_heightr   r   r   r   r   r>  r   r   r   _crop_pdf_imagec  s   

 






rZ  r5   file_remarkr;   
output_dirr\  1Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]c          #      C  s  t t| d }|r|}n"tt }t|j	dkr%|j	d d nt
 d }t|d | }tj|dd t }t \}	}
g }d}d}d}|D ]}t|dpVd}|d	p^g }g }|D ]}t|trn|}nNt|trtt|d
p|dpd| t|dpdt|dpdt|dpdt|dpd|dt|dpdt|ddd}nqc|js|d7 }qc|d7 }d| d|j d}tj||}d}|jr*z.tjtj|dd t|d}|t|j W d    n	1 sw   Y  d}W n t y) } zt!d| dd W Y d }~nd }~ww |s8t"| ||||d}|s<qcz't|d}|# }W d    n	1 sRw   Y  t | }t|}W n
 t ym   Y qcw d | d!| }|
d urz%|	d ur|	|nd }|d u r|
d"| d!| ||d#||d d|d$	 W n t y } zt!d%| dd W Y d }~nd }~ww | | }|j|d||||j$|j%|j&|j'd&|d|j(d'	} |)|  |)|  |d7 }t|d(pd)}!d*|j d+}"|"|!v r|!*|"d*| d+}!n||!vr |!+  d,t,|j| - }!|!|d(< qc||d	< qL|s0|r?t!d-| d!| d.| dd ||fS )/Nzutf-8r  ZstorageZ
pdf_imagesTrT  r   rP  r1   r   rB  r6  r    r!   r"   r#   r%   r)   r(   r+   )r   r    r!   r"   r#   r%   r)   r+   r   Zpage_Z_chunk_0_img_z.jpegFrj   z'DeepSeek-OCR image base64 save failed: r8  r2   rbz/pdf_images/rH  zstorage/pdf_images/r   r[  z.DeepSeek-OCR image file record insert failed: )r    r!   r"   r#   )	rB  rP  chunk_index
image_pathrC  obj_keyZcoordinatesr2   r)   r.   r5   (rD  

z%DeepSeek-OCR image extraction: saved z image(s), skipped ).hashlibZmd5r   encodeZ	hexdigestr   __file__resolver|   parentscwdrK   rW  rJ  rO  r   r`   r  r   r?   r   r  r+   r   rX  r   r%   rY  ry   rn   base64	b64decodero   r;  rZ  readr    r!   r"   r#   r)   r   replacerI  rE  rM   )#rv   r;   r]  r\  Zpdf_md5Z	image_dirZmodule_pathZstorage_rootZstorage_urlrL  rM  extracted_imagesr#  ZsavedZskippedr   rP  Zpage_imagesZnormalized_page_imagesr&   Z	image_objZimage_filenamera  ZsuccessZimg_filer>  r"  Z	image_md5Z
image_sizerb  exists	image_urlr!  r.   Zplaceholderr   r   r   _save_pdf_images  s   





 








rr  z3<\|ref\|>(.*?)<\|/ref\|>\s*<\|det\|>(.*?)<\|/det\|>det_textOptional[List[float]]c                 C  s   zGt | }t|tr<|r?t|d tr|d n|}t|trBt|dkrEt|d t|d t|d t|d gW S W d S W d S W d S W d S  tyQ   Y d S w )Nr   r   r   r  r  )jsonloadsr  r9   r|   r   ro   )rs  Zparsed	candidater   r   r   _parse_bbox+  s"   
.rx   Tuple[str, List[Dict[str, Any]]]c                 C  s   | pd}t t|}g }t|D ]F\}}| }|d t|k r)||d   nt|}|||  }|d p=d|d}	t	|d }
|
d urR|
|	d< |
|	 qtd|}tdd| }||fS )	Nr5   r   textr'   rz  r  r   z\n{3,}rd  )r9   _REF_DET_REfinditerr   endr|   startrM   grouprx  r   subre)rf   rz  matchesr4   r=  matchr  r~  Z
block_textr<  r   Zcleanedr   r   r   _clean_markdown_and_blocks7  s"   (r  datac                 C  s   |  d}t|tr|stdt| d d  |d pi }| dp%i }| d}t|tr2|S t|trgg }|D ]&}t|trV| dpK| d}|rU|t| q;|d ura|t| q;d|S |d u rmd	S t|S )
Nchoicesz&SiliconFlow response missing choices: r   r   messagecontentrz  
r5   )r`   r  r9   r   r   r?   r   r   )r  r  firstr  r  r   r   rz  r   r   r   _extract_message_contentM  s*   





r  c                   @  s&   e Zd ZdddZdddddZdS )_OCRResourceclient'SiliconFlowDeepSeekOCRClient'rG   Nonec                 C  s
   || _ d S r   )_clientselfr  r   r   r   __init__d  s   
z_OCRResource.__init__N)r<   r<   r$   documentr=   _r   r:   c                K  s   | j j||dS )Nr<   r  )r  process_ocr)r  r<   r  r  r   r   r   processg  s   z_OCRResource.process)r  r  rG   r  )r<   r$   r  r=   r  r   rG   r:   )r   r   r   r  r  r   r   r   r   r  c  s    
r  c                   @  s   e Zd Zddddddddddd	ddd
eeedddddddddded6d7Zdfdgd;d<Zdhd>d?Zdid@dAZ	djdCdDZ
dkdGdHZdhdIdJZdldNdOZdmdRdSZdndUdVZdodXdYZdpdZd[Zdqd`daZdrdcddZdS )srC   https://api.siliconflow.cn/v1Ndeepseek-ai/DeepSeek-OCR      ^@r   F
   T      $@      highr      `	  V     p )base_urlapi_baser<   timeout_secpage_workersmax_concurrent_requests	trust_envmax_retriesrate_limit_retry_foreverchannel_retry_foreverrate_limit_initial_wait_secrate_limit_max_wait_secrate_limit_callback
max_tokenspromptfallback_prompttranscribe_promptfull_page_transcribefull_page_transcribe_max_charsimage_detail	max_pagesr   max_image_edger   max_image_bytesapi_keyr   r  r  r$   r<   r  r   r  r   r  Optional[int]r  r*   r  r  r  r  r  r  *Optional[Callable[[Dict[str, Any]], None]]r  r  r  r  r  r  r  r  r   r  r   r  rG   r  c                C  sf  t || _|p|d| _|| _|| _tdt|| _|d ur!|n| j}t	
td|| _t|| _tdt|	| _t|
| _t|| _tdt|| _t| jt|| _|| _tdt|| _|| _|| _|| _t|| _tdt|| _|| _tdt|| _tdt|| _tdt|| _tdtdt|| _ td	t|| _!t"| | _#t	$ | _%d | _&d S )
NrH  r   r   r      rS  (   _      )'re   r  rI  r  r<   r  ra   r   r  	threadingZBoundedSemaphore
_semaphorer*   r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r   r  r   r   r  r  ocrZLock_client_lockr  )r  r  r  r  r<   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r   r  limitr   r   r   r  l  s@   







z%SiliconFlowDeepSeekOCRClient.__init__r5   rX   rY   c                 C  s4   |d u r| j S t||r| jp| j S | j ot||S r   )r  rZ   r  r[   )r  rX   rY   r   r   r   _should_retry_forever  s
   
z2SiliconFlowDeepSeekOCRClient._should_retry_foreverhttpx.Clientc                 C  st   | j }|d u s
|jr8| j# | j }|d ur!|js!|W  d    S |  }|| _ W d    |S 1 s3w   Y  |S r   )r  Z	is_closedr  _make_http_clientr  r   r   r   _http_client  s   
z)SiliconFlowDeepSeekOCRClient._http_clientc                 C  sb   | j  | j}d | _W d    n1 sw   Y  |d ur/z|  W d S  ty.   Y d S w d S r   )r  r  rp   ro   r  r   r   r   rp     s   z"SiliconFlowDeepSeekOCRClient.closer  c                 C  s   | S r   r   )r  r   r   r   	__enter__  s   z&SiliconFlowDeepSeekOCRClient.__enter__r  r   c                 G  s   |    d S r   )rp   )r  r  r   r   r   __exit__  s   z%SiliconFlowDeepSeekOCRClient.__exit__c                 C  sN   t j| jtd| jd}t jtd| jtd| jd d}t j|| j|ddS )Ng      .@)Zconnectr   r  )Zmax_keepalive_connectionsZmax_connectionsF)timeoutr  limitsZhttp2)	httpxZTimeoutr  r   ZLimitsra   r  ZClientr  )r  r  r  r   r   r   r    s   
z.SiliconFlowDeepSeekOCRClient._make_http_clientr  r=   Tuple[Path, Optional[Path]]c                 C  s~  t |dpd }|dp|d}|r/tt |  }| s+tt ||d fS |dv rR|drRtt |d   }| sNtt ||d fS |dp`|dp`|d	}|rxt	t |
d
dd }t|}||fS |d}|rt |}	|	drt|	dd    }| stt ||d fS |	dr|  |	}
|
  t|
j}||fS td)Nr'   r5   	file_pathrX  >   file
local_filedocument_urlZdocument_base64Zfile_base64rk  ,r   zfile://   )zhttp://zhttps://zGdocument must contain local file_path, document_base64, or document_url)r   r`   rN   r   
expanduserrh  rp  FileNotFoundErrorrk  rl  r   ru   
startswithr  Zraise_for_statusr  r   )r  r  Zdoc_typer  rX  Zencodedrf   Ztempr  rC  responser   r   r   _resolve_pdf_path  s>   



z.SiliconFlowDeepSeekOCRClient._resolve_pdf_pathr"  rg   c                 C  sF   t |d}|ddd| | jddd|dgd	gd
|ddS )Nasciiuserrq  zdata:image/jpeg;base64,)rC  Zdetail)r'   rq  rz  r{  )Zroler  Fr   )r<   messagesstreamr  Ztemperature)rk  Z	b64encodedecoder  )r  r"  r<   r  r  Z
image_datar   r   r   _request_payload  s    z-SiliconFlowDeepSeekOCRClient._request_payloadcurrent_max_tokensc                 C  s>   |dkrd S |dkrdS |dkrdS |dkrdS t d|d S )Ni   r  i   i   r  )ra   )r  r  r   r   r   _next_safe_max_tokens  s   z2SiliconFlowDeepSeekOCRClient._next_safe_max_tokensretry_countc                 C  s   t ddtd| S )Ng      >@g      ?r   )r   ra   )r  r  r   r   r   _retry_wait_seconds  s   z0SiliconFlowDeepSeekOCRClient._retry_wait_secondsc                 C  s   d| j  dd}| j d}d }| j}d}d}	d}
	 z| ||||}| j |  j|||d}W d    n1 s=w   Y  |jdk rz| W W S  tj	y } zqt
d	|jd d
  }|}| jr|	d7 }	| |	}td|	 d|dd|jd d  dd t| W Y d }~W q|	| jk r|	d7 }	| |	}td|	 d| j d|dd|jd d  dd t| W Y d }~W q||d }~ww |jd d
 }t
d|j d| }|jdkrd|v rd|v r| |}|d ur|dk r|}|}|d7 }td| ddd W qt|j|r|}|
d7 }
t|j}|}|d u r:t| j| jdt|
d d  }| jd urkz| |j||
||d W n tyj } ztd| dd W Y d }~nd }~ww td |
 d|dd|d d  dd t| | |j|rW q|	| jk r|	d7 }	W qW n| |j|r|}|	d7 }	| |	}td!|	 d|dd"|j d#|d d  dd t| W q|}|	| jk r|	d7 }	| |	}td$|	 d| j d|dd%|j d#|d d  
dd t| W q| tjtjtjtj fyw } z_|}|  r?|	d7 }	| |	}td&|	 d|dd| dd t| W Y d }~q|	| jk rm|	d7 }	| |	}td'|	 d| j d|dd| dd t| W Y d }~qW Y d }~nd }~ww t
d(| |))NzBearer zapplication/json)ZAuthorizationzContent-Typez/chat/completionsr   T)r\   ru  i  z(SiliconFlow returned non-JSON response: r   r   z:DeepSeek-OCR non-JSON response, retrying forever (attempt=z, wait=z.1fzs):    r8  z&DeepSeek-OCR non-JSON response, retry rH  z in zs: zSiliconFlow HTTP r7  r  Zmax_seq_len   z#DeepSeek-OCR reduced max_tokens to z after context-limit errorr  )rX   rY   rate_limit_attemptZretry_after_secwait_secz)DeepSeek-OCR rate-limit callback failed: z@DeepSeek-OCR rate limited, retrying forever (rate_limit_attempt=z=DeepSeek-OCR retryable HTTP error, retrying forever (attempt=z
s, status=z): zDeepSeek-OCR HTTP error, retry z
s (status=z6DeepSeek-OCR network error, retrying forever (attempt=z"DeepSeek-OCR network error, retry z)SiliconFlow DeepSeek-OCR request failed: )!r  r  r  r  r  r  ZpostrX   ru  ZJSONDecodeErrorr   rz  r  r  r;  timesleepr  r  rZ   rc   r\   r   r  r  r  ro   r  r  ZConnectError	ReadErrorZRemoteProtocolErrorZTimeoutException)r  r"  r<   r  r\   rC  Z
last_errorr  Zcontext_limit_adjustmentsr  r  Zpayloadr  r>  errorr  rY   Znext_max_tokensrb   Zcallback_errorr   r   r   
_post_page!  s|  












	







 




z'SiliconFlowDeepSeekOCRClient._post_pagerv   r   r   r,   c              
   C  s  t ||| j| j| j| jd\}}}t }| ||| j}t|}	d}
|		 s>| j
r>| j
| jkr>| ||| j
}t|}	d}
t|	\}}t||||}| jr| jrt|||| jdrz(| ||| j}t|}t|\}}t|	 t|	 kr|}|}	|}d}
W n ty } ztd| dd W Y d }~nd }~ww t|tr|d	nd }|d u ri }t|}tt | d
|d< ||d< |
|d< t||||||	|dS )N)r   r   r   r   Zprimaryfallback)rF  Z
transcribez6DeepSeek-OCR full-page transcription fallback failed: Tr8  r8   r  elapsed_secr&   Zprompt_variant)r-   r.   r1   r2   r4   r6   r8   )r   r   r  r   r  r  r  r  r  rM   r  r  r?  r  r  rG  r  r|   ro   r;  r  r?   r`   roundr,   )r  rv   r   r<   r"  r2   Z
image_metastartedr  r6   Zused_promptr.   r4   r1   Ztranscribe_dataZtranscribed_rawZtranscribed_markdownr  r>  r8   r   r   r   	_ocr_page  sv   
z&SiliconFlowDeepSeekOCRClient._ocr_pager:   c                  s0  t   }|pj|\}zt}|dkr td jdkr+t|j}td d| dj d dd d g| }t	t
d	jd
t  fddt|D }zLt|D ]E}|| }	z$| ||	< td|	d	  d| dt||	 r||	 jnd dd W q` ty   td|	d	  d| ddd t   w W n ty   |D ]}
|
  q w W d    n1 sw   Y  dd |D }ddt|tt   | djd}t||dW |d urz|jdd W S  ty   Y S w S |d urz|jdd W w  ty   Y w w w )Nr   zPDF has no pages: zDeepSeek-OCR start: file=z, pages=z
, workers=z, model=Tr8  r   )Zmax_workersc                   s    i | ]}  j||qS r   )Zsubmitr  )r   r   ZexecutorZ
model_namerv   r  r   r   
<dictcomp>  s    z<SiliconFlowDeepSeekOCRClient.process_ocr.<locals>.<dictcomp>zDeepSeek-OCR page rH  z done, chars=r5   z failedc                 S  s   g | ]}|d ur|qS r   r   r   r   r   r   r   r     s    z<SiliconFlowDeepSeekOCRClient.process_ocr.<locals>.<listcomp>ZsiliconflowZdeepseek_ocrr  )providerZengine
page_countr  r  )r;   r<   r>   rk   )r  r<   r  r~   r   r  r   r;  r  r   ra   r   r   r   r|   r.   ro   	traceback	print_excZcancelr  r:   rr   )r  r<   r  r  Zcleanup_pathr  r;   ZfuturesZfuturer   Zpendingnormalized_pagesr>   r   r  r   r    s   




z(SiliconFlowDeepSeekOCRClient.process_ocr)6r  r   r  r   r  r$   r<   r   r  r   r  r   r  r  r  r*   r  r   r  r*   r  r*   r  r   r  r   r  r  r  r   r  r   r  r$   r  r$   r  r*   r  r   r  r   r  r   r   r   r  r   r   r   r  r   rG   r  rS   )rX   r  rY   r   rG   r*   )rG   r  )rG   r  )rG   r  )r  r   rG   r  )r  r=   rG   r  )
r"  rg   r<   r   r  r   r  r   rG   r=   )r  r   rG   r  )r  r   rG   r   )r"  rg   r<   r   r  r   rG   r=   )rv   r   r   r   r<   r   rG   r,   )r<   r$   r  r=   rG   r:   )r   r   r   rA   r@   rB   r  r  r  rp   r  r  r  r  r  r  r  r  r  r  r   r   r   r   rC   k  sR    ?






	
%


 
9rC   )save_imagesr]  r  r  r  r  r  r  r  &Optional[SiliconFlowDeepSeekOCRClient]Optional[Tuple[Any, ...]]c                C  s  |pt dVi dt|d|ptdtdddtddd	td
ddtdtdddtdtdddtdddtdtdddtdddtdddtdd d!td"dd#td$td%d&d'td(td)td*t	d+td,t
d-td.dd/tdtd0d1d2td3d4d5td6dd7td8td9d:d;td<d=d>td?td@tdAdBdCtdDtdEdF}|jj|jdG| dHdI}g }g }	|jD ]0}
|
jpdJ}| r|| |	|
jd |ddKdL |
jD |
jr|
jjnd |
jdM|jdN qdOdPdQ |D }g }|r:t| |	|dR|  dS\}	}dTdL |	D }dOdUdQ |D }| s5d S ||	|fS | sAd S ||	fS )WNr  r  Z!SILICONFLOW_DEEPSEEK_OCR_BASE_URLZSILICONFLOW_BASE_URLr  r<   ZSILICONFLOW_DEEPSEEK_OCR_MODELr  r  Z$SILICONFLOW_DEEPSEEK_OCR_TIMEOUT_SECr  r  r   Z%SILICONFLOW_DEEPSEEK_OCR_PAGE_WORKERSr   r  Z+SILICONFLOW_DEEPSEEK_OCR_CLIENT_CONCURRENCYr  Z"SILICONFLOW_DEEPSEEK_OCR_TRUST_ENVFr  r   Z SILICONFLOW_DEEPSEEK_OCR_RETRIESr  r  Z*SILICONFLOW_DEEPSEEK_OCR_429_RETRY_FOREVERTr  Z&SILICONFLOW_DEEPSEEK_OCR_RETRY_FOREVERr  Z-SILICONFLOW_DEEPSEEK_OCR_429_INITIAL_WAIT_SECr  r  Z)SILICONFLOW_DEEPSEEK_OCR_429_MAX_WAIT_SECr  r  Z#SILICONFLOW_DEEPSEEK_OCR_MAX_TOKENSr  r  ZSILICONFLOW_DEEPSEEK_OCR_PROMPTr  Z(SILICONFLOW_DEEPSEEK_OCR_FALLBACK_PROMPTr  Z*SILICONFLOW_DEEPSEEK_OCR_TRANSCRIBE_PROMPTr  Z-SILICONFLOW_DEEPSEEK_OCR_FULL_PAGE_TRANSCRIBEr  Z7SILICONFLOW_DEEPSEEK_OCR_FULL_PAGE_TRANSCRIBE_MAX_CHARSr  r  Z%SILICONFLOW_DEEPSEEK_OCR_IMAGE_DETAILr  r  Z"SILICONFLOW_DEEPSEEK_OCR_MAX_PAGESr   rS  ZSILICONFLOW_DEEPSEEK_OCR_DPIr  r  Z'SILICONFLOW_DEEPSEEK_OCR_MAX_IMAGE_EDGEr  r   r  r  Z%SILICONFLOW_DEEPSEEK_OCR_JPEG_QUALITYr  r  r  Z(SILICONFLOW_DEEPSEEK_OCR_MAX_IMAGE_BYTESr  r  )r'   r  r  r5   c                 S  r   r   )rA  )r   r&   r   r   r   r   o  r   z4run_siliconflow_deepseek_pdf_ocr.<locals>.<listcomp>Zsiliconflow_deepseek_ocr)rP  r.   r`  r1   r2   r4   r  r<   rd  c                 s      | ]	}|  r|V  qd S r   rM   r   r   r   r   r   r	  w      z3run_siliconflow_deepseek_pdf_ocr.<locals>.<genexpr>zattachment for r[  c                 S  s   g | ]}t |d pdqS )r.   r5   )r   r`   r  r   r   r   r     s    c                 s  r  r   r  r  r   r   r   r	    r  r   )rC   re   rK   rL   rW   ra   rU   rQ   rA   r@   rB   r   r  r  r<   r;   r.   rM   r   r-   r1   r2   __dict__r4   r   rr  )rv   r  r]  r  r  r  Z
ocr_clientr  Zmarkdown_partsr  r   r.   Zmarkdown_resultro  r   r   r   rD   /  s   	

	






 
!"
#$%'






rD   )F)rE   r   rF   r*   rG   r*   )rE   r   rF   r   rG   r   )rE   r   rF   r   rG   r   )rX   r   rY   r   rG   r*   )r\   r]   rG   r^   r   )rd   r$   rG   r   )rf   rg   rG   r   )rv   r   rG   r   )r&   r   r   r   rG   r   )r&   r   r   r   rG   rg   )rv   r   r   r   r   r   r   r   r   r   r   r   rG   r   )r   r   r   r   rG   r   )r   r   r   r   r2   r   rG   r   )r   r   r   r   r2   r   rG   r   )r   r   r   r   rG   r   )r   r   rG   r   )r   r   r   r   rG   r   )r   r   r   r   rG   r   )r   r   r   r   r2   r   rG   r   )
r   r   r   r   r   r   r2   r   rG   r   )r   r   r   r   r2   r   rG   r*   )r   r   r   r   r   r   r   r   r   r   r2   r   r   r   rG   r*   )r   r   r   r   r2   r   rG   r   )r   r   r   r   r2   r   rG   r   )T)rP   r   rF   r*   rG   r*   )r  r  rG   r  )r  r  rG   r  )r}   r  r  r   r  r  rG   r  )r$  r  rG   r*   )r}   r  r  r  r   r   r   r   r2   r   r*  r  r)  r   rG   r*   )
rv   r   r   r   r2   r   r4   r3   rG   r5  )r&   r   rG   r=   )rB  r   rC  r   rG   r   )
r.   r   r1   r5  r2   r   rF  r   rG   r*   )rG   r   )rG   rK  )rv   r   rP  r   r&   r   rQ  r   rR  r7   rG   r*   )
rv   r   r;   r3   r]  r$   r\  r   rG   r^  )rs  r   rG   rt  )rf   r   rG   ry  )r  r=   rG   r   )rv   r   r  r*   r]  r$   r  r$   r  r$   r  r  rG   r   )^__doc__Z
__future__r   rk  re  r   ru  rK   r  rl   r  r  r  Zconcurrent.futuresr   r   Zdataclassesr   r   Zpathlibr   typingr   r	   r
   r   r   r   rx   r  ZPILr   r   RuntimeErrorr   r   r   r,   r:   __all__rQ   rU   rW   rZ   r[   rc   re   ru   rA   r@   rB   r:  r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r%  r'  r4  r?  rA  rE  rG  rJ  rO  rZ  rr  compileSr|  rx  r  r  r  rC   rD   r   r   r   r   <module>   s    














0


$





	


	


2
1

	

/
<
g




	: 


   J