o
    d-j                     @  s  d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlZddlmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZmZmZ ddlZddlZddlmZm Z  G d	d
 d
e!Z"eG dd dZ#eG dd dZ$eG dd dZ%eG dd dZ&g dZ'ddddZ(dddZ)dd d!Z*dd$d%Z+dd&d'Z,dd+d,Z-ddd/d0Z.dd4d5Z/d6Z0d7Z1d8Z2h d9Z3dd;d<Z4dd@dAZ5ddCdDZ6ddIdJZ7ddOdPZ8ddSdTZ9ddVdWZ:ddZd[Z;dd^d_Z<ddcddZ=ddkdlZ>ddndoZ?ddpddsdtZ@ddxdyZAdd{d|ZBdddZCdddZDdddZEdddZFdddZGdddddZHeIdejJZKdddZLdddZMdddZNG dd dZOG dd dZPdddddddddZQdS )am  
High-throughput SiliconFlow DeepSeek-OCR SDK.

The public shape intentionally mirrors the Mistral OCR SDK enough for the task
executor:

    client = SiliconFlowDeepSeekOCRClient(api_key="sk-...", base_url="https://api.siliconflow.cn/v1")
    response = client.ocr.process(
        model="deepseek-ai/DeepSeek-OCR",
        document={"type": "local_file", "file_path": "/path/to.pdf"},
    )
    for page in response.pages:
        print(page.index, page.markdown)

Unlike Mistral OCR, this client reads local PDFs directly, renders pages to
compressed JPEG, calls SiliconFlow per page, and merges page results locally.
    )annotationsN)ThreadPoolExecutoras_completed)	dataclassfield)Path)AnyCallableDictListOptionalTuple)Image	ImageStatc                   @  s   e Zd ZdS )SiliconFlowDeepSeekOCRErrorN)__name__
__module____qualname__ r   r   /tmp/yunwo_deepseek_sdk_test.pyr   *   s    r   c                   @  s&   e Zd ZU ded< ded< ded< dS )OCRPageDimensionsintwidthheightdpiN)r   r   r   __annotations__r   r   r   r   r   .   s   
 r   c                   @  sZ   e Zd ZU ded< ded< ded< ded< ded< dZd	ed
< dZded< dZded< dS )OCRImagestridfloat
top_left_x
top_left_ybottom_right_xbottom_right_yNOptional[str]image_base64imagetypepdfsource)r   r   r   r   r%   r'   r)   r   r   r   r   r   5   s   
 r   c                   @  sf   e Zd ZU ded< ded< eedZded< dZd	ed
< eedZded< dZ	ded< dZ
ded< dS )OCRPager   indexr   markdowndefault_factoryz	List[Any]imagesNzOptional[OCRPageDimensions]
dimensionsList[Dict[str, Any]]blocks raw_markdownOptional[Dict[str, Any]]usage)r   r   r   r   r   listr/   r0   r2   r4   r6   r   r   r   r   r*   A   s   
 r*   c                   @  s0   e Zd ZU ded< ded< eedZded< dS )	OCRResponsezList[OCRPage]pagesr   modelr-   Dict[str, Any]
usage_infoN)r   r   r   r   r   dictr<   r   r   r   r   r8   L   s   
 r8   )
$DEFAULT_DEEPSEEK_OCR_FALLBACK_PROMPTDEFAULT_DEEPSEEK_OCR_PROMPT&DEFAULT_DEEPSEEK_OCR_TRANSCRIBE_PROMPTr*   r   r   r8   SiliconFlowDeepSeekOCRClientr    run_siliconflow_deepseek_pdf_ocrFnamer   defaultboolreturnc                 C  s&   t | }|d u r|S |  dv S )N>   1ytrueZonZyes)osgetenvstriplowerrC   rD   valuer   r   r   	_env_boola   s   
rP   r   c                 C  D   t | }|d u s| dkr|S zt|W S  ty!   | Y S w Nr3   )rJ   rK   rL   r   
ValueErrorrN   r   r   r   _env_inth      

rT   r   c                 C  rQ   rR   )rJ   rK   rL   r   rS   rN   r   r   r   
_env_floatr   rU   rV   status_codebodyc                 C  s4   |pd  }| dkpd|v pd|v pd|v pd|v S )Nr3   i  z
rate limitztoo manyz	tpm limitz	rpm limit)rM   )rW   rX   Z
body_lowerr   r   r   _is_rate_limit_error|   s   rY   c                 C  s   t | |rdS | dv S )NT>   i  i  i  i
  i  i  i  i  i  i  i  )rY   )rW   rX   r   r   r   _is_retryable_http_error   s   
rZ   headershttpx.HeadersOptional[float]c                 C  s<   |  d}|s	d S z
tdt| W S  ty   Y d S w )Nzretry-after        )getmaxr   rL   rS   )r[   retry_afterr   r   r   _retry_after_seconds   s   
rb   explicitr$   c                 C  sH   | r
|   r
|   S dD ]}t|}|r|  r|    S qtd)N)Z SILICONFLOW_DEEPSEEK_OCR_API_KEYZSILICONFLOW_OCR_API_KEYZSILICONFLOW_API_KEYzDSILICONFLOW_API_KEY or SILICONFLOW_DEEPSEEK_OCR_API_KEY is required.)rL   rJ   rK   r   )rc   rC   rO   r   r   r   _api_key   s   
rd   rawbytesr   c                 C  s   t jddd\}}z%t|d}||  W d    n1 s w   Y  W t|S W t|S  tyZ   zt| W n	 tyE   Y nw z
t|j	dd W   tyY   Y  w w )NZdeepseek_ocr_z.pdf)prefixsuffixwbTZ
missing_ok)
tempfilemkstemprJ   fdopenwrite	ExceptioncloseOSErrorr   unlink)re   fdrC   fpr   r   r   _write_temp_pdf   s,   ru   z=Convert this document page to markdown. Return markdown only.zR<image>
<|grounding|>Convert this document page to markdown. Return markdown only.zb<image>
Transcribe all visible text in this image exactly. Preserve line breaks. Return text only.>	   ZimgZphotoZdiagramZgraphr&   ZlogoZpictureZchartZfigurepdf_pathc                 C  sb   d }z$t t| }|jr|dstd|  t|W |d ur&|  S S |d ur0|  w w )Nr3   PDF requires a password: )fitzopenr   
needs_passauthenticater   lenrp   )rv   docr   r   r   _page_count   s   


r~   r&   Image.Imagemax_edgec                 C  s   |dkr| S t | j| j}||kr| S |t| }t dt| j| t dt| j| f}ttdt}| ||j}| 	  |S )Nr      
Resampling)
r`   r   r   r   r   getattrr   ZresizeLANCZOSrp   )r&   r   Zlargest_edgeZscaleZnew_size
resamplingZresizedr   r   r   _resize_to_edge   s   (r   qualityc                 C  s"   t  }| j|d|dd | S )NJPEGTformatr   optimize)ioBytesIOsavegetvalue)r&   r   bufferr   r   r   
_save_jpeg   s   r   
page_indexr   	max_bytes/Tuple[bytes, OCRPageDimensions, Dict[str, Any]]c                C  s  d }d }d }zt t| }|jr|dstd|  || }	|	jt |d |d dd}tt	
|dd}t||}t||}
|}t|
|kr|dks[t|jd	kr|dkrd|d
8 }nt|td	tt|jd }t||}
t|
|kr|dks[t|jd	ks[t|j|j|d}|t|
|d}|
||fW |d urz|  W n	 ty   Y nw d }|d ur|  S S |d urz|  W n	 ty   Y nw d }|d ur|  w w )Nr3   rw         R@F)matrixalphapngRGB7   i     333333?)r   r   r   )jpeg_qualityZ	byte_sizer   )rx   ry   r   rz   r{   r   
get_pixmapMatrixr   r   r   tobytesconvertr   r   r|   r`   sizer   r   r   r   rp   ro   )rv   r   r   r   r   r   r}   pixr&   pagejpegZcurrent_qualityr0   metar   r   r   _render_page_jpeg   sZ   	

"

"


r   rect	fitz.Rect	page_rectOptional[fitz.Rect]c                 C  sT   t t|j| jt|j| jt|j| jt|j| j}|jdks&|j	dkr(d S |S )Nr   )
rx   Rectr`   x0y0minx1y1r   r   )r   r   clippedr   r   r   _clip_rect_to_page!  s   r   r0   List[float]c              	   C  s   |j t|j d }|jt|jd }tdt|j | j|j | tdt|j| j|j | tdt|j | j|j | tdt|j| j|j | gS )Nư>r^   )r   r`   r   r   r   r   r   r   )r   r   r0   width_scaleheight_scaler   r   r   _scale_rect_to_rendered-  s   r   bboxc           
      C  s&  t | dk rd S dd | d d D \}}}}||k r ||}}||k r)||}}tt|t|t|t|dkr`|jdksD|jdkr`||j d }||j d }||j d }||j d }|jt|jd }|jt|jd }t|j||  |j||  |j||  |j||  }	t	|	|S )N   c                 S     g | ]}t |qS r   )r   ).0vr   r   r   
<listcomp>?      z%_bbox_to_pdf_rect.<locals>.<listcomp>  g     @@r   )
r|   r`   absr   r   rx   r   r   r   r   )
r   r   r0   r   r   r   r   r   r   r   r   r   r   _bbox_to_pdf_rect8  s,   

"
r   abc                 C  s   t | j|j}t | j|j}t| j|j}t| j|j}||ks$||kr&dS || ||  }t | j| j |j|j  | d}|| S )Nr^   r   )r`   r   r   r   r   r   r   r   )r   r   r   r   r   r   Zinterunionr   r   r   	_rect_iou\  s   "r   rectsList[fitz.Rect]c                   sN   g }t | dd ddD ] t fdd|D rq|  qt |dd dS )	Nc                 S  s   | j | j S N)r   r   rr   r   r   <lambda>j  s    z_dedupe_rects.<locals>.<lambda>T)keyreversec                 3  s    | ]
}t  |d kV  qdS )r   N)r   )r   Zexistingr   r   r   	<genexpr>k  s    z _dedupe_rects.<locals>.<genexpr>c                 S  s   | j | j| j| jfS r   )r   r   r   r   r   r   r   r   r   n  r   )r   )sortedanyappend)r   resultr   r   r   _dedupe_rectsh  s   r   image_entryTuple[Any, ...]Tuple[int, int]c                 C  s8   zt | d pdt | d pdfW S  ty   Y dS w )N   r      )r   r   )r   ro   )r   r   r   r   _image_entry_pixel_sizeq  s
   "r   r}   fitz.Documentxrefcache%Dict[int, Optional[Dict[str, float]]]Optional[Dict[str, float]]c                 C  s
  ||v r|| S d }d }zz|  |}|d}|s=d ||< W W ||fD ]}|d ur:z|  W q$ ty9   Y q$w q$d S tt|d}|	 }t
tdt}|d|j |d}	|	 }
tdt|
}t|
dd  | }t|
d d | }|d	 d }t|jd
 }t|t|t|d}|||< |W W ||fD ]}|d urz|  W q ty   Y qw qS  ty   d ||< Y W ||fD ]}|d urz|  W q ty   Y qw qd S w ||fD ]}|d urz|  W q ty   Y qw qw )Nr&   r   r   )   r   Lr      d   ZHSVr   )white_ratio
dark_ratiosaturation_mean)Zextract_imager_   rp   ro   r   ry   r   r   r   copyr   Z	thumbnailr   Z	histogramr`   sumsplitr   ZStatmeanr   )r}   r   r   	pil_imageZthumb	extractedimage_bytesr&   r   ZgrayZhisttotalr   r   Zsatr   profiler   r   r   _pdf_image_content_profilex  s   



r   r   c                 C  s8   | sdS |  dddko|  dddko|  dd	d
kS )NFr   r^   皙?r         ?gRQ?r   g     o@g      8@)r_   )r   r   r   r   _is_text_raster_profile  s   r   text_raster_strip_countprofile_cacher   c                C  s  t |||\}}}	}
td|	| }td|
| }|dks |dkr"dS || }tdt|j|j }|| }|tdt|j }t|t|d |t|d }t|\}}t||}t||}|dk sf|dk rhdS |r||r|t||dk r|t||d	k r|dS |d
krdS |dkr|dkr|dk rdS t|d }t| ||}t	|}|r|dkr|dkr|dkrdS |dkr|dkr|dkrdS |dk r|dkrdS |dkr|dkr|dk rdS dS )Nr^   r   Fr   r      i     P   g       @g?g      @g)\(?r   r   ?g
ףp=
?g?g{Gz?g@x   g      @gQ?T)
r   r`   r   r   r   r   r   r   r   r   )r}   r   r   r   r0   r   r   r   r   r   r   display_widthdisplay_heightdisplay_area	page_area
area_ratiowidth_ratioaspectZpixel_widthZpixel_heightZmin_display_sideZmax_display_sider   r   Zlooks_like_text_rasterr   r   r   _should_keep_pdf_image_rect  sJ   


$r  r2   r1   List[OCRImage]c           &      C  sb  d }zzrt t| }|jr|dstd|  || }|j}g }i }g }	|D ]/}
t|
dp4d 	 }|
d}|t
vsHt|tsIq+t|||}|d urZ||df q+|jddD ]/}|sfqa|d }z||}W n	 tyz   Y qaw |D ]}t||}|d ur|	||f q}qad}|	D ]h\}}t|||\}}}}td	|| }td	|| }|| }td
t|j|j }|| }|td
t|j }t|t|d |t|d }t|d }t|||}t|r|dkr|dkr|dkr|d7 }q|	D ]\}}t|||||||dr||df q g }tdd |D } t| D ]A\}!}t|||\}}}}||ks@||krBq(d}"|D ]\}#}$t||#dkrV|$}" nqF|td|! |||||"d q(|W W |d urw|   S S  ty }% z!t!d|d  d|% dd g W  Y d }%~%W |d ur|   S S d }%~%ww |d ur|   w w )Nr3   rw   r'   r   Zdeepseek_blockT)Zfullr   r^   r   r   r   r   g?r   r   r(   c                 S  s   g | ]\}}|qS r   r   )r   r   Z_sourcer   r   r   r   8  r   z,_extract_pdf_image_rects.<locals>.<listcomp>r   img-)r   r    r!   r"   r#   r)   z5DeepSeek-OCR image region extraction failed for page : flush)"rx   ry   r   rz   r{   r   r   r_   rL   rM   _IMAGE_BLOCK_TYPES
isinstancer7   r   r   Z
get_imagesZget_image_rectsro   r   r   r`   r   r   r   r   r   r   r  r   	enumerater   r   rp   print)&rv   r   r0   r2   r}   r   r   r   r   Zpdf_image_candidatesblockZ
block_typer   r   r   r   Zimage_rectsr   r   r   r   r   r   r   r   r   r   r  r  r  r   r/   Zdeduped_rectsidxr)   Zoriginal_rectZoriginal_sourceexcr   r   r   _extract_pdf_image_rects  s   

	






r  r;   c              	   C  s&   | j | j| j| j| j| j| j| jdS )Nr   r    r!   r"   r#   r%   r'   r)   r  )r&   r   r   r   _image_to_dictU  s   r  image_idurlc                 C  s   d|  d| dS )Nz![z]()r   )r  r  r   r   r   _image_markdown_urlb  s   r  r,   r/   max_markdown_charsc                C  s   |dkst | pd |krdS tdt|j|j }|D ]'}tdt|jt|j }tdt|jt|j	 }|| | dkrD dS qdS )Nr   r3   Fr   r^   g
ףp=
?T)
r|   rL   r`   r   r   r   r"   r    r#   r!   )r,   r/   r0   r  r   r&   r   r   r   r   r   _needs_full_page_transcriptionf  s   r  c                  C  sF   t dd} zdd l}t|d| } W n	 ty   Y nw t| dS )NZTASK_EXECUTOR_STORAGE_URLz)https://dev.knowledge.yunwoai.com/storager   Z
storageUrl/)rJ   rK   rO   r   ro   r   rstrip)rO   Z
task_valuer   r   r   _storage_urlx  s   r  ATuple[Optional[Callable[..., Any]], Optional[Callable[..., Any]]]c                  C  s8   zdd l } t| dd t| dd fW S  ty   Y dS w )Nr   get_file_by_obj_keyinsert_file)NN)databaser   ro   )r   r   r   r   _db_file_helpers  s   r!  page_numberoutput_pathpage_dimensionsr5   c                 C  sr  d }d }d }zzKt | }|jr;|ds;W W |d ur-z|  W n	 ty,   Y nw d }|d ur9|  dS dS |d }|dk sI|t|krmW W |d ur_z|  W n	 ty^   Y nw d }|d urk|  dS dS || }	|	j}
t|pxi 	dp~|
j
}t|pi 	dp|
j}t|pi 	dpd}|
j
t|d	 }|
jt|d	 }t |
j|j|  |
j|j|  |
j|j|  |
j|j|  }t||
}|d u rW W |d urz|  W n	 ty   Y nw d }|d ur|  dS dS t td
|d td
|d }|	j||dd}tt|dd}tjtj |dd |j!|dddd W W |d urFz|  W n
 tyE   Y nw d }|d urS|  dS dS  ty } z4t"d| dd W Y d }~W |d urz|  W n
 ty   Y nw d }|d ur|  dS dS d }~ww |d urz|  W n
 ty   Y nw d }|d ur|  w w )Nr3   Fr   r   r   r   r   i,  r   H   r   )r   Zclipr   r   r   Texist_okr   \   r   z DeepSeek-OCR image crop failed: r  )#rx   ry   rz   r{   rp   ro   r|   r   r   r_   r   r   r   r`   r   r   r    r   r!   r"   r#   r   r   r   r   r   r   r   r   rJ   makedirspathdirnamer   r  )rv   r"  r&   r#  r$  r}   r   r   r   r   r   Z	ocr_widthZ
ocr_heightr   r   r   r   r   r  r   r   r   _crop_pdf_image  s   

 






r,  r3   file_remarkr9   
output_dirr.  1Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]c          "      C  sX  t t| d }|r|}n"tt }t|j	dkr%|j	d d nt
 d }t|d | }tj|dd t }t \}	}
g }d}d}|D ]}t|dpTd}|d	p\g }g }|D ]}t|trl|}nGt|trtt|d
p|dpd| t|dpdt|dpdt|dpdt|dpd|dt|dpdd}nqa|d7 }d| d|j d}tj||}d}|jrz-tjtj|dd t|d}|t|j W d    n1 sw   Y  d}W n ty } ztd| dd W Y d }~nd }~ww |s&t | ||||d}|s*qaz't|d}|! }W d    n	1 s@w   Y  t | }t|}W n
 ty[   Y qaw d| d | }|
d urz%|	d urs|	|nd }|d u r|
d!| d | ||d"||d d|d#	 W n ty } ztd$| dd W Y d }~nd }~ww | | }|j|d||||j"|j#|j$|j%d%|d|j&d&	}|'| |'| |d7 }t|d'pd(} d)|j d*}!|!| v r| (|!d)| d*} n|| vr| )  d+t*|j| + } | |d'< qa||d	< qJ|r(td,| d | d-dd ||fS ).Nzutf-8r   ZstorageZ
pdf_imagesTr&  r   r"  r/   r   r  r  r    r!   r"   r#   r%   r)   r(   )r   r    r!   r"   r#   r%   r)   r   Zpage_Z_chunk_0_img_z.jpegFri   z'DeepSeek-OCR image base64 save failed: r  r0   rbz/pdf_images/r  zstorage/pdf_images/r   r-  z.DeepSeek-OCR image file record insert failed: )r    r!   r"   r#   )	r  r"  chunk_index
image_pathr  obj_keyZcoordinatesr0   r)   r,   r3   (r  

z%DeepSeek-OCR image extraction: saved z	 image(s)),hashlibZmd5r   encode	hexdigestr   __file__resolver|   parentscwdrJ   r)  r  r!  r   r_   r  r   r=   r   r   r*  joinr%   r+  ry   rn   base64	b64decodero   r  r,  readr    r!   r"   r#   r)   r   replacer  r  rL   )"rv   r9   r/  r.  Zpdf_md5Z	image_dirZmodule_pathZstorage_rootZstorage_urlr  r  extracted_imagesr   Zsavedr   r"  Zpage_imagesZnormalized_page_imagesr&   Z	image_objZimage_filenamer3  ZsuccessZimg_filer  r   Z	image_md5Z
image_sizer4  exists	image_urlr   r,   Zplaceholderr   r   r   _save_pdf_images  s   





 









rF  z3<\|ref\|>(.*?)<\|/ref\|>\s*<\|det\|>(.*?)<\|/det\|>det_textOptional[List[float]]c                 C  s   zGt | }t|tr<|r?t|d tr|d n|}t|trBt|dkrEt|d t|d t|d t|d gW S W d S W d S W d S W d S  tyQ   Y d S w )Nr   r   r   r   r   )jsonloadsr  r7   r|   r   ro   )rG  Zparsed	candidater   r   r   _parse_bboxK  s"   
.rL   Tuple[str, List[Dict[str, Any]]]c                 C  s   | pd}t t|}g }t|D ]F\}}| }|d t|k r)||d   nt|}|||  }|d p=d|d}	t	|d }
|
d urR|
|	d< |
|	 qtd|}tdd| }||fS )	Nr3   r   textr'   rN  r   r   z\n{3,}r6  )r7   _REF_DET_REfinditerr  endr|   startrL   grouprL  r   subre)re   rN  Zmatchesr2   r  matchrS  rR  Z
block_textr  r   Zcleanedr   r   r   _clean_markdown_and_blocksW  s"   (rX  datac                 C  s   |  d}t|tr|stdt| d d  |d pi }| dp%i }| d}t|tr2|S t|trgg }|D ]&}t|trV| dpK| d}|rU|t| q;|d ura|t| q;d|S |d u rmd	S t|S )
Nchoicesz&SiliconFlow response missing choices: r   r   messagecontentrN  
r3   )r_   r  r7   r   r   r=   r   r>  )rY  rZ  firstr[  r\  partsitemrN  r   r   r   _extract_message_contentm  s*   





ra  c                   @  s&   e Zd ZdddZdddddZdS )_OCRResourceclient'SiliconFlowDeepSeekOCRClient'rF   Nonec                 C  s
   || _ d S r   )_clientselfrc  r   r   r   __init__  s   
z_OCRResource.__init__N)r:   r:   r$   documentr;   _r   r8   c                K  s   | j j||dS )Nr:   rj  )rf  process_ocr)rh  r:   rj  rk  r   r   r   process  s   z_OCRResource.process)rc  rd  rF   re  )r:   r$   rj  r;   rk  r   rF   r8   )r   r   r   ri  rn  r   r   r   r   rb    s    
rb  c                   @  s   e Zd Zddddddddddd	ddd
eeedddddddddded6d7Zdfdgd;d<Zdhd>d?Zdid@dAZ	djdCdDZ
dkdGdHZdhdIdJZdldNdOZdmdRdSZdndUdVZdodXdYZdpdZd[Zdqd`daZdrdcddZdS )srA   https://api.siliconflow.cn/v1Ndeepseek-ai/DeepSeek-OCR      ^@r   F
   T      $@      highr      `	  V     p )base_urlapi_baser:   timeout_secpage_workersmax_concurrent_requests	trust_envmax_retriesrate_limit_retry_foreverchannel_retry_foreverrate_limit_initial_wait_secrate_limit_max_wait_secrate_limit_callback
max_tokenspromptfallback_prompttranscribe_promptfull_page_transcribefull_page_transcribe_max_charsimage_detail	max_pagesr   max_image_edger   max_image_bytesapi_keyr   r{  r|  r$   r:   r}  r   r~  r   r  Optional[int]r  rE   r  r  r  r  r  r  *Optional[Callable[[Dict[str, Any]], None]]r  r  r  r  r  r  r  r  r   r  r   r  rF   re  c                C  sf  t || _|p|d| _|| _|| _tdt|| _|d ur!|n| j}t	
td|| _t|| _tdt|	| _t|
| _t|| _tdt|| _t| jt|| _|| _tdt|| _|| _|| _|| _t|| _tdt|| _|| _tdt|| _tdt|| _tdt|| _tdtdt|| _ td	t|| _!t"| | _#t	$ | _%d | _&d S )
Nr  r   r   r      r%  (   _      )'rd   r  r  r{  r:   r}  r`   r   r~  	threadingZBoundedSemaphore
_semaphorerE   r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r   r  r   r   r  rb  ocrZLock_client_lockrf  )rh  r  r{  r|  r:   r}  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r   r  limitr   r   r   ri    s@   







z%SiliconFlowDeepSeekOCRClient.__init__r3   rW   rX   c                 C  s4   |d u r| j S t||r| jp| j S | j ot||S r   )r  rY   r  rZ   )rh  rW   rX   r   r   r   _should_retry_forever  s
   
z2SiliconFlowDeepSeekOCRClient._should_retry_foreverhttpx.Clientc                 C  st   | j }|d u s
|jr8| j# | j }|d ur!|js!|W  d    S |  }|| _ W d    |S 1 s3w   Y  |S r   )rf  Z	is_closedr  _make_http_clientrg  r   r   r   _http_client  s   
z)SiliconFlowDeepSeekOCRClient._http_clientc                 C  sb   | j  | j}d | _W d    n1 sw   Y  |d ur/z|  W d S  ty.   Y d S w d S r   )r  rf  rp   ro   rg  r   r   r   rp     s   z"SiliconFlowDeepSeekOCRClient.closerd  c                 C  s   | S r   r   )rh  r   r   r   	__enter__  s   z&SiliconFlowDeepSeekOCRClient.__enter__rk  r   c                 G  s   |    d S r   )rp   )rh  rk  r   r   r   __exit__  s   z%SiliconFlowDeepSeekOCRClient.__exit__c                 C  sN   t j| jtd| jd}t jtd| jtd| jd d}t j|| j|ddS )Ng      .@)Zconnectr   r   )Zmax_keepalive_connectionsZmax_connectionsF)timeoutr  limitsZhttp2)	httpxZTimeoutr}  r   ZLimitsr`   r~  ZClientr  )rh  r  r  r   r   r   r    s   
z.SiliconFlowDeepSeekOCRClient._make_http_clientrj  r;   Tuple[Path, Optional[Path]]c                 C  s~  t |dpd }|dp|d}|r/tt |  }| s+tt ||d fS |dv rR|drRtt |d   }| sNtt ||d fS |dp`|dp`|d	}|rxt	t |
d
dd }t|}||fS |d}|rt |}	|	drt|	dd    }| stt ||d fS |	dr|  |	}
|
  t|
j}||fS td)Nr'   r3   	file_pathr*  >   
local_filefiledocument_urlZdocument_base64Zfile_base64r?  ,r   zfile://   )zhttp://zhttps://zGdocument must contain local file_path, document_base64, or document_url)r   r_   rM   r   
expanduserr;  rD  FileNotFoundErrorr?  r@  r   ru   
startswithr  Zraise_for_statusr\  r   )rh  rj  Zdoc_typer  r*  Zencodedre   Ztempr  r  responser   r   r   _resolve_pdf_path  s>   



z.SiliconFlowDeepSeekOCRClient._resolve_pdf_pathr   rf   c                 C  sF   t |d}|ddd| | jddd|dgd	gd
|ddS )NasciiuserrE  zdata:image/jpeg;base64,)r  Zdetail)r'   rE  rN  rO  )Zroler\  Fr   )r:   Zmessagesstreamr  Ztemperature)r?  Z	b64encodedecoder  )rh  r   r:   r  r  Z
image_datar   r   r   _request_payload  s    z-SiliconFlowDeepSeekOCRClient._request_payloadcurrent_max_tokensc                 C  s>   |dkrd S |dkrdS |dkrdS |dkrdS t d|d S )Ni   rt  i   i   r   )r`   )rh  r  r   r   r   _next_safe_max_tokens3  s   z2SiliconFlowDeepSeekOCRClient._next_safe_max_tokensretry_countc                 C  s   t ddtd| S )Ng      >@g      ?r   )r   r`   )rh  r  r   r   r   _retry_wait_seconds>  s   z0SiliconFlowDeepSeekOCRClient._retry_wait_secondsc                 C  s   d| j  dd}| j d}d }| j}d}d}	d}
	 z| ||||}| j |  j|||d}W d    n1 s=w   Y  |jdk rz| W W S  tj	y } zqt
d	|jd d
  }|}| jr|	d7 }	| |	}td|	 d|dd|jd d  dd t| W Y d }~W q|	| jk r|	d7 }	| |	}td|	 d| j d|dd|jd d  dd t| W Y d }~W q||d }~ww |jd d
 }t
d|j d| }|jdkrd|v rd|v r| |}|d ur|dk r|}|}|d7 }td| ddd W qt|j|r|}|
d7 }
t|j}|}|d u r:t| j| jdt|
d d  }| jd urkz| |j||
||d W n tyj } ztd| dd W Y d }~nd }~ww td |
 d|dd|d d  dd t| | |j|rW q|	| jk r|	d7 }	W qW n| |j|r|}|	d7 }	| |	}td!|	 d|dd"|j d#|d d  dd t| W q|}|	| jk r|	d7 }	| |	}td$|	 d| j d|dd%|j d#|d d  
dd t| W q| tjtjtjtj fyw } z_|}|  r?|	d7 }	| |	}td&|	 d|dd| dd t| W Y d }~q|	| jk rm|	d7 }	| |	}td'|	 d| j d|dd| dd t| W Y d }~qW Y d }~nd }~ww t
d(| |))NzBearer zapplication/json)ZAuthorizationzContent-Typez/chat/completionsr   T)r[   rI  i  z(SiliconFlow returned non-JSON response: r   r   z:DeepSeek-OCR non-JSON response, retrying forever (attempt=z, wait=z.1fzs):    r  z&DeepSeek-OCR non-JSON response, retry r  z in zs: zSiliconFlow HTTP r  r  Zmax_seq_len   z#DeepSeek-OCR reduced max_tokens to z after context-limit errorr   )rW   rX   rate_limit_attemptZretry_after_secwait_secz)DeepSeek-OCR rate-limit callback failed: z@DeepSeek-OCR rate limited, retrying forever (rate_limit_attempt=z=DeepSeek-OCR retryable HTTP error, retrying forever (attempt=z
s, status=z): zDeepSeek-OCR HTTP error, retry z
s (status=z6DeepSeek-OCR network error, retrying forever (attempt=z"DeepSeek-OCR network error, retry z)SiliconFlow DeepSeek-OCR request failed: )!r  r{  r  r  r  r  ZpostrW   rI  JSONDecodeErrorr   rN  r  r  r  timesleepr  r  rY   rb   r[   r   r  r  r  ro   r  r  ZConnectError	ReadErrorZRemoteProtocolErrorZTimeoutException)rh  r   r:   r  r[   r  Z
last_errorr  Zcontext_limit_adjustmentsr  r  Zpayloadr  r  errorr  rX   Znext_max_tokensra   Zcallback_errorr   r   r   
_post_pageA  s|  












	







 




z'SiliconFlowDeepSeekOCRClient._post_pagerv   r   r   r*   c              
   C  s  t ||| j| j| j| jd\}}}t }| ||| j}t|}	d}
|		 s>| j
r>| j
| jkr>| ||| j
}t|}	d}
t|	\}}t||||}| jr| jrt|||| jdrz(| ||| j}t|}t|\}}t|	 t|	 kr|}|}	|}d}
W n ty } ztd| dd W Y d }~nd }~ww t|tr|d	nd }|d u ri }t|}tt | d
|d< ||d< |
|d< t||||||	|dS )N)r   r   r   r   Zprimaryfallback)r  Z
transcribez6DeepSeek-OCR full-page transcription fallback failed: Tr  r6   r   elapsed_secr&   prompt_variant)r+   r,   r/   r0   r2   r4   r6   )r   r   r  r   r  r  r  r  ra  rL   r  rX  r  r  r  r  r  r|   ro   r  r  r=   r_   roundr*   )rh  rv   r   r:   r   r0   Z
image_metastartedrY  r4   Zused_promptr,   r2   r/   Ztranscribe_dataZtranscribed_rawZtranscribed_markdownrk  r  r6   r   r   r   	_ocr_page  sv   
z&SiliconFlowDeepSeekOCRClient._ocr_pager8   c                  s0  t   }|pj|\}zt}|dkr td jdkr+t|j}td d| dj d dd d g| }t	t
d	jd
t  fddt|D }zLt|D ]E}|| }	z$| ||	< td|	d	  d| dt||	 r||	 jnd dd W q` ty   td|	d	  d| ddd t   w W n ty   |D ]}
|
  q w W d    n1 sw   Y  dd |D }ddt|tt   | djd}t||dW |d urz|jdd W S  ty   Y S w S |d urz|jdd W w  ty   Y w w w )Nr   zPDF has no pages: zDeepSeek-OCR start: file=z, pages=z
, workers=z, model=Tr  r   )Zmax_workersc                   s    i | ]}  j||qS r   )Zsubmitr  )r   r   ZexecutorZ
model_namerv   rh  r   r   
<dictcomp>'  s    z<SiliconFlowDeepSeekOCRClient.process_ocr.<locals>.<dictcomp>zDeepSeek-OCR page r  z done, chars=r3   z failedc                 S  s   g | ]}|d ur|qS r   r   r   r   r   r   r   r   >  s    z<SiliconFlowDeepSeekOCRClient.process_ocr.<locals>.<listcomp>ZsiliconflowZdeepseek_ocrr   )providerZengine
page_countr  r~  )r9   r:   r<   rj   )r  r:   r  r~   r   r  r   r  r~  r   r`   ranger   r   r|   r,   ro   	tracebackZ	print_excZcancelr  r8   rr   )rh  r:   rj  r  Zcleanup_pathr  r9   ZfuturesZfuturer   pendingnormalized_pagesr<   r   r  r   rm    s   




z(SiliconFlowDeepSeekOCRClient.process_ocr)6r  r   r{  r   r|  r$   r:   r   r}  r   r~  r   r  r  r  rE   r  r   r  rE   r  rE   r  r   r  r   r  r  r  r   r  r   r  r$   r  r$   r  rE   r  r   r  r   r  r   r   r   r  r   r   r   r  r   rF   re  rR   )rW   r  rX   r   rF   rE   )rF   r  )rF   re  )rF   rd  )rk  r   rF   re  )rj  r;   rF   r  )
r   rf   r:   r   r  r   r  r   rF   r;   )r  r   rF   r  )r  r   rF   r   )r   rf   r:   r   r  r   rF   r;   )rv   r   r   r   r:   r   rF   r*   )r:   r$   rj  r;   rF   r8   )r   r   r   r?   r>   r@   ri  r  r  rp   r  r  r  r  r  r  r  r  r  rm  r   r   r   r   rA     sR    ?






	
%


 
9rA   )save_imagesr/  r  r{  rc  r  r  r{  rc  &Optional[SiliconFlowDeepSeekOCRClient]Optional[Tuple[Any, ...]]c                C  s  |pt dVi dt|d|ptdtdddtddd	td
ddtdtdddtdtdddtdddtdtdddtdddtdddtdd d!td"dd#td$td%d&d'td(td)td*t	d+td,t
d-td.dd/tdtd0d1d2td3d4d5td6dd7td8td9d:d;td<d=d>td?td@tdAdBdCtdDtdEdF}|jj|jdG| dHdI}g }g }	|jD ]0}
|
jpdJ}| r|| |	|
jd |ddKdL |
jD |
jr|
jjnd |
jdM|jdN qdOdPdQ |D }g }|r:t| |	|dR|  dS\}	}dTdL |	D }dOdUdQ |D }| s5d S ||	|fS | sAd S ||	fS )WNr  r{  Z!SILICONFLOW_DEEPSEEK_OCR_BASE_URLZSILICONFLOW_BASE_URLro  r:   ZSILICONFLOW_DEEPSEEK_OCR_MODELrp  r}  Z$SILICONFLOW_DEEPSEEK_OCR_TIMEOUT_SECrq  r~  r   Z%SILICONFLOW_DEEPSEEK_OCR_PAGE_WORKERSr   r  Z+SILICONFLOW_DEEPSEEK_OCR_CLIENT_CONCURRENCYr  Z"SILICONFLOW_DEEPSEEK_OCR_TRUST_ENVFr  r   Z SILICONFLOW_DEEPSEEK_OCR_RETRIESrr  r  Z*SILICONFLOW_DEEPSEEK_OCR_429_RETRY_FOREVERTr  Z&SILICONFLOW_DEEPSEEK_OCR_RETRY_FOREVERr  Z-SILICONFLOW_DEEPSEEK_OCR_429_INITIAL_WAIT_SECrs  r  Z)SILICONFLOW_DEEPSEEK_OCR_429_MAX_WAIT_SECr  r  Z#SILICONFLOW_DEEPSEEK_OCR_MAX_TOKENSrt  r  ZSILICONFLOW_DEEPSEEK_OCR_PROMPTr  Z(SILICONFLOW_DEEPSEEK_OCR_FALLBACK_PROMPTr  Z*SILICONFLOW_DEEPSEEK_OCR_TRANSCRIBE_PROMPTr  Z-SILICONFLOW_DEEPSEEK_OCR_FULL_PAGE_TRANSCRIBEr  Z7SILICONFLOW_DEEPSEEK_OCR_FULL_PAGE_TRANSCRIBE_MAX_CHARSru  r  Z%SILICONFLOW_DEEPSEEK_OCR_IMAGE_DETAILrv  r  Z"SILICONFLOW_DEEPSEEK_OCR_MAX_PAGESr   r%  ZSILICONFLOW_DEEPSEEK_OCR_DPIrw  r  Z'SILICONFLOW_DEEPSEEK_OCR_MAX_IMAGE_EDGErx  r   r  r  Z%SILICONFLOW_DEEPSEEK_OCR_JPEG_QUALITYry  r  r  Z(SILICONFLOW_DEEPSEEK_OCR_MAX_IMAGE_BYTESrz  r  )r'   r  rl  r3   c                 S  r   r   )r  )r   r&   r   r   r   r     r   z4run_siliconflow_deepseek_pdf_ocr.<locals>.<listcomp>Zsiliconflow_deepseek_ocr)r"  r,   r2  r/   r0   r2   r  r:   r6  c                 s      | ]	}|  r|V  qd S r   rL   r   partr   r   r   r         z3run_siliconflow_deepseek_pdf_ocr.<locals>.<genexpr>zattachment for r-  c                 S  s   g | ]}t |d pdqS )r,   r3   )r   r_   r  r   r   r   r     s    c                 s  r  r   r  r  r   r   r   r     r  r   )rA   rd   rJ   rK   rV   r`   rT   rP   r?   r>   r@   r   r  rn  r:   r9   r,   rL   r   r+   r/   r0   __dict__r2   r>  rF  )rv   r  r/  r  r{  rc  Z
ocr_clientr  Zmarkdown_partsr  r   r,   Zmarkdown_resultrC  r   r   r   rB   O  s   	

	






 
!"
#$%'






rB   )F)rC   r   rD   rE   rF   rE   )rC   r   rD   r   rF   r   )rC   r   rD   r   rF   r   )rW   r   rX   r   rF   rE   )r[   r\   rF   r]   r   )rc   r$   rF   r   )re   rf   rF   r   )rv   r   rF   r   )r&   r   r   r   rF   r   )r&   r   r   r   rF   rf   )rv   r   r   r   r   r   r   r   r   r   r   r   rF   r   )r   r   r   r   rF   r   )r   r   r   r   r0   r   rF   r   )r   r   r   r   r0   r   rF   r   )r   r   r   r   rF   r   )r   r   rF   r   )r   r   rF   r   )r}   r   r   r   r   r   rF   r   )r   r   rF   rE   )r}   r   r   r   r   r   r   r   r0   r   r   r   r   r   rF   rE   )
rv   r   r   r   r0   r   r2   r1   rF   r  )r&   r   rF   r;   )r  r   r  r   rF   r   )
r,   r   r/   r  r0   r   r  r   rF   rE   )rF   r   )rF   r  )rv   r   r"  r   r&   r   r#  r   r$  r5   rF   rE   )
rv   r   r9   r1   r/  r$   r.  r   rF   r0  )rG  r   rF   rH  )re   r   rF   rM  )rY  r;   rF   r   )rv   r   r  rE   r/  r$   r  r$   r{  r$   rc  r  rF   r  )R__doc__Z
__future__r   r?  r7  r   rI  rJ   rV  rk   r  r  r  Zconcurrent.futuresr   r   Zdataclassesr   r   Zpathlibr   typingr   r	   r
   r   r   r   rx   r  ZPILr   r   RuntimeErrorr   r   r   r*   r8   __all__rP   rT   rV   rY   rZ   rb   rd   ru   r?   r>   r@   r
  r~   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r!  r,  rF  compileSrP  rL  rX  ra  rb  rA   rB   r   r   r   r   <module>   s    














0


$

	

/
<
h




	: 


   J