o
    bsg                     @   s  d dl mZ d dlZd dlZd dlmZ d dlmZ ejdej	fddZ
d#dejd	ed
eejejf fddZejdej	fddZd#dejdejd	ed
ejfddZdd dD Zejeddgdejdej	dej	dej	dej	dej	f
ddZdejdejdejd ejfd!d"ZdS )$    )TupleNConfig
BLOCK_SIZEc           	      C   s   t jdd}|| t d| }t | | t j}t t |d }|| }||jj	}t 
|| | t 
|| | d S )Nr   axisg      |@)tl
program_idarangeloadtofloat32maxabsdtype
element_tystore)	x_ptry_ptrs_ptrr   pidoffsxsy r   H/root/paddlejob/workspace/env_run/models/DeepSeek-V3/inference/kernel.pyact_quant_kernel	   s   r      r   
block_sizereturnc                    s      sJ  d| dksJ tj tjd} jg   d d  d| R dtji} fdd}t|  |||d ||fS )Nr   r   r   c                    s   t   | d fS Nr   )tritoncdivnumelmetar   r   r   <lambda>   s    zact_quant.<locals>.<lambda>r   )is_contiguoussizetorch
empty_likefloat8_e4m3fn	new_emptyr   r   )r   r   r   r   gridr   r)   r   	act_quant   s   2r3   c                 C   s   t jdd}t jdd}t ||}|| t d| }	|| t d| }
|	d d d f | |
d d d f  }|	d d d f |k |
d d d f |k @ }t j| | |dt j}t |||  | }|| }t j|| ||d d S )Nr   r      mask)r   r	   r%   r
   r   r   r   r   )r   r   r   MNr   pid_mpid_nnoffs_moffs_nr   r6   r   r   r   r   r   r   weight_dequant_kernel   s   $(r>   r   c                    sx   |   r|  s
J |  dkr| dksJ |  \ tj| t d} fdd}t| | || |d |S )N   r"   c                    s    t  | d t | d fS r#   r$   r%   r'   r7   r8   r   r   r*   3        z weight_dequant.<locals>.<lambda>r+   )r,   dimr-   r.   r/   get_default_dtyper>   )r   r   r   r   r2   r   rA   r   weight_dequant.   s   rE   c              	   C   s6   g | ]}d D ]}dD ]}t ||dd|ddq
qqS ))    @   r   )            r   )BLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_K   )
num_stagesZ	num_warpsr   ).0Zblock_mZblock_nrP   r   r   r   
<listcomp>8   s    
rR   )   rF   rG   r8   K)ZconfigskeyrL   rM   rN   c                 C   s<  t jdd}t jdd}t ||
}|| t d| | }||	 t d|	 | }t d|
}| |d d d f |  |d d d f  }||d d d f |  |d d d f  }|||  }|||
 |  }t j||	ft jd}t|D ]\}t j||d d d f |||
  k dd}t j||d d d f |||
  k dd}t |}t |}|t |||d d d f  |d d d f  7 }||
7 }||
7 }|d7 }|d7 }qt|	|j
j}|| t d| }||	 t d|	 }||d d d f |  |d d d f  }|d d d f |k |d d d f |k @ }t j|||d d S )Nr   r   r4   r"   g        )r6   otherr5   )r   r	   r%   r
   zerosr   ranger   dotr   r   r   r   )Za_ptrZb_ptrZc_ptrZa_s_ptrZb_s_ptrr7   r8   rT   rL   rM   rN   r9   r:   kr<   r=   Zoffs_kZa_ptrsZb_ptrsZa_s_ptrsZb_s_ptrsZaccumulatoriaba_sb_scZc_ptrsr6   r   r   r   fp8_gemm_kernel=   s6   ((((

0
((ra   r\   r^   r]   r_   c              	      s   |   r|  s
J |  r|  sJ | d}|  |  |d| jg |  d d R dt i} fdd}t| | |||| | |S )Nr!   r   r   c                    s    t  | d t | d fS )NrL   rM   r@   )ZMETArA   r   r   r*   j   rB   zfp8_gemm.<locals>.<lambda>)r,   r-   r&   r1   r.   rD   ra   )r\   r^   r]   r_   rT   r`   r2   r   rA   r   fp8_gemmc   s   

*rb   )r   )typingr   r.   r$   Ztriton.languagelanguager   r   jitZ	constexprr   Tensorintr3   r>   rE   Zfp8_gemm_configsZautotunera   rb   r   r   r   r   <module>   s8    &
"
&$