From c9b19831b3dbe32c157b0a68eae96d8b544c26e0 Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 3 Dec 2022 10:28:10 +0100 Subject: [PATCH 01/12] [sw/lib] add r4-type CFU instruction primitive --- sw/lib/include/neorv32_cpu_cfu.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sw/lib/include/neorv32_cpu_cfu.h b/sw/lib/include/neorv32_cpu_cfu.h index d51af2324..69abd5a49 100644 --- a/sw/lib/include/neorv32_cpu_cfu.h +++ b/sw/lib/include/neorv32_cpu_cfu.h @@ -51,12 +51,15 @@ int neorv32_cpu_cfu_available(void); /**@{*/ /** R3-type CFU custom instruction prototype */ #define neorv32_cfu_r3_instr(funct7, funct3, rs1, rs2) CUSTOM_INSTR_R3_TYPE(funct7, rs2, rs1, funct3, RISCV_OPCODE_CUSTOM0) +/** R4-type CFU custom instruction prototype */ +#define neorv32_cfu_r4_instr(funct3, rs1, rs2, rs3) CUSTOM_INSTR_R4_TYPE(rs3, rs2, rs1, funct3, RISCV_OPCODE_CUSTOM1) /**@}*/ /**********************************************************************//** - * @name Backward-compatibility layer (before version v1.7.8.x) - * do not use for new designs! + * @name Backward-compatibility layer (before version v1.7.8.2) + * @note DO NOT USE FOR NEW DESIGNS! + * @note THESE WRAPPERS WILL BE REMOVED IN THE FUTURE! **************************************************************************/ /**@{*/ /** R3-type CFU custom instruction 0 (funct3 = 000) */ From d7d356c0597b85125e7e78c3a7be8842d44e337e Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 3 Dec 2022 10:29:08 +0100 Subject: [PATCH 02/12] [docs/figures] fix/update r-type instructions --- docs/figures/cfu_r2type_instruction.png | Bin 3599 -> 0 bytes docs/figures/cfu_r3type_instruction.png | Bin 0 -> 3781 bytes docs/figures/cfu_r4type_instruction.png | Bin 0 -> 3960 bytes 3 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 docs/figures/cfu_r2type_instruction.png create mode 100644 docs/figures/cfu_r3type_instruction.png create mode 100644 docs/figures/cfu_r4type_instruction.png diff --git a/docs/figures/cfu_r2type_instruction.png b/docs/figures/cfu_r2type_instruction.png deleted file mode 100644 index 4c397d964384401aef51682d665eb849de96c370..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3599 zcmZ9Pc{mhWAIFVBDq|NOmWGQ#b_Tv`EjcIn$4?g#QtCJn0*gXfeh7X5gp06adwomoP zzhyA=JHggE60$*4UPKaf^b`&oWWsv?^l-IbUAw{SU5d zoVr$vm7|fxq{NkI%BY!r{`%L-k6Sw2O2v6Ngr96-@kE$EUT5YjVNpxR zueJYU_g4-axqk@MSr2F5`*|*`Y2OiA7L%8q$mp6p_HtG3u=m)nqKk`3J^ICdg}x&Z zSvQX>suobrx|Rf6C)Ea4%*wk*E@l;rpUc@k)7O2}fzT=zWpdjaR1Jfgc{)JmlLgsD z+1>BvSS@ukJ*T%a7FWFayGHTkz3+Ab)Wko0Zd#;yl?qBhLJEv@rsoexgXJfjrlohs zi`wxQOC6~{BJ%GulE2?a@qx5w$FEB@-g%W%8YwpbHr=6qOkTxa$q+v+I4>oJI(;3? zk4(TiXQLHCotZtx8@JiAVP}MuJ&uWkVnh{SQ1DwhV?v=nA?SHP)0f4W207s`&euOv zpVd}I7oLvvKFKqhjeUkqR||ozzDUXh_HR#t*yg>}iaQ+cC>&q+`wdh_`k}1g8gvVE zRX#5NzDZm`2~zxiwFfx)iiUrU$Ajge#%XjInO5OW%NTGvojYSUq6+307XeBgEj)^p z3Zb8#Z>)K6N~@v^Ufms8KsoiKB4H_K&2$JUHSCM{wu63Xv|1i&oocc>PSvK4j=|k; z^_A4%_GaA`^Hp{><)RDfhX%1WJVKL?2xiS2qW-YbQWv^UA}%72(N*WfA+t%U{CcSo z{}s*dvusn{loUp)t0hic4ee{v4cyMS2U`)jxIno+>%zlF z9Zu6@_OJ&0zPRa0oa}Hmc5l;*zHI>`;iH+szaX zT-(h)UsgoOu*0CSt*s(|b!Ou$L@}VAQP?^b;T-X|?afT?y@C6;G@ys~ z9O+R}k&)W|p?y)4A0LaJMrJim%o>GT(ie3l1^)aj|H+sOA_F)24zF4(5wQ*2P4Vo4 z*ja2)CMYzMr3P+^yMt321W_0i&Gzk7J}=oxm4wV9Ifk3IGaNTPs5()G`du!+a~Y1E z_@%j+2>pZpDr{=IynpW?b{>ELs<+2+8BQCkli>vx?H`yMojhtrKzx@45Pwf5s@YU7 zH?jAbXA5LXcL?iMp?-`V933-NOn8J8;E)lFd<6MEFBId2U=)WQatBT_HSFpQ0w>L0 z^or&==lt&X9dTn$EbZF^gF2j6t-fwVsaU zy!P3pnfMiqKtx$B4EjQI29bCjhlD|G)5bgaz{wF+llWZMqppa(T!8pD#+8*0tIeENon>;* zwqpnMO~;-5odbg?zqO4HW|^I~EWC|wwUHTCatD#!O?~1Lr0bN~(G}f9H=NzSi}qeu z!>fdjtz|d$7K+x8!5M8yv_@ww*M6)`zU;i7r1tZIy)HZVUhK;p(jQY}2YhTgLO)Xh zsY7J~Z#?xYg&zuBvieb30f_B=RZYMxr!Ss7GKj6(_ENMi{gOk`Y@925nKV5iy$g$n zP+KgI%Ype_Mf`Q%FNi#&p~C|9Ul#{igk7wuxGj!sM2EIdVeXI_SX*6v`3q5QftZV8 zs1yBf_vLfW6rqRdHpf0n?A;tLe+LM=_?*7*xO>{?R=cDP^WAJ~!tS4OMyhvY79Q4u zR(BU`%^cTLKl3Tdfs-kB{BPCjcKHyeROjkmmJQ$DEaj_vK7~tYtKD*IpfnRX`Wp^z zuAgUUQnWR(FdiV}ev$O=Q@}T+B9Lw53{IJl{1*&;apNS-z$kMdl=(z3l z$#_Um%bUPl0Mb-s+BgoinAow~(11UW_HQQKsC82!DLzZLdrcDM5#N6?aOqSMtN+FK zcYnNG&y2`WJECx;B{SlD3-c#tNC)zaUgDT?nvsmKh*qSU)XrsSt z&a&#pSnYme-^%rCd2^nj`Kwf0iDdQpFrbS$U;&^)i!7HuxZ$BGdmy>&--3w*C#!7{ zmSM$G&CrpG_L;oKL2cwc6fFZRv(bZk)0yp1}5FdHj|wDgD;NgdujN(#B=968K!QU1icMIyDDe29d#(WR z^Y>K|0>gZcXmEZ*_5nB<^NNL~1<(XDMDIhB)~6Rcq{)J7Zr+I5v~YD+yFX=lHj(CBsu%VLN%hP}GzL_^E0{FQyrPT&7`+<{#n+0((!ZVFNO8k&=PW#A`@UfFsKtew%qu9v{YP~xY?A1 z_HjcOznlk&NgTcvDL@TNTjLa9<=OfqR#%x`#Pqkq6NH&yj1lQXTR^Be5dYKg?{*$B zStMu|?A(#P0O~YmLBY!<^`D0fVBL^`5Irx%_5fPZN}goTVnIa;3kom3|CK07!!Rf% zYN!iyaLTPb&p+#|CB>;`+j56OgZMaKj<Jg4lxW-mdW~~Q@H)@;`arxtADBP! zdMgPUg2^U}Gu5sptU=*j~OxFooh+%tQb5Jab@h`X?WocfXlWy1q( zswF}qQ?~E1UMtZD0b5FV`0vQ~@AP-qYBDLirD0m1=daHWD;pm2=K@ZO{WBf@Cn!dr zW4%z$g=!ps3y=L+ejvY%`Q(m)-9D{40>xab-YUDf8Ck=F&XG0_&s4uA;kPT`)wb{K z?_OFsea}4{{bkwYRi-(6Krd0z6-FFYOf^E=z`3J&ql;%~1+Tx^Ej}kh#9xbVs=Q1Z zI?{PXxm?5?DzmLadpbA%-fYQRS?0jFQ_z}=B?vAXoti$B^7{Kl$5MD`na>4yKq)^P=Kbn8&7jFgUWzkghsy9YP#Gcd0VdX>GPRkAC zRb^k4pCXU=dE?xW&YV~w-{q3UQrV~NjaCm%w4F)GsM~-?JGv9Yo8HjoKsRu)NR1A=Z=@(-bxy@U!Mc?W`v&vR0EJx4ZTJo_2s1CT2<62ffl^K zHm6Y5bhSp;X^5X{`_N{C`ruLJWpwBbNIaIUC_Sn?fOPP z4@pSF1DstiHzy$$?_PYyuWVYDo?tKi4wG}aNE+faEI|k&gg}a<$Gdfq>b^Q!Dml`% zyuPQx%4?#wJie=ks6JVhmY^KaH@=nN2p+CgfXpHyMG}(Zr1MCdbXAlsnolUhA_m7? z&E~Qd*cK!PksR9XHy5Nl(8B|m=Y{C!IXJG?z%MpixKE?aCF7N=*Aymv&_Uj)o^()* z>^4n)Xq_8AX8N?{DMzW`w4XuMc+&!6)O_QqnBMuIGer9%HGSM44O)&i5&=IoRs%)!aISUlh;dt#j(UvFMTw&(OZ#T{6Eafnov= zhsL*vAxCUpgvi>3w-?saV`>9Sz)?i_d|O6f6JKUhui5+En5#-c|NC+w&NPpOi{IQs sfRjUKZ65v4m20vH;8ahD&-NjJVWam5a8?+^`slHl8d(~a8r**P51^FT`~Uy| diff --git a/docs/figures/cfu_r3type_instruction.png b/docs/figures/cfu_r3type_instruction.png new file mode 100644 index 0000000000000000000000000000000000000000..8d0cd9d9647d6c63b0b78accfcb22d222265625e GIT binary patch literal 3781 zcmb7Hc{mhY-<}zyC>ln}lBuYX*<6n)h|-|C^md^##=0w!cw*3h*qV;7qfT_~paE zQt~Z_L8B}C8IqiSr7$P}xE}!k9mWB8e>c5OF81dIzTEepz~6m!N@yUvHrkfF*Y{mp z-8aoBi;3hcKacAx4)V+GT|zm^%z1PL-?g{tshhn!={)< zwMcpRHehx_it`Owm9DN2aBX8lEn4ShwJN<$1u6HYO3;KoQqBB)8C*Id6pkEP4pC<& zTEB{&R&G45-NshMleF&p1AUc*__2egbUkbb@#cm(CT831UVR$t%wC&7u-!^VVd35S ztQ&p>Xsw#`8-fZ-atxSv(ek^yn2Fm2%a#qA@=X;d-FefDMA1)nkzG_nCl4VC1ZwP> zE;AJ;0y+YE;o4mdbU@zuUgY>3d-K%McQv${l_pAy@B!}g=Y{qH&SqUG~ZP2xpE;^r{=~?^2oxr=j;?(p2A={=Za$rU1 zphCDjUj(e-sar;n0$-DK>*Zgg`*d&RnU}x3dK&^wDKy8qr)9WPPF5t+(y2S2GDTS# z=9&#@p68N_()|tO7EJ_ScP;TI0E)^z@LpmvD*Rbv^^$0mXjy^iYPgNM3 z(K%51Hk~(#({ttUkO$09|Gy065)QxL9LFmnYWct1Tnqq_K)GuckrhKl@<7Cp*(_i=Sat1nOMO-}09{AGB*0m24bH{w{@L)p97dK{??>V#@?qo%yhVa; zqmV$HU-`wB>+viLUkoRE%hrH_R^*GwJEUwr1|f;@U4h7Tlsp2BOOazs6UKuRxJ%=2 zHU(x!njY-cH1se8iWoNg&1C_!jAnDr$#IU$Y4A1a`Moke-n%q;9Ia0V7^DIFMM zAB7sewOY1?{6$}$Y+Z9`gyuDVB8Ff)9VK;lmUat|AP<tnX6Z)v%h<7{eNd$5`hl}tD0Gwy~g^+09J8<5EKg(ppiGdk2Wu<^0$8?q}js1%M5>}1_<(3r+U_ME3Uq!2g;b7eE6tlM#5SQifm*%(9HvT2Mwd~m1NIr z(MM%G&jp2*ol<((@e>}lYu((6&Hiy~!T{JkM1I1gr51H7?B0*2J*)xctcTQ9AGSYv zn$g`JK*a|Gv(~o@e;JjdFS8VKG8kyKaK$(XH@A<9HI_l_k5`hTbaZ zm$Vb|8qP)`DS*ngrd7X_f&y_r>{ogi3biohFajSnU>IHQ`|Feq&kRkZ5#GBRbARzNwKKNSy%DT7fwj9KZA#FYh}8?)wOM5B~(>w0Rw zOP~ZpY0LVDKS4}+-{h7xA(FWMwdto4;e*SUz=fTF+wcsC+LBLA@t(i{J)2 zBkQP=P$#V`YT)FRsY;0DMVEsA<|&vb2NoD34S_PiY81kY<%;Aoj&W_h|IsKNj)dj!+#0fex9gfd9`>1Tb6} z;iQP474+bA^sm|8{n}5fweDJT2d_{ShnHMi(g*)Ymchp}9LvbjVwHODR*)SfI|X`r zY{{_^I7U$ z0;&RBll@#XHJEo)C$p?85ZBnL;$se~NsV;)<7McA&sLYo zMSW62TU4%wwn=;sED%l8Nh5qK8&o7uWO>>ryUt~pf~9l0nK^x`9nZs$Z_$@C)ZsdJ zXm2+B$p8zbte`N49X7U|&4fT(i5H-}yC=-G)7C)abIaF(*W(RFwmcD+*D_--jOfis zMruqGXb($_vnCWsLyIB9+lWL~>h`DyJR*;ue1Bn(ng&N0F?8TtZk%?hIr1)zy_AtY zqm98*Hr4ndB=GM1!2MTv-5!;Zux|hs&k2d4a)0Pc+ODu+h)$gXmzt(?CP3$}C_MfAxz) zss5S_$YSNZT#)Kh#B!^2hNuu@A@>?4u@B;PN^a1?fPx#(Wi=N7WF=)M-#_3+2EcQXWWbd+0j<*Ie z2VaOTdOg9(nT>m+mfoz69G2PkxFICEPFG>3frgRqU`? zzG2c|HF|~V8H%sU)9<*m9O%1u7LF9&t8}DH&WcI3JbCTh5rCEb;brvw@qaV`Dhr%I yNOj+fE(K=+N|v(K{;LCsR{*~K87IISdQ!}y$H&kU6Ws49;HsgS!Mi`5!v6&|5H@K5 literal 0 HcmV?d00001 diff --git a/docs/figures/cfu_r4type_instruction.png b/docs/figures/cfu_r4type_instruction.png new file mode 100644 index 0000000000000000000000000000000000000000..a20fcf5109f585aa1edd05e5236dcb25db50bf53 GIT binary patch literal 3960 zcmZvf2{aVk`^Qbgpo}HuX9<(6A!Oe&7<-6%iy?_HLX#!SSfeb_Ad<#V7?EYjS{T`v z7z`OpWyv06tTUE|pZC1y{NMNepWivpx#v00dG5LQx%b@9_qhqztW3Zs1y3?DF@eoY zVYW<6K*+JL!3H`WS%k4bCMGUDGnoF3U?)oU-Hdrj&ZpWdnG6LVvXXz87Eb2G(n&M^ zM2F8WYUKulo>!YHPmjz<6Wj|s~pDgdP zRckC=c-HuIp~8BIgD7=_K=Zuklo2ktVVxDq_K}i{F(Auc%a3&UVY-$&$R|_$bpGM< zB+>*z?Zeq4`2BAF4iEbq7}ebD0V3@M-_Rp)Luy8jD`LUFisF6S^H)dw`N{`8S#_$< z`t8kF*T#_^&D0HDGE6AJ?ah^#lODb&0i(J%{KpJHm(+L?m(BpJGFkJU7VPn#ZACej zblP&HK$TFkKXfAklez)Kn0KvW#ZnooVs}{Vb&BgeJaNCYh z_O}NYVTgk1QC7nUR_4~m4z}?pWxUJ6{HbBkuID-_8Jg8toD%8Nu$48`1v#FQYGCgd zRKWQwpFrn?B~G>h()n!<NH@%R5s5I zSm{hW6M+y@xy+M!R-wF=to>0h4e#H5na@wbB|k(^#mo3=^tyX0Q|n|Ny;SXG&xUJI z-qlN>B6N|kUxo2zuY`t)l+BBM>!Y7Z zHD5eBo@nl!Lq}k?Ui#LoAl*17^QE|7XLU~u4`c>?H*Z#bqXIV7Wkz2UUqi;XZ+arWscZBmk$!(Cw8L;bRu+PrC9PbDf|}US-okIq@qJ5iFMl+s`bc=?K^OnrlX)+={8-d9`ykwJ!+Etk z0jqf8;~xQ5Mh8 z>nUKiPMO5BUe0_H)Pxvtw=TRb)m{3YfJ+oh{!GT5erTTuw=*z$xf#Sfo8Iv()aifZ zofk?;^os(sNva8PCtc^0T??^KZ+QdUis?|Z;WhX8@6myTaLq5>+6A3UY|~*P|D)l z!{3b?bw*#tjC2PyD*$^{l9$WlP8UdaW*NAoz8k~mjwLhaM}9sx^#1mS(4=F?@;kJRA`m~8f=%|p&e ztOO>ZBl4?Q#C4&C)yyk9hs1aNJXFgg;`j0{+I=VrDT;gd*0G%GGX>wqmnFH)hZ_Q; z$?ZuyxkN4%(OR-}BKME@9Sdbt$tSVB|UifjA<^&ULeQ~=PY`BKvC z(>KflmoRm!C9)fNTSL43+-?0J96mdh>s>Yc&X1^S1IYT9T!_EEp~kA6PKH=a=UW*R z@*6yO^&%9hlsBo@<1y!J4jc-dwK?=5LzD2t$#-_-VR>HO z{y=OlJX@@~sB3WF12`m&^Hi*^P)(#+I;h~6#GE&hJJ%&WC}a7VgIHoWtYPo|>(8D@ zl6C_b*#y{l0R44tR*3t@GedfO--^Bi(8gU$qLa+Cs@&cTnj0{QJ?hDEI6SRYDJ?Fv zTt9kZlEV62ww~?JdtG(+65vYk3`9H++qs|6JL~l1d6$NEI7I135uI0Ah+C6)4>ZSh z;huaCY@inET77&H%!47GVIJ|1!E8b)p8ca{P1-Z6iKK6Q^8BQ$a)%`T#&M&4Lzha$ zL~Lj^@`>g4xkknXr4N7)Hh1D7ib=yOp*FUSCq|1+~{dJDj@)JA{PRTQh&~{aRz+wtQz}>X4_@wiLxu(vFP~}m&>~2bZV2kbwlq*N;!IsXmTAQHoLKHNf?hRof|G>Wf+-;mz_MLAh?bFj|*TaBf$+X*vLV_vO;#oa^GBontHowL6ggWGd-Dirf z;rU+fwz?_=b@e^nX%dO<1Tp6i{6!K~#l#o%_7>2bNZE|3@|j8wrRHeT`kT*8R%iCu zD*!Dj?!ai@FD@Or3jZT71>9|$;)(M(+nK{EJCmz?t%TTH(Ftvz`x z{xmE$kI%!i1q36|o_o#L@Syei8=GUwur)HZOVmdy0W`Fe`q*_lsJEc?^&7 z>cI97vlr~bfYD{T_+z-UIp6G)N*cy%2{B|3W|Ey}>=0&uOVMBQ<9b35gIz+$8T`Md zG^JVZSilqoOrx0pX|>1wL?u|2RSa+%@(079CohYqn<76Y4nIPPTW-AX_+`QPkX zSs@E-UlLs1wOtzBS@<2vyEM<(qUIbpWF_b92`p>-qp@~Wovl3l(;9SY2toOdn4SE< z6z{K1agaJ1D$H_?tzLapfjH$byn2{fHGcL`;0}oJ4NRZYox!haX-^1$68&_`c>jvS zr=4-iV3=FrB;5GTx5mZ9%d*5-CDp^sgaEd@o-3hk5DD8)lP9kqcnf$=B1~uV6AT-W z+|KN-l<=cwUq)uqd-aZaa4dWIe31kpn$#;#jwm@@-l4)0}*@O4@8{%Qu6 z;~r{+)Zlv;t3N(wnuvnzcvl~pKk~WBs@p}$mctHfPNMC z4D3rm##M~HOAn52la*$4W>UxsQVe)KB23h;VRa7t+)4kX32{ts2H+-~=j%`vbver^ z$~oy4rpIlf>Q66nriBb-S^5VR#juE*nKV{7dtznB$&bc7_)zB}z_BW!-+eFoO0K57 z$LsQ7r$~aajhExfH+fZXe6FE(1Os$;~({k9U;hVNqJKyOsv1i+Wa!P#Z>iX{b8lb`8JEt$Hqb`K`Cv)B~>cc z?3}32YOI5e#x5eN3*E0zJ-7Zj=Xd)$za!{|hQQ;><7FdO?pkHxPU@)Wra)t{l(Wr1 z8&Rc#t0+3Bw+VDX94#ItB38pHvi(+)QZ})I@qavbVA_(D_9CEqGmE0_)WpM_(db0D z^`+3J?L*})3oWsv@7@o*pyqM$5$8b5coNRRj88px8FA`yJaBmpJFkDn`lTGIN^J%+ zyr~Wlr<(+pSjNsnE{@nFQXTHUGv12pLDyNoz{^gUg7pUqRk&0ZLu)d@-4=h;#lyL% zd$Ql$PZcM<7cMs%>3rxnOn=e$K9+&%r!7o9kbKXq4BC1`SXoa-X}N!WTGw@Y@GH9~ zS7~`&92q4_R$-q{(P)SaC?f;~W;T_Dn5^7pc)5libeZ}m5O5^{dhl5scBJ1mV{}5( z0^H+0pTE&{Gx;-NORGg@ozT_IzJX*?>-PRAFfg5eFEqyG!%4VQBO literal 0 HcmV?d00001 From 94de91e911297447e3ecb927af1b0029825f8645 Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 3 Dec 2022 10:29:41 +0100 Subject: [PATCH 03/12] [README] add CFU R-type instructions --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 32475f7f0..3630bcd0e 100644 --- a/README.md +++ b/README.md @@ -140,7 +140,7 @@ and *Privileged Architecture Specification* ([pdf](https://github.com/stnolting/ * implements **all** standard RISC-V exceptions and interrupts (including MTI, MEI & MSI) * 16 fast interrupt request channels as NEORV32-specific extension * custom functions unit ([CFU](https://stnolting.github.io/neorv32/#_custom_functions_unit_cfu) as `Zxcfu` ISA extension) -for up to 1024 _custom RISC-V instructions_ +for up to 1024 R3-type and up to 8 R4-type _custom RISC-V instructions_ * _intrinsic_ libraries for the `B` and `Zfinx` extensions **Memory** From f94a4b662facaa4e9111b09989bec6e7bfeadad8 Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 3 Dec 2022 10:52:51 +0100 Subject: [PATCH 04/12] [CHANGELOG] add v1.7.8.2 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b95921bce..6bb792dfd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ mimpid = 0x01040312 => Version 01.04.03.12 => v1.4.3.12 | Date (*dd.mm.yyyy*) | Version | Comment | |:-------------------:|:-------:|:--------| +| 03.12.2022 | 1.7.8.2 | :sparkles: new option to add custom R4-type RISC-V instructions to **CFU**; rework CFU hardware module, intrinsic library and example program; [#449](https://github.com/stnolting/neorv32/pull/449) | | 01.12.2022 | 1.7.8.1 | package cleanup; [#447](https://github.com/stnolting/neorv32/pull/447) | | 28.11.2022 | [**:rocket:1.7.8**](https://github.com/stnolting/neorv32/releases/tag/v1.7.8) | **New release** | | 14.11.2022 | 1.7.7.9 | minor rtl edits and code optimizations; [#442](https://github.com/stnolting/neorv32/pull/442) | From 9a06cf3c9012789a8f8c44f9053e794df2756725 Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 3 Dec 2022 10:53:09 +0100 Subject: [PATCH 05/12] [docs] update Zxcfu section --- docs/datasheet/cpu.adoc | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/docs/datasheet/cpu.adoc b/docs/datasheet/cpu.adoc index 47987c477..c30a6042e 100644 --- a/docs/datasheet/cpu.adoc +++ b/docs/datasheet/cpu.adoc @@ -654,24 +654,26 @@ Any additional flags within the `fence.i` instruction word are ignore by the har ==== **`Zxcfu`** Custom Instructions Extension (CFU) -The `Zxcfu` presents a NEORV32-specific _custom RISC-V_ ISA extension (`Z` = sub-extension, `x` = platform-specific +The `Zxcfu` presents a NEORV32-specific extension to the RISC-V ISA (`Z` = sub-extension, `x` = platform-specific custom extension, `cfu` = name of the custom extension). When enabled via the <<_cpu_extension_riscv_zxcfu>> configuration generic, this ISA extensions adds the <<_custom_functions_unit_cfu>> to the CPU core. The CFU is a module that allows to add **custom RISC-V instructions** to the processor core. -The CPU is implemented as ALU co-processor and is integrated right into the CPU's pipeline providing minimal data -transfer latency as it has direct access to the core's register file. Up to 1024 custom instructions can be -implemented within the CFU. These instructions are mapped to an OPCODE space that has been explicitly reserved by +The CPU is implemented as additional ALU co-processor and is integrated right into the CPU's pipeline providing minimal +data transfer latency as it has direct access to the core's register file. The CFU supports **RISC-V R3-type** instructions +as well as **RISC-V R4-type** instructions. Up to 1024 custom R3-type instructions and up to 8 custom R4-type instruction +can be implemented within the CFU. These instructions are mapped to an opcode space that has been explicitly reserved by the RISC-V spec for custom extensions. Software can utilize the custom instructions by using _intrinsic functions_, which are inline assembly functions that -behave like "regular" C functions. +behave like regular C functions. [TIP] -For more information regarding the CFU see section <<_custom_functions_unit_cfu>>. +For more detailed information regarding the CFU, it's hardware and the according software interface +see section <<_custom_functions_unit_cfu>>. [TIP] -The CFU / `Zxcfu` ISA extension is intended for application-specific _instructions_. +The CFU module / `Zxcfu` ISA extension is intended for user-defined **instructions**. If you like to add more complex accelerators or interfaces that can also operate independently of the CPU take a look at the memory-mapped <<_custom_functions_subsystem_cfs>>. From f1650b852e4d1db609aaedffd0ccdd7201e0eaee Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 3 Dec 2022 10:55:05 +0100 Subject: [PATCH 06/12] [rtl] add third register operand infrastructure third register file read port --- rtl/core/neorv32_cpu.vhd | 16 +++- rtl/core/neorv32_cpu_alu.vhd | 3 + rtl/core/neorv32_cpu_regfile.vhd | 14 +++- rtl/core/neorv32_package.vhd | 136 +++++++++++++++++-------------- 4 files changed, 102 insertions(+), 67 deletions(-) diff --git a/rtl/core/neorv32_cpu.vhd b/rtl/core/neorv32_cpu.vhd index ffa8b84ea..bd54d54ef 100644 --- a/rtl/core/neorv32_cpu.vhd +++ b/rtl/core/neorv32_cpu.vhd @@ -120,10 +120,15 @@ architecture neorv32_cpu_rtl of neorv32_cpu is constant XLEN : natural := 32; -- data path width -- ---------------------------------------------------------------------------------------------- + -- local constants -- + constant regfile_rs3_en_c : boolean := CPU_EXTENSION_RISCV_Zxcfu or CPU_EXTENSION_RISCV_Zfinx; -- third register file read port (rs3) + -- local signals -- signal ctrl : std_ulogic_vector(ctrl_width_c-1 downto 0); -- main control bus signal imm : std_ulogic_vector(XLEN-1 downto 0); -- immediate - signal rs1, rs2 : std_ulogic_vector(XLEN-1 downto 0); -- source registers + signal rs1 : std_ulogic_vector(XLEN-1 downto 0); -- source register 1 + signal rs2 : std_ulogic_vector(XLEN-1 downto 0); -- source register 2 + signal rs3 : std_ulogic_vector(XLEN-1 downto 0); -- source register 3 signal alu_res : std_ulogic_vector(XLEN-1 downto 0); -- alu result signal alu_add : std_ulogic_vector(XLEN-1 downto 0); -- alu address result signal alu_cmp : std_ulogic_vector(1 downto 0); -- comparator result @@ -338,8 +343,9 @@ begin -- ------------------------------------------------------------------------------------------- neorv32_cpu_regfile_inst: neorv32_cpu_regfile generic map ( - XLEN => XLEN, -- data path width - CPU_EXTENSION_RISCV_E => CPU_EXTENSION_RISCV_E -- implement embedded RF extension? + XLEN => XLEN, -- data path width + CPU_EXTENSION_RISCV_E => CPU_EXTENSION_RISCV_E, -- implement embedded RF extension? + RS3_EN => regfile_rs3_en_c -- enable third read port ) port map ( -- global control -- @@ -352,7 +358,8 @@ begin pc2_i => next_pc, -- next PC -- data output -- rs1_o => rs1, -- operand 1 - rs2_o => rs2 -- operand 2 + rs2_o => rs2, -- operand 2 + rs3_o => rs3 -- operand 3 ); @@ -379,6 +386,7 @@ begin -- data input -- rs1_i => rs1, -- rf source 1 rs2_i => rs2, -- rf source 2 + rs3_i => rs3, -- rf source 3 pc_i => curr_pc, -- current PC imm_i => imm, -- immediate -- data output -- diff --git a/rtl/core/neorv32_cpu_alu.vhd b/rtl/core/neorv32_cpu_alu.vhd index 60b66a02f..a99f52ddc 100644 --- a/rtl/core/neorv32_cpu_alu.vhd +++ b/rtl/core/neorv32_cpu_alu.vhd @@ -62,6 +62,7 @@ entity neorv32_cpu_alu is -- data input -- rs1_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 1 rs2_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 2 + rs3_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 3 pc_i : in std_ulogic_vector(XLEN-1 downto 0); -- current PC imm_i : in std_ulogic_vector(XLEN-1 downto 0); -- immediate -- data output -- @@ -272,6 +273,7 @@ begin cmp_i => cmp, -- comparator status rs1_i => rs1_i, -- rf source 1 rs2_i => rs2_i, -- rf source 2 + rs3_i => rs3_i, -- rf source 3 -- result and status -- res_o => cp_result(3), -- operation result fflags_o => fpu_flags_o, -- exception flags @@ -304,6 +306,7 @@ begin -- data input -- rs1_i => rs1_i, -- rf source 1 rs2_i => rs2_i, -- rf source 2 + rs3_i => rs3_i, -- rf source 3 -- result and status -- res_o => cp_result(4), -- operation result valid_o => cp_valid(4) -- data output valid diff --git a/rtl/core/neorv32_cpu_regfile.vhd b/rtl/core/neorv32_cpu_regfile.vhd index 32160d87b..5cb38349d 100644 --- a/rtl/core/neorv32_cpu_regfile.vhd +++ b/rtl/core/neorv32_cpu_regfile.vhd @@ -53,7 +53,8 @@ use neorv32.neorv32_package.all; entity neorv32_cpu_regfile is generic ( XLEN : natural; -- data path width - CPU_EXTENSION_RISCV_E : boolean -- implement embedded RF extension? + CPU_EXTENSION_RISCV_E : boolean; -- implement embedded RF extension? + RS3_EN : boolean -- enable third read port ); port ( -- global control -- @@ -66,7 +67,8 @@ entity neorv32_cpu_regfile is pc2_i : in std_ulogic_vector(XLEN-1 downto 0); -- next PC -- data output -- rs1_o : out std_ulogic_vector(XLEN-1 downto 0); -- operand 1 - rs2_o : out std_ulogic_vector(XLEN-1 downto 0) -- operand 2 + rs2_o : out std_ulogic_vector(XLEN-1 downto 0); -- operand 2 + rs3_o : out std_ulogic_vector(XLEN-1 downto 0) -- operand 3 ); end neorv32_cpu_regfile; @@ -84,6 +86,7 @@ architecture neorv32_cpu_regfile_rtl of neorv32_cpu_regfile is signal rd_zero : std_ulogic; -- writing to x0? signal opa_addr : std_ulogic_vector(4 downto 0); -- rs1/dst address signal opb_addr : std_ulogic_vector(4 downto 0); -- rs2 address + signal opc_addr : std_ulogic_vector(4 downto 0); -- rs3 address begin @@ -114,6 +117,9 @@ begin end if; rs1_o <= reg_file(to_integer(unsigned(opa_addr(4 downto 0)))); rs2_o <= reg_file(to_integer(unsigned(opb_addr(4 downto 0)))); + if (RS3_EN = true) then -- implement third read port? + rs3_o <= reg_file(to_integer(unsigned(opc_addr(4 downto 0)))); + end if; end if; end process rf_access; end generate; @@ -129,6 +135,9 @@ begin end if; rs1_o <= reg_file_emb(to_integer(unsigned(opa_addr(3 downto 0)))); rs2_o <= reg_file_emb(to_integer(unsigned(opb_addr(3 downto 0)))); + if (RS3_EN = true) then -- implement third read port? + rs3_o <= reg_file(to_integer(unsigned(opc_addr(3 downto 0)))); + end if; end if; end process rf_access; end generate; @@ -142,6 +151,7 @@ begin ctrl_i(ctrl_rf_rd_adr4_c downto ctrl_rf_rd_adr0_c) when (ctrl_i(ctrl_rf_wb_en_c) = '1') else -- rd ctrl_i(ctrl_rf_rs1_adr4_c downto ctrl_rf_rs1_adr0_c); -- rs1 opb_addr <= ctrl_i(ctrl_rf_rs2_adr4_c downto ctrl_rf_rs2_adr0_c); -- rs2 + opc_addr <= ctrl_i(ctrl_rf_rs3_adr4_c downto ctrl_rf_rs3_adr0_c); -- rs3 end neorv32_cpu_regfile_rtl; diff --git a/rtl/core/neorv32_package.vhd b/rtl/core/neorv32_package.vhd index 5b07230e8..566c5edeb 100644 --- a/rtl/core/neorv32_package.vhd +++ b/rtl/core/neorv32_package.vhd @@ -62,7 +62,7 @@ package neorv32_package is -- Architecture Constants (do not modify!) ------------------------------------------------ -- ------------------------------------------------------------------------------------------- - constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01070801"; -- NEORV32 version - no touchy! + constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01070802"; -- NEORV32 version - no touchy! constant archid_c : natural := 19; -- official RISC-V architecture ID - hands off! -- Check if we're inside the Matrix ------------------------------------------------------- @@ -364,6 +364,8 @@ package neorv32_package is constant instr_rs1_msb_c : natural := 19; -- source register 1 address bit 4 constant instr_rs2_lsb_c : natural := 20; -- source register 2 address bit 0 constant instr_rs2_msb_c : natural := 24; -- source register 2 address bit 4 + constant instr_rs3_lsb_c : natural := 27; -- source register 3 address bit 0 + constant instr_rs3_msb_c : natural := 31; -- source register 3 address bit 4 constant instr_funct7_lsb_c : natural := 25; -- funct7 bit 0 constant instr_funct7_msb_c : natural := 31; -- funct7 bit 6 constant instr_funct12_lsb_c : natural := 20; -- funct12 bit 0 @@ -394,9 +396,11 @@ package neorv32_package is constant opcode_system_c : std_ulogic_vector(6 downto 0) := "1110011"; -- system/csr access (type via funct3) -- floating point operations -- constant opcode_fop_c : std_ulogic_vector(6 downto 0) := "1010011"; -- dual/single operand instruction - -- official "custom0/1" RISC-V opcodes - free for custom instructions -- - constant opcode_cust0_c : std_ulogic_vector(6 downto 0) := "0001011"; -- custom instructions 0 ---constant opcode_cust1_c : std_ulogic_vector(6 downto 0) := "0101011"; -- custom instructions 1 + -- official *custom* RISC-V opcodes - free for custom instructions -- + constant opcode_cust0_c : std_ulogic_vector(6 downto 0) := "0001011"; -- custom-0 + constant opcode_cust1_c : std_ulogic_vector(6 downto 0) := "0101011"; -- custom-1 +--constant opcode_cust2_c : std_ulogic_vector(6 downto 0) := "1011011"; -- custom-2 +--constant opcode_cust3_c : std_ulogic_vector(6 downto 0) := "1111011"; -- custom-3 -- RISC-V Funct3 -------------------------------------------------------------------------- -- ------------------------------------------------------------------------------------------- @@ -756,67 +760,72 @@ package neorv32_package is constant ctrl_rf_rs2_adr2_c : natural := 8; -- source register 2 address bit 2 constant ctrl_rf_rs2_adr3_c : natural := 9; -- source register 2 address bit 3 constant ctrl_rf_rs2_adr4_c : natural := 10; -- source register 2 address bit 4 - constant ctrl_rf_rd_adr0_c : natural := 11; -- destination register address bit 0 - constant ctrl_rf_rd_adr1_c : natural := 12; -- destination register address bit 1 - constant ctrl_rf_rd_adr2_c : natural := 13; -- destination register address bit 2 - constant ctrl_rf_rd_adr3_c : natural := 14; -- destination register address bit 3 - constant ctrl_rf_rd_adr4_c : natural := 15; -- destination register address bit 4 - constant ctrl_rf_mux0_c : natural := 16; -- input source select lsb - constant ctrl_rf_mux1_c : natural := 17; -- input source select msb - constant ctrl_rf_zero_we_c : natural := 18; -- allow/force write access to x0 + constant ctrl_rf_rs3_adr0_c : natural := 11; -- source register 3 address bit 0 + constant ctrl_rf_rs3_adr1_c : natural := 12; -- source register 3 address bit 1 + constant ctrl_rf_rs3_adr2_c : natural := 13; -- source register 3 address bit 2 + constant ctrl_rf_rs3_adr3_c : natural := 14; -- source register 3 address bit 3 + constant ctrl_rf_rs3_adr4_c : natural := 15; -- source register 3 address bit 4 + constant ctrl_rf_rd_adr0_c : natural := 16; -- destination register address bit 0 + constant ctrl_rf_rd_adr1_c : natural := 17; -- destination register address bit 1 + constant ctrl_rf_rd_adr2_c : natural := 18; -- destination register address bit 2 + constant ctrl_rf_rd_adr3_c : natural := 19; -- destination register address bit 3 + constant ctrl_rf_rd_adr4_c : natural := 20; -- destination register address bit 4 + constant ctrl_rf_mux0_c : natural := 21; -- input source select lsb + constant ctrl_rf_mux1_c : natural := 22; -- input source select msb + constant ctrl_rf_zero_we_c : natural := 23; -- allow/force write access to x0 -- alu -- - constant ctrl_alu_op0_c : natural := 19; -- ALU operation select bit 0 - constant ctrl_alu_op1_c : natural := 20; -- ALU operation select bit 1 - constant ctrl_alu_op2_c : natural := 21; -- ALU operation select bit 2 - constant ctrl_alu_opa_mux_c : natural := 22; -- operand A select (0=rs1, 1=PC) - constant ctrl_alu_opb_mux_c : natural := 23; -- operand B select (0=rs2, 1=IMM) - constant ctrl_alu_unsigned_c : natural := 24; -- is unsigned ALU operation - constant ctrl_alu_frm0_c : natural := 25; -- FPU rounding mode bit 0 - constant ctrl_alu_frm1_c : natural := 26; -- FPU rounding mode bit 1 - constant ctrl_alu_frm2_c : natural := 27; -- FPU rounding mode bit 2 + constant ctrl_alu_op0_c : natural := 24; -- ALU operation select bit 0 + constant ctrl_alu_op1_c : natural := 25; -- ALU operation select bit 1 + constant ctrl_alu_op2_c : natural := 26; -- ALU operation select bit 2 + constant ctrl_alu_opa_mux_c : natural := 27; -- operand A select (0=rs1, 1=PC) + constant ctrl_alu_opb_mux_c : natural := 28; -- operand B select (0=rs2, 1=IMM) + constant ctrl_alu_unsigned_c : natural := 29; -- is unsigned ALU operation + constant ctrl_alu_frm0_c : natural := 30; -- FPU rounding mode bit 0 + constant ctrl_alu_frm1_c : natural := 31; -- FPU rounding mode bit 1 + constant ctrl_alu_frm2_c : natural := 32; -- FPU rounding mode bit 2 -- alu co-processor trigger (one-hot selection) -- - constant ctrl_cp_trig0_c : natural := 28; -- trigger CP0 - constant ctrl_cp_trig1_c : natural := 29; -- trigger CP1 - constant ctrl_cp_trig2_c : natural := 30; -- trigger CP2 - constant ctrl_cp_trig3_c : natural := 31; -- trigger CP3 - constant ctrl_cp_trig4_c : natural := 32; -- trigger CP4 - constant ctrl_cp_trig5_c : natural := 33; -- trigger CP5 + constant ctrl_cp_trig0_c : natural := 33; -- trigger CP0 + constant ctrl_cp_trig1_c : natural := 34; -- trigger CP1 + constant ctrl_cp_trig2_c : natural := 35; -- trigger CP2 + constant ctrl_cp_trig3_c : natural := 36; -- trigger CP3 + constant ctrl_cp_trig4_c : natural := 37; -- trigger CP4 + constant ctrl_cp_trig5_c : natural := 38; -- trigger CP5 -- bus interface -- - constant ctrl_bus_req_c : natural := 34; -- trigger memory request - constant ctrl_bus_mo_we_c : natural := 35; -- memory address and data output register write enable - constant ctrl_bus_fence_c : natural := 36; -- fence operation - constant ctrl_bus_fencei_c : natural := 37; -- fence.i operation - constant ctrl_bus_priv_c : natural := 38; -- effective privilege level for load/store + constant ctrl_bus_req_c : natural := 39; -- trigger memory request + constant ctrl_bus_mo_we_c : natural := 40; -- memory address and data output register write enable + constant ctrl_bus_fence_c : natural := 41; -- fence operation + constant ctrl_bus_fencei_c : natural := 42; -- fence.i operation + constant ctrl_bus_priv_c : natural := 43; -- effective privilege level for load/store -- instruction word control blocks -- - constant ctrl_ir_funct3_0_c : natural := 39; -- funct3 bit 0 - constant ctrl_ir_funct3_1_c : natural := 40; -- funct3 bit 1 - constant ctrl_ir_funct3_2_c : natural := 41; -- funct3 bit 2 - constant ctrl_ir_funct12_0_c : natural := 42; -- funct12 bit 0 - constant ctrl_ir_funct12_1_c : natural := 43; -- funct12 bit 1 - constant ctrl_ir_funct12_2_c : natural := 44; -- funct12 bit 2 - constant ctrl_ir_funct12_3_c : natural := 45; -- funct12 bit 3 - constant ctrl_ir_funct12_4_c : natural := 46; -- funct12 bit 4 - constant ctrl_ir_funct12_5_c : natural := 47; -- funct12 bit 5 - constant ctrl_ir_funct12_6_c : natural := 48; -- funct12 bit 6 - constant ctrl_ir_funct12_7_c : natural := 49; -- funct12 bit 7 - constant ctrl_ir_funct12_8_c : natural := 50; -- funct12 bit 8 - constant ctrl_ir_funct12_9_c : natural := 51; -- funct12 bit 9 - constant ctrl_ir_funct12_10_c : natural := 52; -- funct12 bit 10 - constant ctrl_ir_funct12_11_c : natural := 53; -- funct12 bit 11 - constant ctrl_ir_opcode7_0_c : natural := 54; -- opcode7 bit 0 - constant ctrl_ir_opcode7_1_c : natural := 55; -- opcode7 bit 1 - constant ctrl_ir_opcode7_2_c : natural := 56; -- opcode7 bit 2 - constant ctrl_ir_opcode7_3_c : natural := 57; -- opcode7 bit 3 - constant ctrl_ir_opcode7_4_c : natural := 58; -- opcode7 bit 4 - constant ctrl_ir_opcode7_5_c : natural := 59; -- opcode7 bit 5 - constant ctrl_ir_opcode7_6_c : natural := 60; -- opcode7 bit 6 + constant ctrl_ir_funct3_0_c : natural := 44; -- funct3 bit 0 + constant ctrl_ir_funct3_1_c : natural := 45; -- funct3 bit 1 + constant ctrl_ir_funct3_2_c : natural := 46; -- funct3 bit 2 + constant ctrl_ir_funct12_0_c : natural := 47; -- funct12 bit 0 + constant ctrl_ir_funct12_1_c : natural := 48; -- funct12 bit 1 + constant ctrl_ir_funct12_2_c : natural := 49; -- funct12 bit 2 + constant ctrl_ir_funct12_3_c : natural := 50; -- funct12 bit 3 + constant ctrl_ir_funct12_4_c : natural := 51; -- funct12 bit 4 + constant ctrl_ir_funct12_5_c : natural := 52; -- funct12 bit 5 + constant ctrl_ir_funct12_6_c : natural := 53; -- funct12 bit 6 + constant ctrl_ir_funct12_7_c : natural := 54; -- funct12 bit 7 + constant ctrl_ir_funct12_8_c : natural := 55; -- funct12 bit 8 + constant ctrl_ir_funct12_9_c : natural := 56; -- funct12 bit 9 + constant ctrl_ir_funct12_10_c : natural := 57; -- funct12 bit 10 + constant ctrl_ir_funct12_11_c : natural := 58; -- funct12 bit 11 + constant ctrl_ir_opcode7_0_c : natural := 59; -- opcode7 bit 0 + constant ctrl_ir_opcode7_1_c : natural := 60; -- opcode7 bit 1 + constant ctrl_ir_opcode7_2_c : natural := 61; -- opcode7 bit 2 + constant ctrl_ir_opcode7_3_c : natural := 62; -- opcode7 bit 3 + constant ctrl_ir_opcode7_4_c : natural := 63; -- opcode7 bit 4 + constant ctrl_ir_opcode7_5_c : natural := 64; -- opcode7 bit 5 + constant ctrl_ir_opcode7_6_c : natural := 65; -- opcode7 bit 6 -- cpu status -- - constant ctrl_priv_mode_c : natural := 61; -- effective privilege mode - constant ctrl_sleep_c : natural := 62; -- set when CPU is in sleep mode - constant ctrl_trap_c : natural := 63; -- set when CPU is entering trap execution - constant ctrl_debug_running_c : natural := 64; -- set when CPU is in debug mode + constant ctrl_priv_mode_c : natural := 66; -- effective privilege mode + constant ctrl_sleep_c : natural := 67; -- set when CPU is in sleep mode + constant ctrl_trap_c : natural := 68; -- set when CPU is entering trap execution + constant ctrl_debug_running_c : natural := 69; -- set when CPU is in debug mode -- control bus size -- - constant ctrl_width_c : natural := 65; -- control bus size + constant ctrl_width_c : natural := 70; -- control bus size -- Comparator Bus ------------------------------------------------------------------------- -- ------------------------------------------------------------------------------------------- @@ -1299,7 +1308,8 @@ package neorv32_package is component neorv32_cpu_regfile generic ( XLEN : natural; -- data path width - CPU_EXTENSION_RISCV_E : boolean -- implement embedded RF extension? + CPU_EXTENSION_RISCV_E : boolean; -- implement embedded RF extension? + RS3_EN : boolean -- enable third read port ); port ( -- global control -- @@ -1312,7 +1322,8 @@ package neorv32_package is pc2_i : in std_ulogic_vector(XLEN-1 downto 0); -- next PC -- data output -- rs1_o : out std_ulogic_vector(XLEN-1 downto 0); -- operand 1 - rs2_o : out std_ulogic_vector(XLEN-1 downto 0) -- operand 2 + rs2_o : out std_ulogic_vector(XLEN-1 downto 0); -- operand 2 + rs3_o : out std_ulogic_vector(XLEN-1 downto 0) -- operand 3 ); end component; @@ -1339,6 +1350,7 @@ package neorv32_package is -- data input -- rs1_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 1 rs2_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 2 + rs3_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 3 pc_i : in std_ulogic_vector(XLEN-1 downto 0); -- current PC imm_i : in std_ulogic_vector(XLEN-1 downto 0); -- immediate -- data output -- @@ -1436,6 +1448,7 @@ package neorv32_package is cmp_i : in std_ulogic_vector(1 downto 0); -- comparator status rs1_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 1 rs2_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 2 + rs3_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 3 -- result and status -- res_o : out std_ulogic_vector(XLEN-1 downto 0); -- operation result fflags_o : out std_ulogic_vector(4 downto 0); -- exception flags @@ -1458,6 +1471,7 @@ package neorv32_package is -- data input -- rs1_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 1 rs2_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 2 + rs3_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 3 -- result and status -- res_o : out std_ulogic_vector(XLEN-1 downto 0); -- operation result valid_o : out std_ulogic -- data output valid From 22167995ca15f86cf8829f9a5979661b71548cd8 Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 3 Dec 2022 21:19:08 +0100 Subject: [PATCH 07/12] [rtl/core] add custom-1 opcode (CFU) --- rtl/core/neorv32_cpu_control.vhd | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/rtl/core/neorv32_cpu_control.vhd b/rtl/core/neorv32_cpu_control.vhd index 0ccb74baf..c1a61c4c3 100644 --- a/rtl/core/neorv32_cpu_control.vhd +++ b/rtl/core/neorv32_cpu_control.vhd @@ -759,6 +759,7 @@ begin -- register addresses -- ctrl_o(ctrl_rf_rs1_adr4_c downto ctrl_rf_rs1_adr0_c) <= execute_engine.i_reg(instr_rs1_msb_c downto instr_rs1_lsb_c); ctrl_o(ctrl_rf_rs2_adr4_c downto ctrl_rf_rs2_adr0_c) <= execute_engine.i_reg(instr_rs2_msb_c downto instr_rs2_lsb_c); + ctrl_o(ctrl_rf_rs3_adr4_c downto ctrl_rf_rs3_adr0_c) <= execute_engine.i_reg(instr_rs3_msb_c downto instr_rs3_lsb_c); ctrl_o(ctrl_rf_rd_adr4_c downto ctrl_rf_rd_adr0_c) <= execute_engine.i_reg(instr_rd_msb_c downto instr_rd_lsb_c); -- instruction's function blocks -- ctrl_o(ctrl_ir_opcode7_6_c downto ctrl_ir_opcode7_0_c) <= execute_engine.i_reg(instr_opcode_msb_c downto instr_opcode_lsb_c); @@ -1080,7 +1081,7 @@ begin end if; - when opcode_cust0_c => -- CFU: custom RISC-V instructions (CUSTOM0 OPCODE space) + when opcode_cust0_c | opcode_cust1_c => -- CFU: custom RISC-V instructions (CUSTOM0/1 OPCODE space) -- ------------------------------------------------------------ if (CPU_EXTENSION_RISCV_Zxcfu = true) then ctrl_nxt(ctrl_cp_trig0_c + cp_sel_cfu_c) <= '1'; -- trigger CFU CP @@ -1435,12 +1436,24 @@ begin else illegal_cmd <= '1'; end if; - illegal_reg <= execute_engine.i_reg(instr_rs2_msb_c) or execute_engine.i_reg(instr_rs1_msb_c) or execute_engine.i_reg(instr_rd_msb_c); -- illegal 'E' register? + illegal_reg <= execute_engine.i_reg(instr_rs2_msb_c) or + execute_engine.i_reg(instr_rs1_msb_c) or + execute_engine.i_reg(instr_rd_msb_c); -- illegal 'E' register? - when opcode_cust0_c => -- CFU: custom instructions + when opcode_cust0_c => -- CFU: custom0 instructions (r3-type) -- ------------------------------------------------------------ illegal_cmd <= not bool_to_ulogic_f(CPU_EXTENSION_RISCV_Zxcfu); -- CFU extension not implemented - illegal_reg <= execute_engine.i_reg(instr_rs2_msb_c) or execute_engine.i_reg(instr_rs1_msb_c) or execute_engine.i_reg(instr_rd_msb_c); -- illegal 'E' register? + illegal_reg <= execute_engine.i_reg(instr_rs2_msb_c) or + execute_engine.i_reg(instr_rs1_msb_c) or + execute_engine.i_reg(instr_rd_msb_c); -- illegal 'E' register? + + when opcode_cust1_c => -- CFU: custom1 instructions (r4-type) + -- ------------------------------------------------------------ + illegal_cmd <= not bool_to_ulogic_f(CPU_EXTENSION_RISCV_Zxcfu); -- CFU extension not implemented + illegal_reg <= execute_engine.i_reg(instr_rs3_msb_c) or + execute_engine.i_reg(instr_rs2_msb_c) or + execute_engine.i_reg(instr_rs1_msb_c) or + execute_engine.i_reg(instr_rd_msb_c); -- illegal 'E' register? when others => -- illegal opcode -- ------------------------------------------------------------ From 2f962961177be44016cb5b3825ae84c0c2e861e1 Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 3 Dec 2022 21:19:44 +0100 Subject: [PATCH 08/12] [rtl/core] FPU: add third register operand not used yet; might be used in future for fmadd instructions --- rtl/core/neorv32_cpu_cp_fpu.vhd | 1 + 1 file changed, 1 insertion(+) diff --git a/rtl/core/neorv32_cpu_cp_fpu.vhd b/rtl/core/neorv32_cpu_cp_fpu.vhd index cd5187adb..dd98a7ba9 100644 --- a/rtl/core/neorv32_cpu_cp_fpu.vhd +++ b/rtl/core/neorv32_cpu_cp_fpu.vhd @@ -69,6 +69,7 @@ entity neorv32_cpu_cp_fpu is cmp_i : in std_ulogic_vector(1 downto 0); -- comparator status rs1_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 1 rs2_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 2 + rs3_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 3 -- result and status -- res_o : out std_ulogic_vector(XLEN-1 downto 0); -- operation result fflags_o : out std_ulogic_vector(4 downto 0); -- exception flags From b2cece78513c1befcf870742156e6f70dbb5521d Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 3 Dec 2022 21:30:31 +0100 Subject: [PATCH 09/12] [rtl/core] CFU: add r4-type instruction support add multiply-add example --- rtl/core/neorv32_cpu_cp_cfu.vhd | 200 ++++++++++++++++++++++++-------- 1 file changed, 149 insertions(+), 51 deletions(-) diff --git a/rtl/core/neorv32_cpu_cp_cfu.vhd b/rtl/core/neorv32_cpu_cp_cfu.vhd index 42585001e..abf2f2f42 100644 --- a/rtl/core/neorv32_cpu_cp_cfu.vhd +++ b/rtl/core/neorv32_cpu_cp_cfu.vhd @@ -1,8 +1,8 @@ -- ################################################################################################# -- # << NEORV32 - CPU Co-Processor: Custom (Instructions) Functions Unit >> # -- # ********************************************************************************************* # --- # Intended for user-defined custom RISC-V instructions (R2-type format only). See the CPU's # --- # documentation for more information. # +-- # Intended for user-defined custom RISC-V instructions (R3-type and R4-type formats). See the # +-- # CPU's documentation for more information. # -- # # -- # NOTE: Take a look at the "software-counterpart" of this CFU example in 'sw/example/demo_cfu'. # -- # ********************************************************************************************* # @@ -57,6 +57,7 @@ entity neorv32_cpu_cp_cfu is -- data input -- rs1_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 1 rs2_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 2 + rs3_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 3 -- result and status -- res_o : out std_ulogic_vector(XLEN-1 downto 0); -- operation result valid_o : out std_ulogic -- data output valid @@ -65,16 +66,37 @@ end neorv32_cpu_cp_cfu; architecture neorv32_cpu_cp_cfu_rtl of neorv32_cpu_cp_cfu is - -- CFU controller - do not modify -- +-- **************************************************************************************************************************** +-- CFU controller - do not modify! +-- **************************************************************************************************************************** + type control_t is record busy : std_ulogic; -- CFU is busy done : std_ulogic; -- set to '1' when processing is done result : std_ulogic_vector(XLEN-1 downto 0); -- user's processing result (for write-back to register file) + rtype : std_ulogic; -- 0 = r3-type instruction, 1 = r4-type instruction funct3 : std_ulogic_vector(2 downto 0); -- "funct3" bit-field from custom instruction funct7 : std_ulogic_vector(6 downto 0); -- "funct7" bit-field from custom instruction end record; signal control : control_t; +-- **************************************************************************************************************************** +-- User Logic +-- **************************************************************************************************************************** + + -- multiply-add unit -- + type madd_t is record + sreg : std_ulogic_vector(2 downto 0); -- 3 cycles latency = 3 bits in arbitration shift register + done : std_ulogic; + -- + opa : std_ulogic_vector(XLEN-1 downto 0); + opb : std_ulogic_vector(XLEN-1 downto 0); + opc : std_ulogic_vector(XLEN-1 downto 0); + mul : std_ulogic_vector(2*XLEN-1 downto 0); + res : std_ulogic_vector(2*XLEN-1 downto 0); + end record; + signal madd : madd_t; + begin -- **************************************************************************************************************************** @@ -106,7 +128,8 @@ begin -- CPU feedback -- valid_o <= control.busy and control.done; -- set one cycle before result data - -- pack user-defined instruction function bits -- + -- pack user-defined instruction type/function bits -- + control.rtype <= ctrl_i(ctrl_ir_opcode7_5_c); control.funct3 <= ctrl_i(ctrl_ir_funct3_2_c downto ctrl_ir_funct3_0_c); control.funct7 <= ctrl_i(ctrl_ir_funct12_11_c downto ctrl_ir_funct12_5_c); @@ -116,36 +139,44 @@ begin -- **************************************************************************************************************************** -- ---------------------------------------------------------------------------------------- - -- CFU Instruction Format + -- CFU Instruction Formats -- ---------------------------------------------------------------------------------------- - -- The CFU only supports the R2-type RISC-V instruction format. This format consists of two source registers (rs1 and rs2), - -- a destination register (rd) and two "immediate" bit-fields (funct7 and funct3). It is up to the user to decide which - -- of these instruction bit fields are actually used by the CFU logic. + -- The CFU supports two instruction types that comply to the RISC-V ISA definition. + -- + -- RISC-V R3-Type Instruction: + -- This format consists of two source registers ('rs1', 'rs2'), a destination register ('rd') and two "immediate" bit-fields + -- ('funct7' and 'funct3'). + -- + -- RISC-V R4-Type Instruction: + -- RISC-V R4-Type Instruction: + -- This format consists of three source registers ('rs1', 'rs2', 'rs3'), a destination register ('rd') and one "immediate" + -- bit-field ('funct7'). -- ---------------------------------------------------------------------------------------- -- Input Operands -- ---------------------------------------------------------------------------------------- - -- > rs1_i (input, 32-bit): source register 1 ("data") - -- > rs2_i (input, 32-bit): source register 2 ("data") - -- > control.funct3 (input, 3-bit): 3-bit function select / immediate, driven by instruction word's funct3 bit field ("data" / "control") - -- > control.funct7 (input, 7-bit): 7-bit function select / immediate, driven by instruction word's funct7 bit field ("data" / "control") + -- > rs1_i (input, 32-bit): source register 1; selected by 'rs1' bit-field + -- > rs2_i (input, 32-bit): source register 2; selected by 'rs2' bit-field + -- > rs3_i (input, 32-bit): source register 3; selected by 'rs3' bit-field + -- > control.rtype (input, 1-bit): R3-type instruction when 0, R4-type instruction when 1; selected by OPCODE + -- > control.funct3 (input, 3-bit): 3-bit function select / immediate value; driven by instruction word's 'funct3' bit-field + -- > control.funct7 (input, 7-bit): 7-bit function select / immediate value; driven by instruction word's 'funct7' bit-field -- - -- The two signal rs1_i and rs2_i provide the data read from the CPU's register file, which is adressed by the - -- instruction word's rs1 and rs2 bit-fields. + -- The general instruction type is identified by the . It is 0 for the CUSTOM0 OPCODE indicating a R3-type + -- instruction format. The signal is 1 for the CUSTOM1 OPCODE indicating a R4-type instruction format. -- - -- The actual CFU operation can be defined by using the funct3 and funct7 signals. Both signals are directly driven by - -- the according bit-fields of the custom instruction. Note that these signals represent "immediates" that have to be - -- static already at compile time. These immediates can be used to select the actual function to be executed or as small - -- literals for certain operations (like shift amounts, addresses offsets, multiplication factors, ...). + -- The three signals , and provide the source operand data read from the CPU's register file. The source + -- register are adressed by the custom instruction word's 'rs1', 'rs2' and 'rs3' (R4-type only) bit-fields. + -- + -- The actual CFU operation can be defined by using the and/or signals. Both signals are + -- directly driven by the according bit-fields of the custom instruction word. These immediates can be used to select the + -- actual function or to provide small literals for certain operations (like shift amounts, offsets, multiplication factors, ...). -- - -- [NOTE] rs1_i and rs2_i are directly driven by the register file (block RAM). For complex CFU designs it is recommended - -- to buffer these signals using CFU-internal registers before using them for computations as the rs1 and rs2 nets - -- need to drive a lot of logic in the CPU. Obviously, this will increase the CFU latency by one cycle. + -- [NOTE] , and are directly driven by the register file (e.g. block RAM). For complex CFU designs + -- it is recommended to buffer these signals using CFU-internal registers before actually using them. -- - -- [NOTE] It is not possible for the CFU and it's according instruction words to cause any kind of exception. The CPU - -- control logic only verifies the custom instructions OPCODE and checks if the CFU is implemented at all. No - -- combinations of funct7 and funct3 will cause an exception. + -- [NOTE] The CFU cannot cause any kind of exception at all. -- ---------------------------------------------------------------------------------------- @@ -153,59 +184,126 @@ begin -- ---------------------------------------------------------------------------------------- -- > control.result (output, 32-bit): processing result ("data") -- - -- When the CFU has completed computations, the data in the control.result signal will be written to the CPU's register - -- file. The destination register is addressed by the rd bit-field in the instruction. The CFU result output is registered + -- When the CFU has completed computations, the data send via the signal will be written to the CPU's register + -- file. The destination register is addressed by the bit-field in the instruction word. The CFU result output is registered -- in the CFU controller (see above) - so do not worry too much about increasing the CPU's critical path with your custom -- logic. -- ---------------------------------------------------------------------------------------- - -- Control + -- Processing Control -- ---------------------------------------------------------------------------------------- -- > rstn_i (input, 1-bit): asynchronous reset, low-active -- > clk_i (input, 1-bit): main clock, triggering on rising edge -- > start_i (input, 1-bit): operation trigger (start processing, high for one cycle) -- > control.done (output, 1-bit): set high when processing is done -- - -- For pure-combinatorial instructions (without internal states) a subset of these signals is sufficient; see the minimal - -- example below. If the CFU shall also include states (like memories, registers or "buffers") the start_i signal can be - -- used to trigger a new iterative CFU operation. As soon as all internal computations have completed, the control.done - -- signal has to be set to indicate completion. This will finish CFU instruction operation and will write the processing - -- result (control.result) back to the CPU register file. + -- For pure-combinatorial instructions (completing within 1 clock cycle) can be tied to 1 ignoring all other + -- signals. If the CFU requires several clock cycles for processing the signal can be used to trigger a new iterative + -- CFU operation. As soon as all internal computations have completed, the signal has to be set to indicate + -- completion. This will completes CFU instruction operation and will also write the processing result back to + -- the CPU register file. -- - -- [NOTE] The control.done **has to be set at some time**, otherwise the CPU will get stalled forever. + -- [NOTE] The **has to be set at some time**, otherwise the CPU will get stalled forever. -- ---------------------------------------------------------------------------------------- -- Final Notes -- ---------------------------------------------------------------------------------------- - -- The "cfu_control" instance provides something like a "keeper" that ensures correct functionality (we do not want to + -- The record provides something like a "keeper" that ensures correct functionality (we do not want to -- stall the CPU forever) and also a simple-to-use interface hardware designers can start with. Obviously, the control - -- instance add one additional cycle of latency. Advanced users can remove this default control instance to obtain + -- instance adds one additional cycle of latency. Advanced users can remove this default control instance to obtain -- maximum throughput. -- **************************************************************************************************************************** --- Actual CFU user Logic - Add your custom logic below +-- Actual CFU User Logic Example - replace this with your custom logic -- **************************************************************************************************************************** - -- User Logic Example --------------------------------------------------------------------- + -- Iterative Multiply-Add Unit - Iteration Control ---------------------------------------- -- ------------------------------------------------------------------------------------------- - user_logic_function_select: process(control, rs1_i, rs2_i) + madd_control: process(rstn_i, clk_i) begin - -- This is a simple ALU that implements four pure-combinatorial instructions. - -- The actual function to-be-executed is selected by the "funct3" bit-field of the custom instruction. - case control.funct3 is - when "000" => control.result <= bin_to_gray_f(rs1_i); -- funct3 = "000": convert rs1 from binary to gray - when "001" => control.result <= gray_to_bin_f(rs1_i); -- funct3 = "001": convert rs1 from gray to binary - when "010" => control.result <= bit_rev_f(rs1_i); -- funct3 = "010": bit-reversal of rs1 - when "011" => control.result <= rs1_i xnor rs2_i; -- funct3 = "011": XNOR input operands - when others => control.result <= (others => '0'); -- not implemented, set to zero - end case; - end process user_logic_function_select; - - -- processing done? -- - control.done <= '1'; -- we are just doing pure-combinatorial data processing here, which is done "immediately" + if (rstn_i = '0') then + madd.sreg <= (others => '0'); + elsif rising_edge(clk_i) then + -- operation trigger -- + if (control.busy = '0') and -- CFU is idle (ready for next operation) + (start_i = '1') and -- CFU is actually triggered by a custom instruction word + (control.rtype = '1') and -- this is an R4-type instruction + (control.funct3(2 downto 1) = "00") then -- trigger only for specific funct3 values + madd.sreg(0) <= '1'; + else + madd.sreg(0) <= '0'; + end if; + -- simple shift register -- + madd.sreg(madd.sreg'left downto 1) <= madd.sreg(madd.sreg'left-1 downto 0); -- shift left + end if; + end process madd_control; + + -- processing has reached last stage (=done) when sreg's MSB is set -- + madd.done <= madd.sreg(madd.sreg'left); + + + -- Iterative Multiply-Add Unit - Arithmetic Core ------------------------------------------ + -- ------------------------------------------------------------------------------------------- + madd_core: process(clk_i) + begin + if rising_edge(clk_i) then + -- stage 0: buffer input operands -- + madd.opa <= rs1_i; + madd.opb <= rs2_i; + madd.opc <= rs3_i; + -- stage 1: multiply rs1 and rs2 -- + madd.mul <= std_ulogic_vector(unsigned(madd.opa) * unsigned(madd.opb)); + -- stage 2: add rs3 to multiplication result -- + madd.res <= std_ulogic_vector(unsigned(madd.mul) + unsigned(madd.opc)); + end if; + end process madd_core; + + + -- Output select -------------------------------------------------------------------------- + -- ------------------------------------------------------------------------------------------- + out_select: process(control, rs1_i, rs2_i, madd) + begin + -- -------------------------------------------------------- + if (control.rtype = '0') then -- R3-type instruction + -- -------------------------------------------------------- + + -- This is a simple ALU that implements four pure-combinatorial instructions. + -- The actual function is selected by the "funct3" bit-field of the custom instruction. + case control.funct3 is + when "000" => -- funct3 = "000": bit-reversal of rs1 + control.result <= bit_rev_f(rs1_i); + control.done <= '1'; -- pure-combinatorial, so we are done "immediately" + when "001" => -- funct3 = "001": XNOR input operands + control.result <= rs1_i xnor rs2_i; + control.done <= '1'; -- pure-combinatorial, so we are done "immediately" + when others => -- not implemented + control.result <= (others => '0'); + control.done <= '1'; -- set high to prevent permanent CPU stall + end case; + + -- -------------------------------------------------------- + else -- R4-type instruction + -- -------------------------------------------------------- + + -- This is an iterative multiply-and-add unit that requires several cycles for processing. + -- The actual function is selected by the lowest bit of the "funct3" bit-field. + case control.funct3 is + when "000" => -- funct3 = "000": multiply-add low-part result: rs1*rs2+r3 [31:0] + control.result <= madd.res(31 downto 0); + control.done <= madd.done; -- iterative, wait for unit to finish + when "001" => -- funct3 = "001": multiply-add high-part result: rs1*rs2+r3 [63:32] + control.result <= madd.res(63 downto 32); + control.done <= madd.done; -- iterative, wait for unit to finish + when others => -- not implemented + control.result <= (others => '0'); + control.done <= '1'; -- set high to prevent permanent CPU stall + end case; + + end if; + end process out_select; end neorv32_cpu_cp_cfu_rtl; From 791cfe364a13a5fe8d95f6dc0a05f1c7dc96faf9 Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 3 Dec 2022 21:45:22 +0100 Subject: [PATCH 10/12] [sw/example] add r4-type instructions to CFU example program --- sw/example/demo_cfu/main.c | 128 +++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 62 deletions(-) diff --git a/sw/example/demo_cfu/main.c b/sw/example/demo_cfu/main.c index 869a11d78..225569417 100644 --- a/sw/example/demo_cfu/main.c +++ b/sw/example/demo_cfu/main.c @@ -1,5 +1,5 @@ // ################################################################################################# -// # << NEORV32 - CFU Custom Instructions Example Program >> # +// # << NEORV32 - CFU: Custom Instructions Example Program >> # // # ********************************************************************************************* # // # BSD 3-Clause License # // # # @@ -37,6 +37,8 @@ * @file demo_cfu/main.c * @author Stephan Nolting * @brief Example program showing how to use the CFU's custom instructions. + * Take a look at the "hardware-counterpart" of this CFU example in + * 'rtl/core/neorv32_cpu_cp_cfu.vhd'. **************************************************************************/ #include @@ -89,8 +91,7 @@ int main() { return 1; // UART0 not available, exit } - // check if the CFU is implemented at all - // note that the CFU is wrapped in the core's "Zxcfu" ISA extension + // check if the CFU is implemented at all (the CFU is wrapped in the core's "Zxcfu" ISA extension) if (neorv32_cpu_cfu_available() == 0) { neorv32_uart0_printf("ERROR! CFU ('Zxcfu' ISA extensions) not implemented!\n"); return 1; @@ -98,79 +99,82 @@ int main() { // intro - neorv32_uart0_printf("\n<<< NEORV32 Custom Functions Unit (CFU) 'Custom Instructions' Example Program >>>\n\n"); - - neorv32_uart0_printf("NOTE: This program assumes the _default_ CFU hardware module, which implements\n" - " four simple data conversion instructions.\n\n"); - - neorv32_uart0_printf("NOTE: This program (and it's comments) just shows how to USE the CFU's custom\n" - " instructions. The actual implementation of these instructions is done\n" - " in the CFU hardware module (-> rtl/core/neorv32_cpu_cp_cfu.vhd).\n\n"); - - - // The CFU custom instruction can be used as plain C functions with the help of the NEORV32 CFU intrinsics. - // - // There are 8 "prototypes" for the CFU instructions: - // > neorv32_cfu_cmd0(funct7, rs1, rs2) - sets the instruction's "funct3" bit field to 000 - // > neorv32_cfu_cmd1(funct7, rs1, rs2) - sets the instruction's "funct3" bit field to 001 - // > ... - // > neorv32_cfu_cmd7(funct7, rs1, rs2) - sets the instruction's "funct3" bit field to 111 - // - // Every "call" of these functions is turned into a single 32-bit R2-type RISC-V instruction (= "intrinsics"). - // - No overhead at all! Maximum throughput! - // - // Each neorv32_cfu_cmd* function requires three arguments: - // > funct7: a compile-time static 7-bit immediate (for the instruction's "funct7" bit field) - // > rs1: a 32-bit operand A (this is the first register file source rs1) - // > rs2: a 32-bit operand B (this is the second register file source rs2) - // - // The rs* operands can be literals, variables, function return values, ... you name it. - // The 7-bit immediate ("funct7") can be used to pass _compile-time static_ literals to the CFU - // or to do a more fine-grained function selection - it all depends on your hardware implementation. - // - // Each neorv32_cfu_cmd* function returns a 32-bit uint32_t data word, which represents - // the result of the according instruction. - - uint32_t i, opa, opb; - - neorv32_uart0_printf("\n--- CFU \"binary to gray\" instruction (funct3 = 000) ---\n"); + neorv32_uart0_printf("\n<<< NEORV32 Custom Functions Unit (CFU) - Custom Instructions Example Program >>>\n\n"); + + neorv32_uart0_printf("[NOTE] This program assumes the _default_ CFU hardware module, which\n" + " implements some exemplary data processing instructions.\n\n"); + + +/* + The CFU custom instructions can be used as plain C functions as they are simple "intrinsics". + + There are 2 "prototype primitives" for the CFU instructions: + > neorv32_cfu_r3_instr(funct7, funct3, rs1, rs2) - for r3-type instructions + > neorv32_cfu_r4_instr(funct3, rs1, rs2, rs3) - for r4-type instructions + + Every "call" of these functions is turned into a single 32-bit ISC-V instruction word + without any calling overhead at all. + + The "rs*" operands can be literals, variables, function return values, ... - you name it. + The 7-bit immediate ("funct7") and the 3-bit immediate ("funct3") values can be used to pass + _compile-time static_ literals to the CFU or to do a fine-grained function selection. + + Each "neorv32_cfu_r*_instr" function returns a 32-bit data word of type uint32_t that represents + the result of the according instruction. +*/ + + uint32_t i, rs1, rs2, rs3; + + // ------------------------------------ + // R3-type instructions + // ------------------------------------ + + neorv32_uart0_printf("\n--- CFU 'bit reversal' instruction ---\n"); for (i=0; i to simplify the usage of the CFU instructions. +#define madd_lo(a, b, c) neorv32_cfu_r4_instr(0b000, a, b, c) +#define madd_hi(a, b, c) neorv32_cfu_r4_instr(0b001, a, b, c) + + neorv32_uart0_printf("\n--- CFU 'multiply-add (low-part)' instruction ---\n"); for (i=0; i Date: Sun, 4 Dec 2022 19:37:17 +0100 Subject: [PATCH 11/12] [rtl/core] fix illegal instruction detection "custom" opcodes --- rtl/core/neorv32_cpu_control.vhd | 37 +++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/rtl/core/neorv32_cpu_control.vhd b/rtl/core/neorv32_cpu_control.vhd index c1a61c4c3..f67c8ed10 100644 --- a/rtl/core/neorv32_cpu_control.vhd +++ b/rtl/core/neorv32_cpu_control.vhd @@ -1433,29 +1433,40 @@ begin -- ------------------------------------------------------------ if (CPU_EXTENSION_RISCV_Zfinx = true) and (decode_aux.is_f_op = '1') then -- is supported floating-point instruction illegal_cmd <= '0'; + illegal_reg <= execute_engine.i_reg(instr_rs2_msb_c) or + execute_engine.i_reg(instr_rs1_msb_c) or + execute_engine.i_reg(instr_rd_msb_c); -- illegal 'E' register? else illegal_cmd <= '1'; + illegal_reg <= '0'; end if; - illegal_reg <= execute_engine.i_reg(instr_rs2_msb_c) or - execute_engine.i_reg(instr_rs1_msb_c) or - execute_engine.i_reg(instr_rd_msb_c); -- illegal 'E' register? when opcode_cust0_c => -- CFU: custom0 instructions (r3-type) -- ------------------------------------------------------------ - illegal_cmd <= not bool_to_ulogic_f(CPU_EXTENSION_RISCV_Zxcfu); -- CFU extension not implemented - illegal_reg <= execute_engine.i_reg(instr_rs2_msb_c) or - execute_engine.i_reg(instr_rs1_msb_c) or - execute_engine.i_reg(instr_rd_msb_c); -- illegal 'E' register? + if (CPU_EXTENSION_RISCV_Zxcfu = true) then -- CFU extension implemented + illegal_cmd <= '0'; + illegal_reg <= execute_engine.i_reg(instr_rs2_msb_c) or + execute_engine.i_reg(instr_rs1_msb_c) or + execute_engine.i_reg(instr_rd_msb_c); -- illegal 'E' register? + else + illegal_cmd <= '1'; + illegal_reg <= '0'; + end if; when opcode_cust1_c => -- CFU: custom1 instructions (r4-type) -- ------------------------------------------------------------ - illegal_cmd <= not bool_to_ulogic_f(CPU_EXTENSION_RISCV_Zxcfu); -- CFU extension not implemented - illegal_reg <= execute_engine.i_reg(instr_rs3_msb_c) or - execute_engine.i_reg(instr_rs2_msb_c) or - execute_engine.i_reg(instr_rs1_msb_c) or - execute_engine.i_reg(instr_rd_msb_c); -- illegal 'E' register? + if (CPU_EXTENSION_RISCV_Zxcfu = true) then -- CFU extension implemented + illegal_cmd <= '0'; + illegal_reg <= execute_engine.i_reg(instr_rs3_msb_c) or + execute_engine.i_reg(instr_rs2_msb_c) or + execute_engine.i_reg(instr_rs1_msb_c) or + execute_engine.i_reg(instr_rd_msb_c); -- illegal 'E' register? + else + illegal_cmd <= '1'; + illegal_reg <= '0'; + end if; - when others => -- illegal opcode + when others => -- undefined/illegal opcode -- ------------------------------------------------------------ illegal_cmd <= '1'; From d31535111bdbb7c89c1b935a51a7acb7c2835362 Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sun, 4 Dec 2022 19:38:00 +0100 Subject: [PATCH 12/12] [docs] update CFU section --- docs/datasheet/cpu_cfu.adoc | 197 +++++++++++++++++++++--------------- 1 file changed, 117 insertions(+), 80 deletions(-) diff --git a/docs/datasheet/cpu_cfu.adoc b/docs/datasheet/cpu_cfu.adoc index 92f112e4b..b4e2f1894 100644 --- a/docs/datasheet/cpu_cfu.adoc +++ b/docs/datasheet/cpu_cfu.adoc @@ -4,60 +4,119 @@ The Custom Functions Unit is the central part of the <<_zxcfu_custom_instructions_extension_cfu>> and represents the actual hardware module, which is used to implement _custom RISC-V instructions_. The concept of the NEORV32 -CFU has been highly inspired by https://github.com/google/CFU-Playground[google's CFU-Playground]. +CFU has been highly inspired by https://github.com/google/CFU-Playground[Google's CFU-Playground]. The CFU is intended for operations that are inefficient in terms of performance, latency, energy consumption or -program memory requirements when implemented in pure software. Some potential application fields and exemplary +program memory requirements when implemented entirely in software. Some potential application fields and exemplary use-cases might include: * **AI:** sub-word / vector / SIMD operations like adding all four bytes of a 32-bit data word * **Cryptographic:** bit substitution and permutation -* **Communication:** conversions like binary to gray-code +* **Communication:** conversions like binary to gray-code; multiply-add operations * **Image processing:** look-up-tables for color space transformations -* implementing instructions from other RISC-V ISA extensions that are not yet supported by the NEORV32 +* implementing instructions from **other RISC-V ISA extensions** that are not yet supported by the NEORV32 [NOTE] -The CFU is not intended for complex and autonomous functional units that implement complete accelerators -like block-based AES de-/encoding). Such accelerator can be implemented within the <<_custom_functions_subsystem_cfs>>. +The CFU is not intended for complex and CPU-independent functional units that implement complete accelerators +(like block-based AES encryption). These kind of accelerators should be better implemented within the +<<_custom_functions_subsystem_cfs>>. A comparison of all chip-internal hardware extension options is provided in the user guide section https://stnolting.github.io/neorv32/ug/#_adding_custom_hardware_modules[Adding Custom Hardware Modules]. :sectnums: -==== Custom CFU Instructions - General +==== CFU Instruction Formats -The custom instruction utilize a specific instruction space that has been explicitly reserved for user-defined -extensions by the RISC-V specifications ("_Guaranteed Non-Standard Encoding Space_"). The NEORV32 CFU uses the -_CUSTOM0_ opcode to identify custom instructions. The binary encoding of this opcode is `0001011`. +The custom instructions executed by the CFU utilize a specific instruction space in the total `rv32` 32-bit instruction +space that has been explicitly reserved for user-defined extensions by the RISC-V specifications ("_Guaranteed Non-Standard +Encoding Space_"). The NEORV32 CFU uses the `custom-0` and `custom-1` opcodes to identify the custom instructions implemented +by the CFU and to differentiate between two instruction formats (note: these formats are common RISC-V instruction format types). +The custom-0 opcode is used to implement custom **R3-type** instructions while the custom-1 opcode is used to +implement custom **R4-type** instructions. The according binary encoding of these opcodes is shown below: -The custom instructions processed by the CFU use the 32-bit **R2-type** RISC-V instruction format, which consists -of six bit-fields: +* `custom-0`: `0001011` (R3-type instructions) +* `custom-1`: `0101011` (R4-type instructions) + +.CFU Instructions - Exceptions +[NOTE] +The CPU control logic will only analyze the opcode of the custom instructions to check if the +instruction word is valid. All remaining bit-fields are **not checked** by the CPU instruction decoding logic. +Hence, a custom CFU instruction can never raise an illegal instruction exception. If the CFU is not +implemented at all (`Zxcfu` ISA extension is not enabled) any instruction with opcode custom-0 or custom-1 +will raise an illegal instruction exception. + + +:sectnums: +==== CFU R3-Type Instructions + +The R3-type CFU instructions operate on two source registers and return the processing result to the destination register. +The actual operation can be defined by using the `funct7` and `funct3` bit fields. These immediates can also be used to +pass additional data to the CFU like offsets, look-up-tables addresses or shift-amounts. However, the actual +functionality is entirely user-defined. + +Example operation: `rd <= rs1 xnor rs2` + +.CFU R3-type instruction format +image::cfu_r3type_instruction.png[align=center] * `funct7`: 7-bit immediate * `rs2`: address of second source register * `rs1`: address of first source register * `funct3`: 3-bit immediate * `rd`: address of destination register -* `opcode`: always `0001011` to identify custom instructions +* `opcode`: always `0001011` (RISC-V "custom-0" opcode) -.CFU instruction format (RISC-V R2-type) -image::cfu_r2type_instruction.png[align=center] +.RISC-V compatibility +[NOTE] +The CFU R3-type instruction format is compliant to the RISC-V ISA specification. +.Instruction encoding space [NOTE] -Obviously, all bit-fields including the immediates have to be static at compile time. +By using the `funct7` and `funct3` entirely for selecting the actual operation a total of 1024 custom R3-type instructions +can be implemented (7-bit + 3-bit = 10 bit -> 1024 different values). -.Custom Instructions - Exceptions + +:sectnums: +==== CFU R4-Type Instructions + +The R4-type CFU instructions operate on three source registers and return the processing result to the destination register. +The actual operation can be defined by using the `funct3` bit field. Alternatively, this immediates can also be used to +pass additional data to the CFU like offsets, look-up-tables addresses or shift-amounts. However, the actual +functionality is entirely user-defined. + +Example operation: `rd <= (rs1 * rs2 + rs3)[31:0]` + +.CFU R4-type instruction format +image::cfu_r4type_instruction.png[align=center] + +* `rs3`: address of third source register +* `rs2`: address of second source register +* `rs1`: address of first source register +* `funct3`: 3-bit immediate +* `rd`: address of destination register +* `opcode`: always `0101011` (RISC-V "custom-1" opcode) + +.RISC-V compatibility [NOTE] -The CPU control logic can only check the _CUSTOM0_ opcode of the custom instructions to check if the -instruction word is valid. It cannot check the `funct3` and `funct7` bit-fields since they are -implementation-defined. Hence, a custom CFU instruction can never raise an illegal instruction exception. -However, custom will raise an illegal instruction exception if the CFU is not enabled/implemented -(i.e. `Zxcfu` ISA extension is not enabled). +The CFU R4-type instruction format is compliant to the RISC-V ISA specification. -The CFU operates on the two source operands and return the processing result to the destination register. -The actual instruction to be performed can be defined by using the `funct7` and `funct3` bit fields. -These immediate bit-fields can also be used to pass additional data to the CFU like offsets, look-up-tables -addresses or shift-amounts. However, the actual functionality is completely user-defined. +.Unused instruction bits +[NOTE] +The RISC-V ISA specification defines bits [26:25] of the R4-type instruction word to be all-zero. These bit are ignored +by the hardware (CFU and illegal instruction check logic) and should be set to all-zero to preserve compatibility with +future implementations. + +.Instruction encoding space +[NOTE] +By using the `funct3` entirely for selecting the actual operation a total of 8 custom R4-type instructions +can be implemented (3-bit -> 8 different values). + +.Hardware resource requirements +[WARNING] +Enabling the CFU and actually implementing R4-type instruction (or more precisely, using `rs3` inside the CFU hardware +module) will add another read port to the core's register file increasing resource requirements. For example, on a +FPGA platform that supports dual-port RAMs this will _double_ the number of required BRAMs for implementing the register +file. :sectnums: @@ -65,43 +124,41 @@ addresses or shift-amounts. However, the actual functionality is completely user The custom instructions provided by the CFU are included into plain C code by using **intrinsics**. Intrinsics behave like "normal" functions but under the hood they are a set of macros that hide the complexity of inline assembly. -Using such intrinsics removes the need to modify the compiler, built-in libraries and the assembler when including custom +Using intrinsics removes the need to modify the compiler, built-in libraries or the assembler when including custom instructions. -The NEORV32 software framework provides 8 pre-defined custom instructions macros, which are defined in -`sw/lib/include/neorv32_cpu_cfu.h`. Each intrinsic provides an implicit definition of the instruction word's -`funct3` bit-field: +The NEORV32 software framework provides two pre-defined prototypes for custom instructions, which are defined in +`sw/lib/include/neorv32_cpu_cfu.h` - one for R3-type instruction and one for R4-type instructions: .CFU instruction prototypes [source,c] ---- -neorv32_cfu_cmd0(funct7, rs1, rs2) // funct3 = 000 -neorv32_cfu_cmd1(funct7, rs1, rs2) // funct3 = 001 -neorv32_cfu_cmd2(funct7, rs1, rs2) // funct3 = 010 -neorv32_cfu_cmd3(funct7, rs1, rs2) // funct3 = 011 -neorv32_cfu_cmd4(funct7, rs1, rs2) // funct3 = 100 -neorv32_cfu_cmd5(funct7, rs1, rs2) // funct3 = 101 -neorv32_cfu_cmd6(funct7, rs1, rs2) // funct3 = 110 -neorv32_cfu_cmd7(funct7, rs1, rs2) // funct3 = 111 +neorv32_cfu_r3_instr(funct7, funct3, rs1, rs2) // R3-type instruction +neorv32_cfu_r4_instr(funct3, rs1, rs2, rs3) // R4-type instruction ---- -Each intrinsic functions always returns a 32-bit value (the processing result). Furthermore, -each intrinsic function requires three arguments: +The intrinsic functions always return a 32-bit value of type `uint32_t` (the processing result), which can be discarded +when not needed. Each intrinsic function requires several arguments depending on the instruction type: + +* `funct7` - 7-bit immediate (r3-type) +* `funct3` - 3-bit immediate (r3-type, r4-type) +* `rs3` - source operand 2, 32-bit (r4-type) +* `rs2` - source operand 2, 32-bit (r3-type, r4-type) +* `rs1` - source operand 1, 32-bit (r3-type, r4-type) -* `funct7` - 7-bit immediate -* `rs2` - source operand 2, 32-bit -* `rs1` - source operand 1, 32-bit +[NOTE] +The literals (immediate bit-fields `funct3` and `funct7`) have to be **static at compile time**. -The `funct7` bit-field is used to pass a 7-bit literal to the CFU. The `rs1` and `rs2` arguments to pass the -actual data to the CFU. These arguments can be populated with variables or literals. The following example -show how to pass arguments when executing `neorv32_cfu_cmd6`: `funct7` is set to all-zero, `rs1` is given -the literal _2751_ and `rs2` is given a variable that contains the return value from `some_function()`. +The `funct3` and `funct7` bit-fields are used to pass 3-bit or 7-bit literals to the CFU. The `rs1`, `rs2` and `rs3` +arguments pass the actual data to the CFU. These register arguments can be populated with variables or literals. +The following example shows how to pass arguments when executing both CFU instruction types: .CFU instruction usage example [source,c] ---- -uint32_t opb = some_function(); -uint32_t res = neorv32_cfu_cmd6(0b0000000, 2751, opb); +uint32_t tmp = some_function(); +uint32_t res = neorv32_cfu_r3_instr(0b0000000, 0b101, tmp, 123); +uint32_t foo = neorv32_cfu_r4_instr(0b011, tmp, res, some_array[i]); ---- .CFU Example Program @@ -113,42 +170,22 @@ The example program is located in `sw/example/demo_cfu`. :sectnums: ==== Custom Instructions Hardware -The actual functionality of the CFU's custom instruction is defined by the logic in the CFU itself. -It is the responsibility of the designer to implement this logic within the CFU hardware module -`rtl/core/neorv32_cpu_cp_cfu.vhd`. - -The CFU hardware module receives the data from instruction word's immediate bit-fields and also -the operation data, which is fetched from the CPU's register file. - -.CFU instruction data passing example -[source,c] ----- -uint32_t opb = 0x12345678UL; -uint32_t res = neorv32_cfu_cmd6(0b0100111, 0x00cafe00, opb); ----- - -In this example the CFU hardware module receives the two source operands as 32-bit signal -and the immediate values as 7-bit and 3-bit signals: - -* `rs1_i` (32-bit) contains the data from the `rs1` register (here = `0x00cafe00`) -* `rs2_i` (32-bit) contains the data from the `rs2` register (here = 0x12345678) -* `control.funct3` (3-bit) contains the immediate value from the `funct3` bit-field (here = `0b110`; "cmd6") -* `control.funct7` (7-bit) contains the immediate value from the `funct7` bit-field (here = `0b0100111`) - -The CFU executes the according instruction (for example this is selected by the `control.funct3` signal) -and provides the operation result in the 32-bit `control.result` signal. The processing can be entirely -combinatorial, so the result is available at the end of the current clock cycle. Processing can also -take several clock cycles and may also include internal states and memories. As soon as the CFU has -completed operations it sets the `control.done` signal high. +The actual functionality of the CFU's custom instructions is defined by the user-defined logic inside +the CFU hardware module `rtl/core/neorv32_cpu_cp_cfu.vhd`. .CFU Hardware Example & More Details [TIP] -The default CFU module already implement some exemplary instructions that are used for illustration +The default CFU hardware module already implement some exemplary instructions that are used for illustration by the CFU example program. See the CFU's VHDL source file (`rtl/core/neorv32_cpu_cp_cfu.vhd`), which is highly commented to explain the available signals and the handshake with the CPU pipeline. +CFU operations can be entirely combinatorial (like bit-reversal) so the result is available at the end of +the current clock cycle. Operations can also take several clock cycles to complete (like multiplications) +and may also include internal states and memories. The CFU's internal controller unit takes care of +interfacing the custom user logic to the CPU's pipeline. + .CFU Execution Time [NOTE] -The CFU is not required to finish processing within a bound time. -However, the designer should keep in mind that the CPU is **stalled** until the CFU has finished processing. -This also means the CPU cannot react to pending interrupts. Nevertheless, interrupt requests will still be queued. +The CFU is not required to finish processing within a bound time. However, you should keep in mind that the +CPU is _stalled_ until the CFU has finished processing. This also means the CPU cannot react to pending +interrupts during this time affecting real-time behavior (interrupt requests will still be queued).