diff --git a/build.sbt b/build.sbt
index 2288cded15..7439212d4c 100644
--- a/build.sbt
+++ b/build.sbt
@@ -149,7 +149,7 @@ lazy val chipyard = (project in file("generators/chipyard"))
   .dependsOn(testchipip, rocketchip, boom, hwacha, rocketchip_blocks, rocketchip_inclusive_cache, iocell,
     sha3, // On separate line to allow for cleaner tutorial-setup patches
     dsptools, rocket_dsp_utils,
-    gemmini, icenet, tracegen, cva6, nvdla, sodor, ibex, fft_generator,
+    gemmini, icenet, tracegen, cva6, nvdla, sodor, ibex, fft_generator, cordic,
     constellation, mempress, barf, shuttle, caliptra_aes)
   .settings(libraryDependencies ++= rocketLibDeps.value)
   .settings(
@@ -311,3 +311,9 @@ lazy val fpga_shells = (project in file("./fpga/fpga-shells"))
 lazy val fpga_platforms = (project in file("./fpga"))
   .dependsOn(chipyard, fpga_shells)
   .settings(commonSettings)
+
+// MMIO CORDIC peripheral generator; the chipyard project above depends on it.
+lazy val cordic = (project in file("generators/cordic"))
+  .dependsOn(rocketchip, rocket_dsp_utils)
+  .settings(libraryDependencies ++= rocketLibDeps.value)
+  .settings(commonSettings)
diff --git a/fpga/src/main/scala/nexysvideo/Configs.scala b/fpga/src/main/scala/nexysvideo/Configs.scala
index ce8fbae8e4..b080e46557 100644
--- a/fpga/src/main/scala/nexysvideo/Configs.scala
+++ b/fpga/src/main/scala/nexysvideo/Configs.scala
@@ -5,7 +5,7 @@ import org.chipsalliance.cde.config._
 import freechips.rocketchip.subsystem._
 import freechips.rocketchip.devices.debug._
 import freechips.rocketchip.devices.tilelink._
-import freechips.rocketchip.diplomacy._
+import org.chipsalliance.diplomacy.lazymodule._
 import freechips.rocketchip.system._
 import freechips.rocketchip.tile._
 
@@ -22,18 +22,28 @@ class WithNoDesignKey extends Config((site, here, up) => {
 })
 
 // DOC include start: WithNexysVideoTweaks and Rocket
-class WithNexysVideoTweaks extends Config(
+
+/** Nexys Video platform tweaks.
+  *
+  * @param freqMHz frequency in MHz applied to the harness clock and to the memory, front,
+  *                system, periphery and control buses
+  */
+class WithNexysVideoTweaks(freqMHz: Double = 10.0) extends Config(
+  new freechips.rocketchip.subsystem.WithRoccExample ++
+  new fftgenerator.WithFFTGenerator(numPoints=8, width=16, decPt=8) ++ // add 8-point mmio fft at the default addr (0x2400) with 16bit fixed-point numbers.
+  new cordic.WithCORDIC(useAXI4=false, useBlackBox=true) ++
   new WithNexysVideoUARTTSI ++
   new WithNexysVideoDDRTL ++
   new WithNoDesignKey ++
   new testchipip.tsi.WithUARTTSIClient ++
   new chipyard.harness.WithSerialTLTiedOff ++
-  new chipyard.harness.WithHarnessBinderClockFreqMHz(50) ++
-  new chipyard.config.WithMemoryBusFrequency(50.0) ++
-  new chipyard.config.WithFrontBusFrequency(50.0) ++
-  new chipyard.config.WithSystemBusFrequency(50.0) ++
-  new chipyard.config.WithPeripheryBusFrequency(50.0) ++
-  new chipyard.config.WithControlBusFrequency(50.0) ++
+
+  new chipyard.harness.WithHarnessBinderClockFreqMHz(freqMHz) ++
+  new chipyard.config.WithMemoryBusFrequency(freqMHz) ++
+  new chipyard.config.WithFrontBusFrequency(freqMHz) ++
+  new chipyard.config.WithSystemBusFrequency(freqMHz) ++
+  new chipyard.config.WithPeripheryBusFrequency(freqMHz) ++
+  new chipyard.config.WithControlBusFrequency(freqMHz) ++
   new chipyard.harness.WithAllClocksFromHarnessClockInstantiator ++
   new chipyard.clocking.WithPassthroughClockGenerator ++
   new chipyard.config.WithNoDebug ++ // no jtag
@@ -72,3 +82,10 @@ class TinyRocketNexysVideoConfig extends Config(
   new chipyard.config.WithBroadcastManager ++ // no l2
   new chipyard.TinyRocketConfig)
 // DOC include end: WithTinyNexysVideoTweaks and Rocket
+
+// NOTE(review): WithNexysVideoSerialTLToGPIO is not defined by this patch -- confirm it already exists in this package.
+class BringupNexysVideoConfig extends Config(
+  new WithNexysVideoSerialTLToGPIO ++
+  new WithNexysVideoTweaks(freqMHz = 75) ++
+  new chipyard.ChipBringupHostConfig)
+
diff --git a/generators/chipyard/src/main/scala/DigitalTop.scala b/generators/chipyard/src/main/scala/DigitalTop.scala
index ec8ffd9980..c40e11434d 100644
--- a/generators/chipyard/src/main/scala/DigitalTop.scala
+++ b/generators/chipyard/src/main/scala/DigitalTop.scala
@@ -7,6 +7,8 @@ import freechips.rocketchip.system._
 import org.chipsalliance.cde.config.Parameters
 import freechips.rocketchip.devices.tilelink._
 
+import freechips.rocketchip.diplomacy.{AsynchronousCrossing}
+import cordic.WithCORDIC
 //
------------------------------------
 // BOOM and/or Rocket Top Level Systems
 // ------------------------------------
@@ -37,6 +39,7 @@ class DigitalTop(implicit p: Parameters) extends ChipyardSystem
   with chipyard.clocking.CanHaveClockTap // Enables optionally adding a clock tap output port
   with fftgenerator.CanHavePeripheryFFT // Enables optionally having an MMIO-based FFT block
   with constellation.soc.CanHaveGlobalNoC // Support instantiating a global NoC interconnect
+  with cordic.CanHavePeripheryCORDIC // Enables optionally having an MMIO-based CORDIC block
 {
   override lazy val module = new DigitalTopModule(this)
 }
diff --git a/generators/chipyard/src/main/scala/config/MMIOAcceleratorConfigs.scala b/generators/chipyard/src/main/scala/config/MMIOAcceleratorConfigs.scala
index 8d6f106346..bca8888088 100644
--- a/generators/chipyard/src/main/scala/config/MMIOAcceleratorConfigs.scala
+++ b/generators/chipyard/src/main/scala/config/MMIOAcceleratorConfigs.scala
@@ -58,10 +58,12 @@ class LargeNVDLARocketConfig extends Config(
   new freechips.rocketchip.subsystem.WithNBigCores(1) ++
   new chipyard.config.AbstractConfig)
 
 class ManyMMIOAcceleratorRocketConfig extends Config(
+  new freechips.rocketchip.subsystem.WithRoccExample ++
   new chipyard.harness.WithDontTouchChipTopPorts(false) ++ // TODO: hack around dontTouch not working in SFC
   new fftgenerator.WithFFTGenerator(numPoints=8, width=16, decPt=8) ++ // add 8-point mmio fft at the default addr (0x2400) with 16bit fixed-point numbers.
+  new cordic.WithCORDIC(useAXI4=false, useBlackBox=true) ++
   new chipyard.example.WithStreamingPassthrough ++ // use top with tilelink-controlled streaming passthrough
   new chipyard.example.WithStreamingFIR ++ // use top with tilelink-controlled streaming FIR
   new freechips.rocketchip.subsystem.WithNBigCores(1) ++
   new chipyard.config.AbstractConfig)
diff --git a/generators/cordic/src/main/resources/vsrc/CORDICMMIOBlackBox.v b/generators/cordic/src/main/resources/vsrc/CORDICMMIOBlackBox.v
new file mode 100644
index 0000000000..70e3000912
--- /dev/null
+++ b/generators/cordic/src/main/resources/vsrc/CORDICMMIOBlackBox.v
@@ -0,0 +1,502 @@
+// CORDICMMIOBlackBox: ready/valid wrapper around MyCORDIC.
+//   - A transfer on input_valid/input_ready latches theta[15:0] (65536 counts per turn)
+//     and starts one computation.
+//   - The result is presented as sin_cos_theta = {sin, cos} with output_valid high until
+//     output_ready is seen.
+//   - WIDTH is the width of the theta port only and must be >= 16.
+// DOC include start: CORDIC portlist
+module CORDICMMIOBlackBox
+#(parameter WIDTH = 16)
+  (
+    input                  clock,
+    input                  reset,
+    output                 input_ready,
+    input                  input_valid,
+    input [WIDTH-1:0]      theta,
+    input                  output_ready,
+    output                 output_valid,
+    output reg [31:0]      sin_cos_theta,
+    output                 busy
+  );
+// DOC include end: CORDIC portlist
+
+  localparam S_IDLE = 2'b00, S_RUN = 2'b01, S_DONE = 2'b10;
+  reg [1:0] state;
+  wire cordic_done;
+  reg cordic_start;
+  wire [15:0] sin_theta_tmp;
+  wire [15:0] cos_theta_tmp;
+  reg [15:0] theta_tmp;
+  assign input_ready = state == S_IDLE;
+  assign output_valid = state == S_DONE;
+  assign busy = state != S_IDLE;
+
+  wire cordic_start_wire;
+  wire [15:0] theta_wire;
+  assign cordic_start_wire = cordic_start;
+  assign theta_wire = theta_tmp;
+
+  always @(posedge clock) begin
+    if (reset)begin
+      state <= S_IDLE;
+    end
+    else if (state == S_IDLE && input_valid)
+      state <= S_RUN;
+    else if (state == S_RUN && cordic_done == 1'b1)
+      state <= S_DONE;
+    else if (state == S_DONE && output_ready)begin
+      state <= S_IDLE;
+    end
+  end
+
+  always @(posedge clock) begin
+    if (reset) begin
+      cordic_start <= 1'b0;
+    end else if (state == S_IDLE && input_valid) begin
+      theta_tmp <= theta[15:0];
+      cordic_start <= 1'b1;
+    end else if (state == S_RUN) begin
+      cordic_start <= 1'b0; // start is a single-cycle pulse
+      // Capture the result on the RUN -> DONE edge so that it is already
+      // stable in the first cycle in which output_valid is high.
+      if (cordic_done)
+        sin_cos_theta <= {sin_theta_tmp, cos_theta_tmp};
+    end
+  end
+  MyCORDIC cordic_inst (
+    .clk(clock),
+    .rst_n(~reset),
+    .theta(theta_tmp),
+    .start(cordic_start_wire),
+    .sin_theta(sin_theta_tmp),
+    .cos_theta(cos_theta_tmp),
+    .done(cordic_done)
+  );
+
+endmodule // CORDICMMIOBlackBox
+
+// MyCORDIC: pipelined CORDIC rotator (registers x/y/z[0..11]) producing sin/cos of a 16-bit angle.
+//   theta is unsigned with 65536 counts per turn; it is folded into the first quadrant,
+//   rotated through the pipeline, and the signs are restored at the output stage.
+//   Outputs are scaled by 2^15. done pulses for one cycle after start.
+module MyCORDIC(
+    input               clk,
+    input               rst_n,
+    input [15:0]        theta,
+    input               start,
+    output reg [15:0]   sin_theta,
+    output reg [15:0]   cos_theta,
+    output reg          done
+);
+
+parameter Kn = 'd19898; // 0.607253*2^15
+parameter iKn = 'd53961; // 1.64676*2^15
+
+parameter arctan_0 = 8192 ; // arctan(1/2^0)
+parameter arctan_1 = 4836 ; // arctan(1/2^1)
+parameter arctan_2 = 2555 ; // arctan(1/2^2)
+parameter arctan_3 = 1297 ; // arctan(1/2^3)
+parameter arctan_4 = 651 ; // arctan(1/2^4)
+parameter arctan_5 = 326 ; // arctan(1/2^5)
+parameter arctan_6 = 163 ; // arctan(1/2^6)
+parameter arctan_7 = 81 ; // arctan(1/2^7)
+parameter arctan_8 = 41 ; // arctan(1/2^8)
+parameter arctan_9 = 20 ; // arctan(1/2^9)
+parameter arctan_10 = 10 ; // arctan(1/2^10)
+parameter arctan_11 = 5 ; // arctan(1/2^11)
+
+reg signed [15:0] x [11:0];
+reg signed [15:0] y [11:0];
+reg signed [15:0] z [11:0];
+
+wire [15:0]x_tmp;
+wire [15:0]y_tmp;
+
+reg signed [15:0]theta_1;
+wire [2:0] Quadrant; // quadrant (1..4) that theta falls in
+
+reg [3:0]count;
+reg enable;
+// Quadrant decode
+assign Quadrant = theta[15:14] + 1;
+
+always@(*) begin
+    theta_1 = {2'b00, theta[13:0]}; // default
+    case (Quadrant)
+        3'd1: theta_1 = theta;
+        3'd2: theta_1 = 32768 - theta;
+        3'd3: theta_1 = theta - 32768;
+        3'd4: theta_1 = 65536 - theta;
+    endcase
+end
+
+always@(posedge clk or negedge rst_n)
+begin
+    if(!rst_n)
+    begin
+        count <= 0;
+        done <= 0;
+        enable <= 0;
+    end
+    else if(enable)
+    begin
+        if(count < 11)
+        begin
+            count <= count + 1;
+            done <= 0;
+        end
+        else
+        begin
+            count <= 0;
+            done <= 1;
+            enable <= 0;
+        end
+    end
+    else if(start)
+    begin
+        enable <= 1;
+        done <= 0;
+        count <= 0;
+    end
+    else if(done)
+    begin
+        done <= 0;
+    end
+end
+
+
+
+always@(posedge clk or negedge rst_n)
+begin
+    if(!rst_n)
+    begin
+        x[0] <= 16'd0;
+        y[0] <= 16'd0;
+        z[0] <= 16'd0;
+    end
+    else
+    begin
+        x[0] <= Kn;
+        y[0] <= 16'd0;
+        z[0] <= theta_1;
+    end
+end
+
+always@(posedge clk or negedge rst_n) // i=0
+begin
+    if(!rst_n)
+    begin
+        x[1] <= 16'd0;
+        y[1] <= 16'd0;
+        z[1] <= 16'd0;
+    end
+    else
+    begin
+        if(z[0][15])
+        begin
+            x[1] <= x[0] + y[0];
+            y[1] <= y[0] - x[0];
+            z[1] <= z[0] + arctan_0;
+        end
+        else
+        begin
+            x[1] <= x[0] - y[0];
+            y[1] <= y[0] + x[0];
+            z[1] <= z[0] - arctan_0;
+        end
+    end
+end
+
+always@(posedge clk or negedge rst_n)
+begin
+    if(!rst_n)
+    begin
+        x[2] <= 16'd0;
+        y[2] <= 16'd0;
+        z[2] <= 16'd0;
+    end
+    else
+    begin
+        if(z[1][15])
+        begin
+            x[2] <= x[1] + (y[1] >>> 1);
+            y[2] <= y[1] - (x[1] >>> 1);
+            z[2] <= z[1] + arctan_1;
+        end
+        else
+        begin
+            x[2] <= x[1] - (y[1] >>> 1);
+            y[2] <= y[1] + (x[1] >>> 1);
+            z[2] <= z[1] - arctan_1;
+        end
+    end
+end
+
+always@(posedge clk or negedge rst_n) // i=2
+begin
+    if(!rst_n)
+    begin
+        x[3] <= 16'd0;
+        y[3] <= 16'd0;
+        z[3] <= 16'd0;
+    end
+    else
+    begin
+        if(z[2][15])
+        begin
+            x[3] <= x[2] + (y[2] >>> 2);
+            y[3] <= y[2] - (x[2] >>> 2);
+            z[3] <= z[2] + arctan_2;
+        end
+        else
+        begin
+            x[3] <= x[2] - (y[2] >>> 2);
+            y[3] <= y[2] + (x[2] >>> 2);
+            z[3] <= z[2] - arctan_2;
+        end
+    end
+end
+
+always@(posedge clk or negedge rst_n)
+begin
+    if(!rst_n)
+    begin
+        x[4] <= 16'd0;
+        y[4] <= 16'd0;
+        z[4] <= 16'd0;
+    end
+    else
+    begin
+        if(z[3][15])
+        begin
+            x[4] <= x[3] + (y[3] >>> 3);
+            y[4] <= y[3] - (x[3] >>> 3);
+            z[4] <= z[3] + arctan_3;
+        end
+        else
+        begin
+            x[4] <= x[3] - (y[3] >>> 3);
+            y[4] <= y[3] + (x[3] >>> 3);
+            z[4] <= z[3] - arctan_3;
+        end
+    end
+end
+
+
+always@(posedge clk or negedge rst_n)
+begin
+    if(!rst_n)
+    begin
+        x[5] <= 16'd0;
+        y[5] <= 16'd0;
+        z[5] <= 16'd0;
+    end
+    else
+    begin
+        if(z[4][15])
+        begin
+            x[5] <= x[4] + (y[4] >>> 4);
+            y[5] <= y[4] - (x[4] >>> 4);
+            z[5] <= z[4] + arctan_4;
+        end
+        else
+        begin
+            x[5] <= x[4] - (y[4] >>> 4);
+            y[5] <= y[4] + (x[4] >>> 4);
+            z[5] <= z[4] - arctan_4;
+        end
+    end
+end
+
+always@(posedge clk or negedge rst_n)
+begin
+    if(!rst_n)
+    begin
+        x[6] <= 16'd0;
+        y[6] <= 16'd0;
+        z[6] <= 16'd0;
+    end
+    else
+    begin
+        if(z[5][15])
+        begin
+            x[6] <= x[5] + (y[5] >>> 5);
+            y[6] <= y[5] - (x[5] >>> 5);
+            z[6] <= z[5] + arctan_5;
+        end
+        else
+        begin
+            x[6] <= x[5] - (y[5] >>> 5);
+            y[6] <= y[5] + (x[5] >>> 5);
+            z[6] <= z[5] - arctan_5;
+        end
+    end
+end
+
+always@(posedge clk or negedge rst_n)
+begin
+    if(!rst_n)
+    begin
+        x[7] <= 16'd0;
+        y[7] <= 16'd0;
+        z[7] <= 16'd0;
+    end
+    else
+    begin
+        if(z[6][15])
+        begin
+            x[7] <= x[6] + (y[6] >>> 6);
+            y[7] <= y[6] - (x[6] >>> 6);
+            z[7] <= z[6] + arctan_6;
+        end
+        else
+        begin
+            x[7] <= x[6] - (y[6] >>> 6);
+            y[7] <= y[6] + (x[6] >>> 6);
+            z[7] <= z[6] - arctan_6;
+        end
+    end
+end
+
+
+always@(posedge clk or negedge rst_n)
+begin
+    if(!rst_n)
+    begin
+        x[8] <= 16'd0;
+        y[8] <= 16'd0;
+        z[8] <= 16'd0;
+    end
+    else
+    begin
+        if(z[7][15])
+        begin
+            x[8] <= x[7] + (y[7] >>> 7);
+            y[8] <= y[7] - (x[7] >>> 7);
+            z[8] <= z[7] + arctan_7;
+        end
+        else
+        begin
+            x[8] <= x[7] - (y[7] >>> 7);
+            y[8] <= y[7] + (x[7] >>> 7);
+            z[8] <= z[7] - arctan_7;
+        end
+    end
+end
+
+always@(posedge clk or negedge rst_n)
+begin
+    if(!rst_n)
+    begin
+        x[9] <= 16'd0;
+        y[9] <= 16'd0;
+        z[9] <= 16'd0;
+    end
+    else
+    begin
+        if(z[8][15])
+        begin
+            x[9] <= x[8] + (y[8] >>> 8);
+            y[9] <= y[8] - (x[8] >>> 8);
+            z[9] <= z[8] + arctan_8;
+        end
+        else
+        begin
+            x[9] <= x[8] - (y[8] >>> 8);
+            y[9] <= y[8] + (x[8] >>> 8);
+            z[9] <= z[8] - arctan_8;
+        end
+    end
+end
+
+always@(posedge clk or negedge rst_n)
+begin
+    if(!rst_n)
+    begin
+        x[10] <= 16'd0;
+        y[10] <= 16'd0;
+        z[10] <= 16'd0;
+    end
+    else
+    begin
+        if(z[9][15])
+        begin
+            x[10] <= x[9] + (y[9] >>> 9);
+            y[10] <= y[9] - (x[9] >>> 9);
+            z[10] <= z[9] + arctan_9;
+        end
+        else
+        begin
+            x[10] <= x[9] - (y[9] >>> 9);
+            y[10] <= y[9] + (x[9] >>> 9);
+            z[10] <= z[9] - arctan_9;
+        end
+    end
+end
+
+always@(posedge clk or negedge rst_n)
+begin
+    if(!rst_n)
+    begin
+        x[11] <= 16'd0;
+        y[11] <= 16'd0;
+        z[11] <= 16'd0;
+    end
+    else
+    begin
+        if(z[10][15])
+        begin
+            x[11] <= x[10] + (y[10] >>> 10);
+            y[11] <= y[10] - (x[10] >>> 10);
+        end
+        else
+        begin
+            x[11] <= x[10] - (y[10] >>> 10);
+            y[11] <= y[10] + (x[10] >>> 10);
+        end
+    end
+end
+
+// Overflow check: a negative final-stage value is clamped to the largest positive value
+assign x_tmp = x[11][15]==1 ? 16'h7FFF : x[11];
+assign y_tmp = y[11][15]==1 ? 16'h7FFF : y[11];
+//assign x_tmp = x[11];
+//assign y_tmp = y[11];
+
+always@(posedge clk or negedge rst_n) // output stage: restore the quadrant signs
+begin
+    if(!rst_n)
+    begin
+        sin_theta <= 16'd0;
+        cos_theta <= 16'd0;
+    end
+    else
+    begin
+        if(Quadrant == 3'd1)
+        begin
+            sin_theta <= y_tmp;
+            cos_theta <= x_tmp;
+        end
+        else if(Quadrant == 3'd2)
+        begin
+            sin_theta <= y_tmp;
+            cos_theta <= -x_tmp;
+        end
+        else if(Quadrant == 3'd3)
+        begin
+            sin_theta <= -y_tmp;
+            cos_theta <= -x_tmp;
+        end
+        else if(Quadrant == 3'd4)
+        begin
+            sin_theta <= -y_tmp;
+            cos_theta <= x_tmp;
+        end
+        else
+        begin
+            sin_theta <= sin_theta;
+            cos_theta <= cos_theta;
+        end
+    end
+end
+endmodule
diff --git a/generators/cordic/src/main/scala/CORDIC_NEW.scala b/generators/cordic/src/main/scala/CORDIC_NEW.scala
new file mode 100644
index 0000000000..94b984d790
--- /dev/null
+++ b/generators/cordic/src/main/scala/CORDIC_NEW.scala
@@ -0,0 +1,163 @@
+package cordic
+import chisel3._
+import chisel3.util._
+import chisel3.experimental.{IntParam, BaseModule}
+
+import chisel3.util.HasBlackBoxResource
+import freechips.rocketchip.amba.axi4._
+import freechips.rocketchip.diplomacy._
+import freechips.rocketchip.regmapper.{HasRegMap, RegField}
+import freechips.rocketchip.tilelink._
+import freechips.rocketchip.subsystem.BaseSubsystem
+import org.chipsalliance.cde.config.{Parameters, Field, Config}
+import freechips.rocketchip.tilelink.TLRegisterNode
+import freechips.rocketchip.util.UIntIsOneOf
+
+/** Configuration of the MMIO CORDIC peripheral.
+  *
+  * @param address     base address of the register block
+  * @param width       width in bits of the theta and sin_cos_theta registers; must be 32
+  * @param useAXI4     attach through an AXI4 register router instead of a TileLink one
+  * @param useBlackBox not read anywhere in this file; the Verilog black box is always used
+  */
+case class CORDICParams(
+  address: BigInt = 0x4000,
+  width: Int = 32,
+  useAXI4: Boolean = false,
+  useBlackBox: Boolean = true) {
+  // CORDICMMIOBlackBox.v drives a fixed 32-bit {sin, cos} result port.
+  require(width == 32, s"CORDICParams.width must be 32, got $width")
+}
+
+case object CORDICKey extends Field[Option[CORDICParams]](None)
+
+class CORDICIO(val w: Int) extends Bundle {
+  val clock = Input(Clock())
+  val reset = Input(Bool())
+  val input_ready = Output(Bool())
+  val input_valid = Input(Bool())
+  val theta = Input(UInt(w.W))
+  val output_ready = Input(Bool())
+  val output_valid = Output(Bool())
+  val sin_cos_theta = Output(UInt(w.W))
+  val busy = Output(Bool())
+}
+
+trait CORDICTopIO extends Bundle {
+  val cordic_busy = Output(Bool())
+}
+
+trait HasCORDICIO extends BaseModule {
+  val w: Int
+  val io = IO(new CORDICIO(w))
+}
+
+class CORDICMMIOBlackBox(val w: Int) extends BlackBox(Map("WIDTH" -> IntParam(w))) with HasBlackBoxResource
+  with HasCORDICIO
+{
+  addResource("/vsrc/CORDICMMIOBlackBox.v")
+}
+// DOC include end: cordic blackbox
+
+/** Register-mapped front end for the CORDIC black box.
+  *
+  * Register map (byte offsets from `params.address`):
+  *   - 0x00 status (read): bit 1 = input_ready, bit 0 = output_valid
+  *   - 0x04 theta (write): drives the black box's input_valid / theta
+  *   - 0x08 sin_cos_theta (read): result word; the read drives the black box's output_ready
+  */
+trait CORDICModule extends HasRegMap {
+
+  val io: CORDICTopIO
+  implicit val p: Parameters
+
+  def params: CORDICParams
+  val clock: Clock
+  val reset: Reset
+  val theta = Wire(new DecoupledIO(UInt(params.width.W)))
+  val sin_cos_theta = Wire(Flipped(DecoupledIO(UInt(params.width.W))))
+
+  val status = Wire(UInt(2.W))
+  val impl = Module(new CORDICMMIOBlackBox(params.width))
+
+  impl.io.clock := clock
+  impl.io.reset := reset.asBool
+  impl.io.theta := theta.bits
+  impl.io.input_valid := theta.valid
+  theta.ready := impl.io.input_ready
+
+  sin_cos_theta.bits := impl.io.sin_cos_theta
+
+  sin_cos_theta.valid := impl.io.output_valid
+
+  impl.io.output_ready := sin_cos_theta.ready
+
+  status := Cat(impl.io.input_ready, impl.io.output_valid)
+  io.cordic_busy := impl.io.busy
+  regmap(
+    0x00 -> Seq(
+      RegField.r(2, status)),
+    0x04 -> Seq(
+      RegField.w(params.width, theta)
+    ),
+    0x08 -> Seq(
+      RegField.r(params.width, sin_cos_theta)
+    )
+  )
+}
+
+class CORDICTL(params: CORDICParams, beatBytes: Int)(implicit p: Parameters)
+  extends TLRegisterRouter(
+    params.address , "cordic", Seq("ucbbar,cordic"), beatBytes = beatBytes)(
+      new TLRegBundle(params, _)with CORDICTopIO)(
+      new TLRegModule(params, _,_) with CORDICModule)
+
+class CORDICAXI4(params: CORDICParams, beatBytes: Int)(implicit p: Parameters)
+  extends AXI4RegisterRouter(
+    params.address,
+    beatBytes = beatBytes)(
+      new AXI4RegBundle(params, _)with CORDICTopIO)(
+      new AXI4RegModule(params, _,_) with CORDICModule)
+
+trait CanHavePeripheryCORDIC { this: BaseSubsystem =>
+  private val portName = "cordic"
+  val cordic_busy = p(CORDICKey) match {
+    case Some(params) => {
+      val cordic = if(params.useAXI4){
+        val cordic = pbus {LazyModule(new CORDICAXI4(params, pbus.beatBytes)(p))}
+        pbus.coupleTo(portName) {
+          cordic.node :=
+          AXI4Buffer () :=
+          TLToAXI4 () :=
+          // toVariableWidthSlave doesn't use holdFirstDeny, which TLToAXI4() needs
+          TLFragmenter(pbus.beatBytes, pbus.blockBytes, holdFirstDeny = true) := _
+        }
+        cordic
+      } else {
+        val cordic = pbus {LazyModule(new CORDICTL(params, pbus.beatBytes)(p))}
+        pbus.coupleTo(portName) {
+          cordic.node := TLFragmenter(pbus.beatBytes, pbus.blockBytes) := _
+        }
+        cordic
+      }
+      val pbus_io = pbus {
+        InModuleBody {
+          val busy = IO(Output(Bool()))
+          busy := cordic.module.io.cordic_busy
+          busy
+        }
+      }
+      val cordic_busy = InModuleBody {
+        val busy = IO(Output(Bool())).suggestName("cordic_busy")
+        busy := pbus_io
+        busy
+      }
+      Some(cordic_busy)
+    }
+    case None => None
+  }
+
+}
+class WithCORDIC(useAXI4: Boolean = false, useBlackBox: Boolean = true) extends Config((site, here, up) => {
+  case CORDICKey => Some(CORDICParams(useAXI4 = useAXI4, useBlackBox = useBlackBox))
+})
diff --git a/tests/Makefile b/tests/Makefile
index 1c6df31b5c..988c1e6a79 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -2,7 +2,7 @@
 # RISCV Toolchain
 #################################
 
-TARGET = riscv64-unknown-elf
+TARGET ?= riscv64-unknown-elf
 
 GCC = $(TARGET)-gcc
 CXX = $(TARGET)-g++
@@ -11,7 +11,10 @@ OBJDUMP = $(TARGET)-objdump
 DG = $(TARGET)-gdb
 SIZE = $(TARGET)-size
 
-
+COMMON_SRCS := cordic.c
+COMMON_OBJS := $(COMMON_SRCS:.c=.o)
+$(COMMON_OBJS): %.o : %.c mmio.h cordic.h
+	$(GCC) $(CFLAGS) -c $< -o $@
 #################################
 # Flags
 #################################
@@ -21,15 +24,15 @@ ARCH = rv64imafdc
 ABI = lp64d
 ARCHFLAGS = -march=$(ARCH) -mabi=$(ABI)
 
 CFLAGS = -std=gnu99 -O2 -fno-common -fno-builtin-printf -Wall
 CFLAGS += $(ARCHFLAGS)
 LDFLAGS = -static
-
+LDLIBS = -lm -Wl,-u,_printf_float
 include libgloss.mk
 
 PROGRAMS = pwm blkdev accum charcount nic-loopback big-blkdev pingd \
 	streaming-passthrough streaming-fir nvdla spiflashread spiflashwrite fft gcd \
-	hello mt-hello symmetric
+	hello mt-hello symmetric test SVD_2x2 fft_soft
 
 .DEFAULT_GOAL := default
 
@@ -45,11 +48,12 @@ spiflash.img: spiflash.py
 %.o: %.S
 	$(GCC) $(CFLAGS) -D__ASSEMBLY__=1 -c $< -o $@
 
-%.o: %.c mmio.h spiflash.h
+%.o: %.c mmio.h spiflash.h cordic.h
 	$(GCC) $(CFLAGS) -c $< -o $@
 
-%.riscv: %.o $(libgloss)
-	$(GCC) $(LDFLAGS) $< -o $@
+%.riscv: %.o $(libgloss) $(COMMON_OBJS)
+	$(GCC) $(LDFLAGS) $< $(COMMON_OBJS) -o $@ $(LDLIBS)
+
 
 %.dump: %.riscv
 	$(OBJDUMP) -D $< > $@
diff --git a/tests/SVD_2x2.c b/tests/SVD_2x2.c
new file mode 100644
index 0000000000..73e3dff852
--- /dev/null
+++ b/tests/SVD_2x2.c
@@ -0,0 +1,141 @@
+
+#include <stdio.h>
+#include <stdint.h>
+#include <unistd.h> // for usleep()
+#include "mmio.h"
+#include <math.h>
+#include "rocc.h"
+#include "cordic.h"
+#include <stdbool.h>
+
+#define EPSILON 1e-10
+#define MAX_ITER 100
+#define PI 3.14159265358979323846
+static inline uint64_t rdcycle() {
+    uint64_t cycle;
+    asm volatile ("rdcycle %0" : "=r"(cycle));
+    return cycle;
+}
+
+/*
+ * One Jacobi rotation for the 2x2 matrix A (called with the symmetric A^T A below).
+ * Writes the rotation [c -s; s c] to eigvec and the two rotated diagonal values to eigval.
+ * CORDIC selects the backend used for arctan/sin/cos.
+ */
+void jacobi_2x2(float A[2][2], float eigval[2], float eigvec[2][2], CORDIC_ENABLE CORDIC) {
+    //uint64_t start = rdcycle();
+    float theta;
+    RISCVarctan(CORDIC,(A[0][0] - A[1][1]),(2* A[0][1]), &theta, false);
+    theta = 0.5f * theta;
+    float c = 0, s = 0;
+    RISCVsin(CORDIC,(theta), &s,false);
+    RISCVcos(CORDIC,(theta), &c,false);
+    eigvec[0][0] = c;
+    eigvec[0][1] = -s;
+    eigvec[1][0] = s;
+    eigvec[1][1] = c;
+
+    eigval[0] = c * c * A[0][0] + 2 * c * s * A[0][1] + s * s * A[1][1];
+    eigval[1] = s * s * A[0][0] - 2 * c * s * A[0][1] + c * c * A[1][1];
+    //uint64_t end = rdcycle();
+    //printf("Jacobi rotation cycles: %lu\n", end - start);
+}
+
+// SVD of an arbitrary 2x2 matrix A: computes U, Sigma and V, prints them, and stores V * Sigma^-1 * U^T in Ainv.
+void svd_2x2(float A[2][2],float Ainv[2][2],CORDIC_ENABLE CORDIC) {
+    uint64_t t1 = rdcycle();
+
+    // Step 1: compute A^T A (2x2)
+    float ATA[2][2];
+    ATA[0][0] = A[0][0]*A[0][0] + A[1][0]*A[1][0];
+    ATA[0][1] = A[0][0]*A[0][1] + A[1][0]*A[1][1];
+    ATA[1][0] = ATA[0][1];
+    ATA[1][1] = A[0][1]*A[0][1] + A[1][1]*A[1][1];
+    uint64_t t2 = rdcycle();
+
+    // Step 2: diagonalise ATA with a Jacobi rotation to obtain V and Sigma^2
+    float eigval[2], V[2][2];
+    jacobi_2x2(ATA, eigval, V, CORDIC);
+    uint64_t t3 = rdcycle();
+
+    // Step 3: singular values are the square roots of the eigenvalues (may need sorting)
+    float sigma[2];
+    sigma[0] = sqrt(fmax(eigval[0], 0));
+    sigma[1] = sqrt(fmax(eigval[1], 0));
+    uint64_t t4 = rdcycle();
+
+    // Step 4: U = A * V / sigma
+    float U[2][2];
+    for (int i = 0; i < 2; i++) {
+        float temp0 = A[0][0]*V[0][i] + A[0][1]*V[1][i];
+        float temp1 = A[1][0]*V[0][i] + A[1][1]*V[1][i];
+        if (sigma[i] > EPSILON) {
+            U[0][i] = temp0 / sigma[i];
+            U[1][i] = temp1 / sigma[i];
+        } else {
+            U[0][i] = 0;
+            U[1][i] = 0;
+        }
+    }
+    uint64_t t5 = rdcycle();
+    float invSigma[2];
+    invSigma[0] = sigma[0] > EPSILON ? 1.0f / sigma[0] : 0.0f; // same guard as U: no division by zero
+    invSigma[1] = sigma[1] > EPSILON ? 1.0f / sigma[1] : 0.0f;
+
+    // Intermediate matrix: invSigma * U^T
+    float SUT[2][2];
+    for (int i = 0; i < 2; i++) { // row
+        for (int j = 0; j < 2; j++) { // col
+            SUT[i][j] = invSigma[i] * U[j][i];
+        }
+    }
+
+    // Final: Ainv = V * SUT
+    for (int i = 0; i < 2; i++) {
+        for (int j = 0; j < 2; j++) {
+            Ainv[i][j] = V[i][0] * SUT[0][j] + V[i][1] * SUT[1][j];
+        }
+    }
+
+    printf("step1: %lu step2: %lu step3: %lu step4: %lu total: %lu\n", t2-t1, t3-t2, t4-t3, t5-t4, t5 - t1);
+    // Print the results
+    printf("Right singular vectors V:\n");
+    printf("[%.5f %.5f]\n", V[0][0], V[0][1]);
+    printf("[%.5f %.5f]\n", V[1][0], V[1][1]);
+
+    printf("\nSingular values (Σ):\n");
+    printf("[%.5f 0]\n", sigma[0]);
+    printf("[0 %.5f]\n", sigma[1]);
+
+    printf("\nLeft singular vectors U:\n");
+    printf("[%.5f %.5f]\n", U[0][0], U[0][1]);
+    printf("[%.5f %.5f]\n", U[1][0], U[1][1]);
+
+
+    printf("SVD Inverse of A:\n");
+    printf("[%.5f %.5f]\n", Ainv[0][0], Ainv[0][1]);
+    printf("[%.5f %.5f]\n", Ainv[1][0], Ainv[1][1]);
+}
+
+int main() {
+    float A[2][2] = {
+        {3, 1},
+        {3, 2}
+    };
+    float Ainv[2][2], Binv[2][2], Cinv[2][2];
+    printf("\nUsing Without CORDIC:\n");
+    svd_2x2(A,Ainv,NONE);
+    float B[2][2] = {
+        {3, 1},
+        {3, 2}
+    };
+    printf("\nUsing With MMIO_CORDIC:\n");
+    svd_2x2(B,Binv,CORDIC_MMIO);
+    float C[2][2] = {
+        {3, 1},
+        {3, 2}
+    };
+    printf("\nUsing With ROCC_CORDIC:\n");
+    svd_2x2(C,Cinv,CORDIC_ROCC);
+    return 0;
+}
diff --git a/tests/cordic.c b/tests/cordic.c
new file mode 100644
index 0000000000..53a2dd7b8f
--- /dev/null
+++ b/tests/cordic.c
@@ -0,0 +1,141 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <unistd.h> // for usleep()
+#include "mmio.h"
+#include "rocc.h"
+#include "cordic.h"
+#include <math.h>
+#include <stdbool.h>
+#define PI 3.14159265358979323846
+
+/* Read the cycle counter. */
+static inline uint64_t rdcycle(void) {
+    uint64_t cycle;
+    asm volatile ("rdcycle %0" : "=r"(cycle));
+    return cycle;
+}
+
+/*
+ * Convert radians to accelerator angle units (CORDIC_SCALE counts per radian).
+ * The cast goes through a signed type because converting a negative float
+ * directly to an unsigned integer is undefined behaviour.
+ */
+static uint32_t cordic_angle(float theta_rad) {
+    return (uint32_t)(int32_t)(theta_rad * CORDIC_SCALE);
+}
+
+/* cos via the RoCC instruction; theta_rad is already in accelerator angle units. */
+void cosrocc(uint32_t theta_rad, float *cos_val) {
+    uint64_t result;
+    ROCC_INSTRUCTION_DS(3, result, theta_rad, CORDIC_FUNCT_COS);
+    int16_t raw = (int16_t)(result & 0xFFFF);
+    *cos_val = raw / 32768.0f;
+}
+
+/* sin via the RoCC instruction; theta_rad is already in accelerator angle units. */
+void sinrocc(uint32_t theta_rad, float *sin_val) {
+    uint64_t result;
+    ROCC_INSTRUCTION_DS(3, result, theta_rad, CORDIC_FUNCT_SIN);
+    int16_t raw = (int16_t)(result & 0xFFFF);
+    *sin_val = raw / 32768.0f;
+}
+
+/*
+ * arctan via the RoCC instruction, converted back to radians.
+ * x and y are handed over as raw 16-bit patterns (cast to uint16_t first).
+ */
+void arctanROCC(int16_t x, int16_t y, float *theta_val) {
+    uint16_t result;
+    ROCC_INSTRUCTION_DSS(3, result, (uint16_t)x, (uint16_t)y, CORDIC_FUNCT_ARCTAN);
+    *theta_val = ((float)result) / CORDIC_SCALE;
+}
+
+/*
+ * One transaction on the MMIO block: wait for STATUS bit 1, write the angle,
+ * wait for STATUS bit 0, then return the raw SIN_COS word.
+ */
+static uint32_t cordic_mmio_transact(uint32_t theta_rad) {
+    while ((reg_read32(CORDIC_STATUS) & 0x2) == 0) {
+    }
+    reg_write32(CORDIC_THETA, theta_rad);
+    while ((reg_read32(CORDIC_STATUS) & 0x1) == 0) {
+    }
+    return reg_read32(CORDIC_SIN_COS);
+}
+
+/* cos via the MMIO block: bits [15:0] of the result word. */
+void cosmmio(uint32_t theta_rad, float *cos_val){
+    uint32_t cos_sin_val = cordic_mmio_transact(theta_rad);
+    *cos_val = ((int16_t)(cos_sin_val & 0xFFFF)) / 32768.0f;
+}
+
+/* sin via the MMIO block: bits [31:16] of the result word. */
+void sinmmio(uint32_t theta_rad, float *sin_val){
+    uint32_t cos_sin_val = cordic_mmio_transact(theta_rad);
+    *sin_val = ((int16_t)((cos_sin_val >> 16) & 0xFFFF)) / 32768.0f;
+}
+
+/* sin(theta_rad) with the selected backend; optionally prints the value and the cycle count. */
+void RISCVsin(CORDIC_ENABLE CORDIC,float theta_rad, float *sin_val, bool print_enable){
+    uint64_t start = rdcycle();
+    if(CORDIC == CORDIC_MMIO){
+        sinmmio(cordic_angle(theta_rad), sin_val);
+    }else if(CORDIC == CORDIC_ROCC){
+        sinrocc(cordic_angle(theta_rad), sin_val);
+    }else{
+        *sin_val = sinf(theta_rad);
+    }
+    uint64_t end = rdcycle();
+    if(print_enable){
+        if(CORDIC == CORDIC_MMIO){
+            printf(" %.6f, %lu, ", *sin_val, end - start);
+        }else{
+            printf("%.6f, %lu, ", *sin_val, end - start);
+        }
+    }
+}
+
+/* cos(theta_rad) with the selected backend; optionally prints the value and the cycle count. */
+void RISCVcos(CORDIC_ENABLE CORDIC,float theta_rad, float *cos_val,bool print_enable){
+    uint64_t start = rdcycle();
+    if(CORDIC == CORDIC_MMIO){
+        cosmmio(cordic_angle(theta_rad), cos_val);
+    }else if(CORDIC == CORDIC_ROCC){
+        cosrocc(cordic_angle(theta_rad), cos_val);
+    }else{
+        *cos_val = cosf(theta_rad);
+    }
+    uint64_t end = rdcycle();
+    if(print_enable){
+        if(CORDIC == CORDIC_MMIO){
+            printf(" %.6f, %lu, ", *cos_val, end - start);
+        }else if(CORDIC == CORDIC_ROCC){
+            printf("%.6f, %lu\n", *cos_val, end - start);
+        }else{
+            printf("%.6f, %lu, ", *cos_val, end - start);
+        }
+    }
+}
+
+/*
+ * Angle of the point (x, y) in radians; the software path returns a value in [0, 2*pi).
+ * Only CORDIC_ROCC uses hardware: NONE and CORDIC_MMIO both fall back to atan2f.
+ * On the RoCC path x and y are truncated to 16-bit integers, so small operands should be pre-scaled.
+ */
+void RISCVarctan(CORDIC_ENABLE CORDIC,float x,float y, float *theta_val,bool print_enable){
+    uint64_t start = rdcycle();
+    if(CORDIC == CORDIC_ROCC){
+        arctanROCC((int16_t)x,(int16_t)y,theta_val);
+    }else{
+        *theta_val = atan2f(y, x);
+        if(*theta_val < 0) {
+            *theta_val += 2 * PI;
+        }
+    }
+    uint64_t end = rdcycle();
+    if(print_enable){
+        printf(", %.6f, %lu", ((*theta_val) * 180 / PI), end - start);
+        if(CORDIC == NONE)
+            printf("\n");
+    }
+}
diff --git a/tests/cordic.h b/tests/cordic.h
new file mode 100644
index 0000000000..b9f9150ef1
--- /dev/null
+++ b/tests/cordic.h
@@ -0,0 +1,37 @@
+// cordic.h
+#ifndef CORDIC_H
+#define CORDIC_H
+
+#include <stdint.h>
+#include <stdbool.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Shared constants / addresses
+#define CORDIC_BASE 0x4000
+#define CORDIC_STATUS (CORDIC_BASE + 0x00)
+#define CORDIC_THETA (CORDIC_BASE + 0x04)
+#define CORDIC_SIN_COS (CORDIC_BASE + 0x08)
+#define CORDIC_FUNCT_ARCTAN 0x02
+#define CORDIC_FUNCT_COS 0x01
+#define CORDIC_FUNCT_SIN 0x00
+#define CORDIC_SCALE (32768.0f / 3.1415926f)
+
+typedef enum {
+    NONE = 0,
+    CORDIC_MMIO = 1,
+    CORDIC_ROCC = 2
+} CORDIC_ENABLE;
+void arctanROCC(int16_t x,int16_t y, float *theta_val);
+void cosrocc(uint32_t theta_rad, float *cos_val);
+void sinrocc(uint32_t theta_rad, float *sin_val);
+void cosmmio(uint32_t theta_rad, float *cos_val);
+void sinmmio(uint32_t theta_rad, float *sin_val);
+void RISCVsin(CORDIC_ENABLE en, float theta_rad, float *sin_val,bool print_enable);
+void RISCVcos(CORDIC_ENABLE en, float theta_rad, float *cos_val,bool print_enable);
+void RISCVarctan(CORDIC_ENABLE CORDIC,float x,float y, float *theta_val,bool print_enable);
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/tests/fft_soft.c b/tests/fft_soft.c
new file mode 100644
index 0000000000..a034e4e908
--- /dev/null
+++ b/tests/fft_soft.c
@@ -0,0 +1,119 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <unistd.h> // for usleep()
+#include "mmio.h"
+#include "rocc.h"
+#include "cordic.h"
+#include <math.h>
+
+#define PI 3.14159265358979323846
+#define N 8 // transform length; must be a power of two
+static inline uint64_t rdcycle() {
+    uint64_t cycle;
+    asm volatile ("rdcycle %0" : "=r"(cycle));
+    return cycle;
+}
+typedef struct {
+    float real;
+    float imag;
+} Complex;
+
+/* Reduce an angle in radians to the range [0, 2*pi). */
+static inline float wrap_to_2pi(float theta) {
+    theta = fmodf(theta, 2.0f * PI); // now in (-2*pi, 2*pi)
+    if (theta < 0)
+        theta += 2.0f * PI;
+    return theta;
+}
+
+/*
+ * In-place radix-2 FFT of the N samples in x (bit-reversal followed by butterflies).
+ * CORDIC is a CORDIC_ENABLE value selecting how the twiddle factors are computed.
+ * Prints the elapsed cycle count.
+ */
+void fft(Complex *x,int CORDIC) {
+    uint64_t start = rdcycle();
+    // Bit reversal permutation
+    int j = 0;
+    for (int i = 0; i < N; i++) {
+        if (i < j) {
+            Complex temp = x[i];
+            x[i] = x[j];
+            x[j] = temp;
+        }
+        int m = N >> 1;
+        while (m >= 1 && j >= m) {
+            j -= m;
+            m >>= 1;
+        }
+        j += m;
+    }
+
+    // Cooley-Tukey butterflies, one pass per stage size m = 2, 4, ..., N
+    for (int m = 2; m <= N; m <<= 1) {
+        float theta = wrap_to_2pi(-2 * PI / m);
+        float wm_cos = 0, wm_sin = 0;
+        RISCVsin((CORDIC_ENABLE)CORDIC, theta, &wm_sin, false);
+        RISCVcos((CORDIC_ENABLE)CORDIC, theta, &wm_cos, false);
+        Complex wm = { wm_cos, wm_sin };
+        for (int k = 0; k < N; k += m) {
+            Complex w = {1.0, 0.0};
+            for (int i = 0; i < m / 2; i++) {
+                Complex t, u;
+                t.real = w.real * x[k + i + m/2].real - w.imag * x[k + i + m/2].imag;
+                t.imag = w.real * x[k + i + m/2].imag + w.imag * x[k + i + m/2].real;
+
+                u = x[k + i];
+
+                x[k + i].real = u.real + t.real;
+                x[k + i].imag = u.imag + t.imag;
+                x[k + i + m/2].real = u.real - t.real;
+                x[k + i + m/2].imag = u.imag - t.imag;
+
+                // w = w * wm
+                float w_real = w.real * wm.real - w.imag * wm.imag;
+                float w_imag = w.real * wm.imag + w.imag * wm.real;
+                w.real = w_real;
+                w.imag = w_imag;
+            }
+        }
+    }
+    uint64_t end = rdcycle();
+    printf("FFT cycles: %lu\n", end - start);
+}
+
+/* Test vector shared by all three runs. */
+static const Complex fft_input[N] = {
+    { 0.707, -0.707 },
+    { 0.000, 0.996 },
+    {-0.707, -0.707 },
+    {-1.000, 0.000 },
+    {-0.707, 0.707 },
+    { 0.000, 0.004 },
+    { 0.707, 0.707 },
+    { 1.000, 0.000 },
+};
+
+/* Run the FFT on a fresh copy of the test vector with the given backend and print the result. */
+static void run_case(const char *label, CORDIC_ENABLE mode) {
+    Complex data[N];
+    for (int i = 0; i < N; i++)
+        data[i] = fft_input[i];
+    printf("%s", label);
+    fft(data, mode);
+    printf("FFT result:\n");
+    for (int i = 0; i < N; i++) {
+        printf("[%d] %.4f ", i, data[i].real);
+        // Only a negative imaginary part gets '-'; zero is printed with '+'
+        printf("%c", data[i].imag < 0 ? '-' : '+');
+        printf(" j%.4f\n", fabsf(data[i].imag));
+    }
+}
+
+int main() {
+    run_case("using Standard function:\n", NONE);
+    run_case("using CORDIC_MMIO function:\n", CORDIC_MMIO);
+    run_case("using CORDIC_ROCC function:\n", CORDIC_ROCC);
+    return 0;
+}
diff --git a/tests/test.c b/tests/test.c
new file mode 100644
index 0000000000..1ba1811368
--- /dev/null
+++ b/tests/test.c
@@ -0,0 +1,44 @@
+#include"cordic.h"
+#include <stdio.h>
+#include <stdint.h>
+#include <math.h>
+#define PI 3.14159265358979323846
+int main() {
+    printf("CORDIC Test Program\n");
+    float sin_val, cos_val;
+    float x = 1, y = 0;
+    float theta;
+    printf("y ,x ,expect_angle ,acc_arctan_val, acc_arctan_cycle ,arctan_std_val,arctan_std_cycle\n");
+    for(int i = 0;i < 360;i = i + 15){
+        float theta_test = i * PI / 180.0f; // degrees to radians
+        x = 1000 * cosf(theta_test);
+        y = 1000 * sinf(theta_test);
+        printf("%.6f,%.6f, %.6f ",y / 1000,x / 1000, theta_test * 180 / PI);
+
+        RISCVarctan(CORDIC_ROCC,x,y, &theta, true);
+        RISCVarctan(NONE, x, y, &theta, true);
+
+    }
+    // CSV header for the sin/cos sweep; adjacent literals keep source indentation out of the output
+    printf("degrees, sinf_val, sinf_cycle, cosf_val, cosf_cycle, "
+           "MMIO_sin_val, MMIO_sin_cycle, MMIO_cos_val, MMIO_cos_cycle, "
+           "ROCC_sin_val, ROCC_sin_cycle, ROCC_cos_val, ROCC_cos_cycle\n");
+    for(int i = 0; i < 360; i += 10) {
+        float theta_rad = i * (3.1415926f / 180.0f); // degrees to radians
+
+        printf("%d, ", i);
+
+        // Standard library functions
+        RISCVsin(NONE, theta_rad, &sin_val, true);
+        RISCVcos(NONE, theta_rad, &cos_val,true);
+
+        // MMIO CORDIC
+        RISCVsin(CORDIC_MMIO, theta_rad, &sin_val, true);
+        RISCVcos(CORDIC_MMIO, theta_rad, &cos_val,true);
+
+        // RoCC CORDIC
+        RISCVsin(CORDIC_ROCC, theta_rad, &sin_val, true);
+        RISCVcos(CORDIC_ROCC, theta_rad, &cos_val,true);
+    }
+    return 0;
+}