diff --git a/c_c++/linux/x64/test_apps/Makefile b/c_c++/linux/x64/test_apps/Makefile index 23662f4..1a92c20 100644 --- a/c_c++/linux/x64/test_apps/Makefile +++ b/c_c++/linux/x64/test_apps/Makefile @@ -41,7 +41,7 @@ program_OBJS := $(program_C_OBJS) $(program_CXX_OBJS) program_INCLUDE_DIRS := program_LIBRARY_DIRS := program_LIBRARIES := riffa -CPPFLAGS += -g +CPPFLAGS += -g -pedantic -Wall -Werror -Wextra -pthread -fsanitize=thread,undefined CPPFLAGS += $(foreach includedir,$(program_INCLUDE_DIRS),-I$(includedir)) LDFLAGS += $(foreach librarydir,$(program_LIBRARY_DIRS),-L$(librarydir)) diff --git a/c_c++/linux/x64/test_apps/testutil.c b/c_c++/linux/x64/test_apps/testutil.c index 16f7025..ec2648d 100644 --- a/c_c++/linux/x64/test_apps/testutil.c +++ b/c_c++/linux/x64/test_apps/testutil.c @@ -32,26 +32,38 @@ // USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH // DAMAGE. // ---------------------------------------------------------------------- + +#include +#include #include #include #include "timer.h" #include "riffa.h" #define NUM_TESTS 100 +struct thread_info { /* Used as argument to thread_start() */ + + // please refer to API of fpga_send() and fpga_recv() at http://riffa.ucsd.edu/node/10 or https://github.com/KastnerRG/riffa/blob/master/driver/linux/riffa.c#L84-L111 + fpga_t * fpga; + unsigned int chnl; + unsigned int * buffer; + unsigned int len; + unsigned int offset; + unsigned int last; + long long timeout; +}; + int main(int argc, char** argv) { fpga_t * fpga; fpga_info_list info; int option; - int i; + unsigned int i; int id; int chnl; - size_t numWords; - int sent; - int recvd; - int failure = 0; + unsigned int numWords; unsigned int * sendBuffer; unsigned int * recvBuffer; - int err; + GET_TIME_INIT(3); if (argc < 2) { @@ -69,7 +81,7 @@ int main(int argc, char** argv) { return -1; } printf("Number of devices: %d\n", info.num_fpgas); - for (i = 0; i < info.num_fpgas; i++) { + for (i = 0; i < (unsigned int)info.num_fpgas; i++) { printf("%d: 
id:%d\n", i, info.id[i]); printf("%d: num_chnls:%d\n", i, info.num_chnls[i]); printf("%d: name:%s\n", i, info.name[i]); @@ -104,12 +116,12 @@ int main(int argc, char** argv) { return -1; } - size_t maxWords, minWords; + unsigned int maxWords, minWords; id = atoi(argv[2]); chnl = atoi(argv[3]); minWords = 4; // Must be at least 4 for the channel tester app maxWords = atoi(argv[4]); - printf("Running bandwidth test from %zu up to %zu words\n", minWords, maxWords); + printf("Running bandwidth test from %d up to %d words\n", minWords, maxWords); // Get the device with id fpga = fpga_open(id); @@ -134,410 +146,105 @@ int main(int argc, char** argv) { return -1; } - int numWords; - for (numWords = minWords; numWords <= maxWords; numWords = numWords*2) { - int j; - for (j = 0; j < NUM_TESTS + 1; ++j) { + for (numWords = minWords; numWords <= maxWords; numWords += (2*numWords <= maxWords) ? numWords : (maxWords-numWords)) { // adaptively change the buffer size for the final iteration, and double the transaction size for every iteration + //int j; + //for (j = 0; j < NUM_TESTS + 1; ++j) { // Initialize the data for (i = 0; i < numWords; i++) { sendBuffer[i] = i+1; recvBuffer[i] = 0; } - GET_TIME_VAL(0); - - // Send the data - sent = fpga_send(fpga, chnl, sendBuffer, numWords, 0, 1, 25000); - printf("Test %d: words sent: %d\n", j, sent); + int NTH = 2; // number of threads : one fpga_recv() thread, one fpga_send() thread - GET_TIME_VAL(1); - - if (sent != 0) { - // Recv the data - recvd = fpga_recv(fpga, chnl, recvBuffer, numWords, 25000); - printf("Test %d: words recv: %d\n", j, recvd); - } + pthread_t tid[NTH]; + struct thread_info tinfo[NTH]; + /* + struct thread_info *tinfo; + // Allocate memory for pthread_create() arguments + tinfo = calloc(NTH, sizeof(struct thread_info)); + if (tinfo == NULL) + exit(1); + */ - GET_TIME_VAL(2); + unsigned int ret_val[NTH]; // retval[0] is number of words sent; retval[1] is number of words received - // Check the data - if (recvd != 0) 
{ - for (i = 4; i < recvd; i++) { - if (recvBuffer[i] != sendBuffer[i]) { - printf("recvBuffer[%d]: %d, expected %d\n", i, recvBuffer[i], sendBuffer[i]); - return; - } - } - - if (j > 0) - printf("send bw: %f\n", - sent*4.0/1000/1000/((TIME_VAL_TO_MS(1) - TIME_VAL_TO_MS(0))/1000.0)); //, - - if (j > 0) - printf("recv bw: %f\n", - recvd*4.0/1000/1000/((TIME_VAL_TO_MS(2) - TIME_VAL_TO_MS(1))/1000.0)); //, - } - } - } - // Done with device - fpga_close(fpga); - } - else if (option == 3) { // Send data, receive data - if (argc < 5) { - printf("Usage: %s %d \n", argv[0], option); - return -1; - } - - size_t maxWords, minWords; - id = atoi(argv[2]); - chnl = atoi(argv[3]); - minWords = 4; // Must be at least 4 for the channel tester app - maxWords = atoi(argv[4]); - printf("Running receive offset test from %zu up to %zu words\n", minWords, maxWords); - - // Get the device with id - fpga = fpga_open(id); - if (fpga == NULL) { - printf("Could not get FPGA %d\n", id); - return -1; - } - - // Malloc the arrays (page aligned) - printf("Asked for %zu bytes\n",((maxWords*sizeof(unsigned int)*2 + 4096)/4096)*4096 + 4096); - err = posix_memalign((void **)&sendBuffer, 4096, ((maxWords*sizeof(unsigned int)*2 + 4096)/4096)*4096 + 4096); - if (sendBuffer == NULL) { - printf("Could not malloc memory for sendBuffer\n"); - fpga_close(fpga); - return -1; - } - err = posix_memalign((void **)&recvBuffer, 4096, ((maxWords*sizeof(unsigned int)*2 + 4096)/4096)*4096 + 4096); - - recvBuffer = (unsigned int *)malloc(((maxWords*sizeof(unsigned int)*2 + 4096)/4096)*4096 + 4096); - if (recvBuffer == NULL) { - printf("Could not malloc memory for recvBuffer\n"); - free(sendBuffer); - fpga_close(fpga); - return -1; - } + int loop = 0; // for pthread_join() - int numWords; - for (numWords = minWords; numWords <= maxWords; numWords = numWords*2) { - int j; - for (j = 0; j < 4096/sizeof(unsigned int); j++) { - int rxOffset = j; - // Initialize the data - for (i = 0; i < numWords; i++) { - sendBuffer[i+ 
rxOffset] = i+1; - recvBuffer[i] = 0; + //printf("\n Going to create threads \n"); + /** Creation of threads*/ + /* for(loop=0; loop 0) - printf("send bw: %f\n", - sent*4.0/1000/1000/((TIME_VAL_TO_MS(1) - TIME_VAL_TO_MS(0))/1000.0)); //, + /** Synch of threads in order to exit normally*/ + GET_TIME_VAL(0); - if (j > 0) - printf("recv bw: %f\n", - recvd*4.0/1000/1000/((TIME_VAL_TO_MS(2) - TIME_VAL_TO_MS(1))/1000.0)); //, - if(failure) - return; + for(loop=0; loop \n", argv[0], option); - return -1; - } - size_t maxWords, minWords; - id = atoi(argv[2]); - chnl = atoi(argv[3]); - minWords = 4; // Must be at least 4 for the channel tester app - maxWords = atoi(argv[4]); - printf("Running tx offset test from %zu up to %zu words\n", minWords, maxWords); - - // Get the device with id - fpga = fpga_open(id); - if (fpga == NULL) { - printf("Could not get FPGA %d\n", id); - return -1; - } - - // Malloc the arrays (page aligned) - printf("Asked for %zu bytes\n",((maxWords*sizeof(unsigned int)*2 + 4096)/4096)*4096 + 4096); - - err = posix_memalign((void **)&sendBuffer, 4096, ((maxWords*sizeof(unsigned int)*2 + 4096)/4096)*4096 + 4096); - if (err) { - printf("Could not malloc memory for sendBuffer\n"); - fpga_close(fpga); - return -1; - } - err = posix_memalign((void **)&recvBuffer, 4096, ((maxWords*sizeof(unsigned int)*2 + 4096)/4096)*4096 + 4096); - - if (err) { - printf("Could not malloc memory for recvBuffer\n"); - free(sendBuffer); - fpga_close(fpga); - return -1; - } - - int numWords; - for (numWords = minWords; numWords <= maxWords; numWords = numWords*2) { - int j; - for (j = 0; j < 4096/sizeof(unsigned int); ++j) { - int txOffset = j; - // Initialize the data - for (i = 0; i < numWords; i++) { - sendBuffer[i] = i+1; - recvBuffer[i + txOffset] = 0; - } - - GET_TIME_VAL(0); + GET_TIME_VAL(1); - // Send the data - sent = fpga_send(fpga, chnl, sendBuffer, numWords, 0, 1, 25000); - printf("Test %d: words sent: %d (Address %p) \n", j, sent, sendBuffer); + const double 
MILLI_CONVERSION = 1000.0; // converts milliseconds to seconds + const unsigned int BIRECTION = 2; // two ways, so total number of data transferred is doubled + double total_execution_time = ((TIME_VAL_TO_MS(1) - TIME_VAL_TO_MS(0)) / MILLI_CONVERSION); // in seconds - GET_TIME_VAL(1); + printf("number of words sent = %d\n\r", ret_val[0]); + printf("number of words recv = %d\n\r", ret_val[1]); - if (sent != 0) { - // Recv the data - recvd = fpga_recv(fpga, chnl, &recvBuffer[txOffset], numWords, 25000); - printf("Test %d: words recv: %d (Address %p) \n", j, recvd, &recvBuffer[txOffset]); - } + printf("Total execution time = %f s\n\r", total_execution_time); - GET_TIME_VAL(2); + if(ret_val[1] == numWords) // number of words sent == number of words received + { + const int GIGA_CONVERSION = 1000*1000*1000; // converts Bps to GBps + const int BYTES_PER_WORD = 4; // 32-bit = 4 bytes + const int WORDS_PER_TRANSACTION = 4; // we are using 128-bit PCIe interface. therefore there are 4 32-bit words in each transaction - // Check the data - if (recvd != 0) { - for (i = 4; i < recvd; i++) { - if (recvBuffer[i + txOffset] != sendBuffer[i]) { - printf("recvBuffer[%d]: %d, expected %d\n", i, recvBuffer[i + txOffset], sendBuffer[i]); - failure = 1; + // check the data + for (i = WORDS_PER_TRANSACTION; i < numWords; i++) { // the first 4 32-bit words are always corrupted, please refer to explanation given at https://pergamos.lib.uoa.gr/uoa/dl/frontend/file/lib/default/data/1326221/theFile#page=38 + if (recvBuffer[i] != sendBuffer[i]) { + printf("recvBuffer[%d]: %d, expected %d\n", i, recvBuffer[i], sendBuffer[i]); + return -1; } } - - if (j > 0) - printf("send bw: %f\n", - sent*4.0/1000/1000/((TIME_VAL_TO_MS(1) - TIME_VAL_TO_MS(0))/1000.0)); //, - - if (j > 0) - printf("recv bw: %f\n", - recvd*4.0/1000/1000/((TIME_VAL_TO_MS(2) - TIME_VAL_TO_MS(1))/1000.0)); //, - if(failure) - return; - + printf("Overall bandwidth: %f GBps\n\n", 
(double)BIRECTION*numWords*(double)BYTES_PER_WORD/(double)GIGA_CONVERSION/total_execution_time); } - } + + if(numWords == maxWords) break; // last iteration, so exit the loop } - // Done with device - fpga_close(fpga); - } - else if (option == 5) { // Send data, receive data - if (argc < 7) { - printf("Usage: %s %d \n", argv[0], option); - return -1; - } - size_t offset; - size_t numWords; - unsigned int numIter; - id = atoi(argv[2]); - chnl = atoi(argv[3]); - offset = atoi(argv[4]) % (4096 / sizeof(unsigned int)); - if(numWords < 4) { - printf("Must transfer at least 4 words %d\n", id); - return -1; - } - numWords = atoi(argv[5]); - numIter = atoi(argv[6]); - printf("Running single test with %zu words, from host-page offset %zu \n", numWords, offset); - // Get the device with id - fpga = fpga_open(id); - if (fpga == NULL) { - printf("Could not get FPGA %d\n", id); - return -1; - } - - // Malloc the arrays (page aligned) - printf("Asked for %zu bytes\n",((numWords*sizeof(unsigned int) + 4096)/4096)*4096 + 4096); - err = posix_memalign((void **)&sendBuffer, 4096, ((numWords*sizeof(unsigned int)*2 + 4096)/4096)*4096 + 4096); - if (err) { - printf("Could not malloc memory for sendBuffer\n"); - fpga_close(fpga); - return -1; - } - - err = posix_memalign((void **)&recvBuffer, 4096, ((numWords*sizeof(unsigned int)*2 + 4096)/4096)*4096 + 4096); - if (err) { - printf("Could not malloc memory for recvBuffer\n"); - free(sendBuffer); - fpga_close(fpga); - return -1; - } - - int j; - for (j = 0; j < numIter; ++j) { - for (i = 0; i < numWords; i++) { - sendBuffer[i + offset] = i+1; - recvBuffer[i] = 0; - } - - GET_TIME_VAL(0); - - // Send the data - sent = fpga_send(fpga, chnl, &sendBuffer[offset], numWords, 0, 1, 25000); - printf("words sent: %d\n", sent); - - GET_TIME_VAL(1); - - if (sent != 0) { - // Recv the data - recvd = fpga_recv(fpga, chnl, recvBuffer, numWords, 25000); - printf("words recv: %d\n", recvd); - } - - GET_TIME_VAL(2); - - // Check the data - if (recvd != 0) 
{ - for (i = 4; i < recvd; i++) { - if (recvBuffer[i] != sendBuffer[i + offset]) { - printf("recvBuffer[%d]: %d, expected %d\n", i, recvBuffer[i], sendBuffer[i + offset]); - failure = 1; - } - } - - printf("send bw: %f\n", - sent*4.0/1000/1000/((TIME_VAL_TO_MS(1) - TIME_VAL_TO_MS(0))/1000.0)); //, - - printf("recv bw: %f\n", - recvd*4.0/1000/1000/((TIME_VAL_TO_MS(2) - TIME_VAL_TO_MS(1))/1000.0)); //, - if(failure) - return; - - } - } // Done with device - fpga_close(fpga); + fpga_close(fpga); } - else if (option == 6) { // Send data, receive data - if (argc < 7) { - printf("Usage: %s %d \n", argv[0], option); - return -1; - } - size_t offset; - size_t numWords; - unsigned int numIter; - - id = atoi(argv[2]); - chnl = atoi(argv[3]); - offset = atoi(argv[4]) % (4096 / sizeof(unsigned int)); - if(numWords < 4) { - printf("Must transfer at least 4 words %d\n", id); - return -1; - } - numWords = atoi(argv[5]); - numIter = atoi(argv[6]); - printf("Running single test with %zu words, to host-page offset %zu \n", numWords, offset); - - // Get the device with id - fpga = fpga_open(id); - if (fpga == NULL) { - printf("Could not get FPGA %d\n", id); - return -1; - } - - // Malloc the arrays (page aligned) - printf("Asked for %zu bytes\n",((numWords*sizeof(unsigned int) + 4096)/4096)*4096 + 4096); - err = posix_memalign((void **)&sendBuffer, 4096, ((numWords*sizeof(unsigned int)*2 + 4096)/4096)*4096 + 4096); - if (err) { - printf("Could not malloc memory for sendBuffer\n"); - fpga_close(fpga); - return -1; - } - - err = posix_memalign((void **)&recvBuffer, 4096, ((numWords*sizeof(unsigned int)*2 + 4096)/4096)*4096 + 4096); - if (err) { - printf("Could not malloc memory for recvBuffer\n"); - free(sendBuffer); - fpga_close(fpga); - return -1; - } - - int j; - for (j = 0; j < numIter; ++j) { - for (i = 0; i < numWords; i++) { - sendBuffer[i] = i+1; - recvBuffer[i + offset] = 0; - } - - GET_TIME_VAL(0); - - // Send the data - sent = fpga_send(fpga, chnl, sendBuffer, numWords, 0, 
1, 25000); - printf("test %d: words sent: %d\n", j, sent); - GET_TIME_VAL(1); - - if (sent != 0) { - // Recv the data - recvd = fpga_recv(fpga, chnl, &recvBuffer[offset], numWords, 25000); - printf("test %d: words recv: %d (Address %p %p)\n", j, recvd, &recvBuffer[offset], &recvBuffer[offset+numWords]); - } - - GET_TIME_VAL(2); - - // Check the data - if (recvd != 0) { - for (i = 4; i < recvd; i++) { - if (recvBuffer[i + offset] != sendBuffer[i]) { - printf("recvBuffer[%d]: %d, expected %d\n", i, recvBuffer[i + offset], sendBuffer[i]); - failure = 1; - } - } - - printf("send bw: %f\n", - sent*4.0/1000/1000/((TIME_VAL_TO_MS(1) - TIME_VAL_TO_MS(0))/1000.0)); //, - - printf("recv bw: %f\n", - recvd*4.0/1000/1000/((TIME_VAL_TO_MS(2) - TIME_VAL_TO_MS(1))/1000.0)); //, - if(failure) - return; - - } - } - // Done with device - fpga_close(fpga); - } return 0; } diff --git a/driver/linux/Makefile b/driver/linux/Makefile index 97952bd..3e3cb0c 100644 --- a/driver/linux/Makefile +++ b/driver/linux/Makefile @@ -85,7 +85,7 @@ define assert-variables endef all: builddvr -debug: CC += -DDEBUG -g +debug: CC += -DDEBUG -g -pthread debug: DBUGVAL = DEBUG debug: builddvr builddvr: $(NAME).ko $(NAME).so.$(LIB_VER) @@ -139,6 +139,8 @@ install: $(NAME).so.$(LIB_VER) $(NAME).ko ln -sf /usr/local/lib/lib$(NAME).so.$(LIB_VER) /usr/local/lib/lib$(NAME).so ldconfig depmod + $(MAKE) unload + $(MAKE) load uninstall: rm -f /usr/local/lib/lib$(NAME).so* diff --git a/driver/linux/riffa.c b/driver/linux/riffa.c index 2868c07..91b9b9a 100644 --- a/driver/linux/riffa.c +++ b/driver/linux/riffa.c @@ -45,9 +45,21 @@ #include #include #include +#include #include #include "riffa.h" +struct thread_info { /* Used as argument to thread_start() */ + // please refer to API of fpga_send() and fpga_recv() at http://riffa.ucsd.edu/node/10 or https://github.com/KastnerRG/riffa/blob/master/driver/linux/riffa.c#L84-L111 + fpga_t * fpga; + unsigned int chnl; + unsigned int * buffer; + unsigned int len; + unsigned int 
offset; + unsigned int last; + long long timeout; +}; + struct fpga_t { int fd; @@ -81,33 +93,42 @@ void fpga_close(fpga_t * fpga) free(fpga); } -int fpga_send(fpga_t * fpga, int chnl, void * data, int len, int destoff, - int last, long long timeout) +//int fpga_send(fpga_t * fpga, int chnl, void * data, int len, int destoff, int last, long long timeout) +void* fpga_send(void *arg) { - fpga_chnl_io io; + struct thread_info *tinfo_send = (struct thread_info *) arg; + + fpga_chnl_io io_send; - io.id = fpga->id; - io.chnl = chnl; - io.len = len; - io.offset = destoff; - io.last = last; - io.timeout = timeout; - io.data = (char *)data; + io_send.id = tinfo_send->fpga->id; + io_send.chnl = tinfo_send->chnl; + io_send.len = tinfo_send->len; + io_send.offset = tinfo_send->offset; + io_send.last = tinfo_send->last; + io_send.timeout = tinfo_send->timeout; + io_send.data = (char *)(tinfo_send->buffer); - return ioctl(fpga->fd, IOCTL_SEND, &io); + int number_of_words_sent = ioctl(tinfo_send->fpga->fd, IOCTL_SEND, &io_send); + + return (void *)(intptr_t)number_of_words_sent; /* plain return: pthread_join() receives the same value, and a direct (non-thread) call no longer terminates the calling thread */ } -int fpga_recv(fpga_t * fpga, int chnl, void * data, int len, long long timeout) +//int fpga_recv(fpga_t * fpga, int chnl, void * data, int len, long long timeout) +void* fpga_recv(void *arg) { - fpga_chnl_io io; + struct thread_info *tinfo_recv = (struct thread_info *) arg; + + fpga_chnl_io io_recv; + + io_recv.id = tinfo_recv->fpga->id; + io_recv.chnl = tinfo_recv->chnl; + io_recv.len = tinfo_recv->len; + io_recv.timeout = tinfo_recv->timeout; + io_recv.data = (char *)(tinfo_recv->buffer); - io.id = fpga->id; - io.chnl = chnl; - io.len = len; - io.timeout = timeout; - io.data = (char *)data; + int number_of_words_recv = ioctl(tinfo_recv->fpga->fd, IOCTL_RECV, &io_recv); - return ioctl(fpga->fd, IOCTL_RECV, &io); + return (void *)(intptr_t)number_of_words_recv; /* plain return: pthread_join() receives the same value, and a direct (non-thread) call no longer terminates the calling thread */ } void fpga_reset(fpga_t * fpga) diff --git a/driver/linux/riffa.h b/driver/linux/riffa.h index 408b867..a8db183 100644 --- 
a/driver/linux/riffa.h +++ b/driver/linux/riffa.h @@ -85,8 +85,8 @@ void fpga_close(fpga_t * fpga); * On success, returns the number of words sent. On error returns a negative * value. */ -int fpga_send(fpga_t * fpga, int chnl, void * data, int len, int destoff, - int last, long long timeout); +//int fpga_send(fpga_t * fpga, int chnl, void * data, int len, int destoff, int last, long long timeout); +void* fpga_send(void * arg); /** * Receives data from the FPGA channel chnl to the data pointer, using the @@ -103,7 +103,8 @@ int fpga_send(fpga_t * fpga, int chnl, void * data, int len, int destoff, * On success, returns the number of words written to the data array. On error * returns a negative value. */ -int fpga_recv(fpga_t * fpga, int chnl, void * data, int len, long long timeout); +//int fpga_recv(fpga_t * fpga, int chnl, void * data, int len, long long timeout); +void* fpga_recv(void * arg); /** * Resets the state of the FPGA and all transfers across all channels. This is diff --git a/driver/linux/riffa_driver.c b/driver/linux/riffa_driver.c index dc4b86b..4b971fd 100644 --- a/driver/linux/riffa_driver.c +++ b/driver/linux/riffa_driver.c @@ -51,7 +51,13 @@ #include #include #include + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0) #include +#else +#include +#endif + #include #include #include @@ -125,6 +131,9 @@ static dev_t devt; static atomic_t used_fpgas[NUM_FPGAS]; static struct fpga_state * fpgas[NUM_FPGAS]; +static unsigned int tx_len; +static bool recv_sg_buf_populated; + /////////////////////////////////////////////////////// // MEMORY ALLOCATION & HELPER FUNCTIONS /////////////////////////////////////////////////////// @@ -312,17 +321,19 @@ static inline void process_intr_vector(struct fpga_state * sc, int off, // New TX (PC receive) transaction. 
if (vect & (1<<((5*i)+0))) { recv = 1; + recv_sg_buf_populated = 0; // resets for new transaction + // Read the offset/last and length offlast = read_reg(sc, CHNL_REG(chnl, TX_OFFLAST_REG_OFF)); - len = read_reg(sc, CHNL_REG(chnl, TX_LEN_REG_OFF)); + tx_len = read_reg(sc, CHNL_REG(chnl, TX_LEN_REG_OFF)); // Keep track of this transaction if (push_circ_queue(sc->recv[chnl]->msgs, EVENT_TXN_OFFLAST, offlast)) { printk(KERN_ERR "riffa: fpga:%d chnl:%d, recv txn offlast msg queue full\n", sc->id, chnl); } - if (push_circ_queue(sc->recv[chnl]->msgs, EVENT_TXN_LEN, len)) { + /*if (push_circ_queue(sc->recv[chnl]->msgs, EVENT_TXN_LEN, len)) { printk(KERN_ERR "riffa: fpga:%d chnl:%d, recv txn len msg queue full\n", sc->id, chnl); - } - DEBUG_MSG(KERN_INFO "riffa: fpga:%d chnl:%d, recv txn (len:%d off:%d last:%d)\n", sc->id, chnl, len, (offlast>>1), (offlast & 0x1)); + }*/ + DEBUG_MSG(KERN_INFO "riffa: fpga:%d chnl:%d, recv txn (len:%d off:%d last:%d)\n", sc->id, chnl, tx_len, (offlast>>1), (offlast & 0x1)); } // RX (PC send) scatter gather buffer is read. 
@@ -443,8 +454,10 @@ static inline struct sg_mapping * fill_sg_buf(struct fpga_state * sc, int chnl, down_read(&current->mm->mmap_sem); #if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) num_pages = get_user_pages(current, current->mm, udata, num_pages_reqd, 1, 0, pages, NULL); - #else + #elif LINUX_VERSION_CODE < KERNEL_VERSION(4,9,0) /* 4.6 up to (not including) 4.9: two-int flag form below */ num_pages = get_user_pages(udata, num_pages_reqd, 1, 0, pages, NULL); + #else + num_pages = get_user_pages(udata, num_pages_reqd, FOLL_WRITE, pages, NULL); #endif up_read(&current->mm->mmap_sem); if (num_pages <= 0) { @@ -610,24 +623,25 @@ static inline unsigned int chnl_recv(struct fpga_state * sc, int chnl, break; if (tymeout == 0) { printk(KERN_ERR "riffa: fpga:%d chnl:%d, recv timed out\n", sc->id, chnl); - free_sg_buf(sc, sc->recv[chnl]->sg_map_0); + /*free_sg_buf(sc, sc->recv[chnl]->sg_map_0); free_sg_buf(sc, sc->recv[chnl]->sg_map_1); - return (unsigned int)(recvd>>2); + return (unsigned int)(recvd>>2);*/ } } tymeout = tymeouto; - + DEBUG_MSG(KERN_INFO "msg_type: %d\n", msg_type); // added by cheng fei // Process the message. 
switch (msg_type) { case EVENT_TXN_OFFLAST: // Read the offset and last flags (always before reading length) offset = (((unsigned long long)(msg>>1))<<2); last = (msg & 0x1); - break; + //break; - case EVENT_TXN_LEN: + //case EVENT_TXN_LEN: // Read the length - length = (((unsigned long long)msg)<<2); + //length = (((unsigned long long)msg)<<2); + length = tx_len << 2; recvd = 0; overflow = 0; // Check for address overflow @@ -663,7 +677,13 @@ static inline unsigned int chnl_recv(struct fpga_state * sc, int chnl, write_reg(sc, CHNL_REG(chnl, TX_SG_ADDR_LO_REG_OFF), (sc->recv[chnl]->buf_hw_addr & 0xFFFFFFFF)); write_reg(sc, CHNL_REG(chnl, TX_SG_ADDR_HI_REG_OFF), ((sc->recv[chnl]->buf_hw_addr>>32) & 0xFFFFFFFF)); write_reg(sc, CHNL_REG(chnl, TX_SG_LEN_REG_OFF), 4 * sg_map->num_sg); + + recv_sg_buf_populated = 1; + DEBUG_MSG(KERN_INFO "riffa: fpga:%d chnl:%d, recv sg buf populated, %d sent\n", sc->id, chnl, sg_map->num_sg); + + wake_up(&sc->send[chnl]->waitq); // https://elixir.bootlin.com/linux/v4.19-rc7/source/include/linux/wait.h#L476 + // The @condition is checked each time the waitqueue @wq_head is woken up. wake_up() has to be called after changing any variable that could change the result of the wait condition. } break; @@ -698,6 +718,8 @@ static inline unsigned int chnl_recv(struct fpga_state * sc, int chnl, break; case EVENT_TXN_DONE: + recv_sg_buf_populated = 0; // resets recv sg buf parameters for next transaction. + // Ignore if we haven't received offlast/len. 
if (last == -1) break; @@ -807,6 +829,12 @@ static inline unsigned int chnl_send(struct fpga_state * sc, int chnl, length -= sg_map->length; sc->send[chnl]->sg_map_1 = sg_map; + if(tx_len > 0) { // FPGA initiates new Tx transaction, so "yield" to software chnl_recv() thread + + // gives time for software chnl_recv() thread to populate recv sg buf parameter + wait_event_interruptible_timeout(sc->send[chnl]->waitq, (recv_sg_buf_populated == 1), timeout); + } + // Let FPGA know about the scatter gather buffer. write_reg(sc, CHNL_REG(chnl, RX_SG_ADDR_LO_REG_OFF), (sc->send[chnl]->buf_hw_addr & 0xFFFFFFFF)); write_reg(sc, CHNL_REG(chnl, RX_SG_ADDR_HI_REG_OFF), ((sc->send[chnl]->buf_hw_addr>>32) & 0xFFFFFFFF)); @@ -830,9 +858,9 @@ static inline unsigned int chnl_send(struct fpga_state * sc, int chnl, break; if (tymeout == 0) { printk(KERN_ERR "riffa: fpga:%d chnl:%d, send timed out\n", sc->id, chnl); - free_sg_buf(sc, sc->send[chnl]->sg_map_0); + /*free_sg_buf(sc, sc->send[chnl]->sg_map_0); free_sg_buf(sc, sc->send[chnl]->sg_map_1); - return (unsigned int)(sent>>2); + return (unsigned int)(sent>>2);*/ } } tymeout = tymeouto; @@ -978,6 +1006,13 @@ static inline void reset(int id) for (i = 0; i < sc->num_chnls; ++i) { while (!pop_circ_queue(sc->send[i]->msgs, &dummy0, &dummy1)); while (!pop_circ_queue(sc->recv[i]->msgs, &dummy0, &dummy1)); + + // resets read and write pointers of the circular queue + atomic_set(&sc->recv[i]->msgs->writeIndex, 0); + atomic_set(&sc->recv[i]->msgs->readIndex, 0); + atomic_set(&sc->send[i]->msgs->writeIndex, 0); + atomic_set(&sc->send[i]->msgs->readIndex, 0); + wake_up(&sc->send[i]->waitq); wake_up(&sc->recv[i]->waitq); clear_bit(CHNL_FLAG_BUSY, &sc->send[i]->flags); @@ -1530,7 +1565,12 @@ static void __devexit fpga_remove(struct pci_dev *dev) // MODULE INIT/EXIT FUNCTIONS /////////////////////////////////////////////////////// -static DEFINE_PCI_DEVICE_TABLE(fpga_ids) = { +#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) +static 
DEFINE_PCI_DEVICE_TABLE(fpga_ids) = +#else +static const struct pci_device_id fpga_ids[] = +#endif +{ {PCI_DEVICE(VENDOR_ID0, PCI_ANY_ID)}, {PCI_DEVICE(VENDOR_ID1, PCI_ANY_ID)}, {0}, @@ -1598,4 +1638,3 @@ static void __exit fpga_exit(void) module_init(fpga_init); module_exit(fpga_exit); - diff --git a/driver/linux/riffa_driver.h b/driver/linux/riffa_driver.h index 255018c..cf366b2 100644 --- a/driver/linux/riffa_driver.h +++ b/driver/linux/riffa_driver.h @@ -47,7 +47,7 @@ #include -#define DBUG 1 +#define DEBUG 1 #ifdef DEBUG #define DEBUG_MSG(...) printk(__VA_ARGS__) diff --git a/fpga/riffa_hdl/chnl_tester.v b/fpga/riffa_hdl/chnl_tester.v index 4cd3543..c26b06e 100644 --- a/fpga/riffa_hdl/chnl_tester.v +++ b/fpga/riffa_hdl/chnl_tester.v @@ -51,6 +51,7 @@ module chnl_tester #( parameter C_PCI_DATA_WIDTH = 9'd32 ) ( + /*Signals to receive from PC via RIFFA core, transmit to FPGA via this chnl_tester module*/ input CLK, input RST, output CHNL_RX_CLK, @@ -63,6 +64,7 @@ module chnl_tester #( input CHNL_RX_DATA_VALID, output CHNL_RX_DATA_REN, + /*Signals to transmit to PC via RIFFA core, receive from FPGA via this chnl_tester module*/ output CHNL_TX_CLK, output CHNL_TX, input CHNL_TX_ACK, @@ -70,68 +72,125 @@ module chnl_tester #( output [31:0] CHNL_TX_LEN, output [30:0] CHNL_TX_OFF, output [C_PCI_DATA_WIDTH-1:0] CHNL_TX_DATA, - output CHNL_TX_DATA_VALID, + output reg CHNL_TX_DATA_VALID, input CHNL_TX_DATA_REN ); reg [C_PCI_DATA_WIDTH-1:0] rData={C_PCI_DATA_WIDTH{1'b0}}; -reg [31:0] rLen=0; +reg [C_PCI_DATA_WIDTH-1:0] data_reg={C_PCI_DATA_WIDTH{1'b0}}; +reg [C_PCI_DATA_WIDTH-1:0] tData={C_PCI_DATA_WIDTH{1'b0}}; + reg [31:0] rCount=0; -reg [1:0] rState=0; +reg [31:0] tCount=0; +reg [31:0] tCount_prev=0; +reg [1:0] rState=0; // Receiver states +reg [1:0] tState=0; // Transmitter states +reg TX_IN_PROGRESS = 0; assign CHNL_RX_CLK = CLK; assign CHNL_RX_ACK = (rState == 2'd1); -assign CHNL_RX_DATA_REN = (rState == 2'd1); +assign CHNL_RX_DATA_REN = (rState == 2'd1); assign 
CHNL_TX_CLK = CLK; -assign CHNL_TX = (rState == 2'd3); +assign CHNL_TX = (CHNL_RX && !CHNL_RX_DATA_VALID && (tCount_prev != CHNL_TX_LEN)) || (CHNL_RX_DATA_REN && CHNL_RX_DATA_VALID) || TX_IN_PROGRESS; // modify the CHNL_TX timing such that (assertion of both CHNL_TX_DATA_REN and CHNL_TX_DATA_VALID signals) are aligned "RIGHT AFTER (not until after all data had been received)" (assertion of both CHNL_RX_DATA_REN and CHNL_RX_DATA_VALID signals). Please refer to https://i.imgur.com/9a1AYiZ.png (Rx and Tx control signals are not overlapping) assign CHNL_TX_LAST = 1'd1; -assign CHNL_TX_LEN = rLen; // in words +assign CHNL_TX_LEN = CHNL_RX_LEN; // in words assign CHNL_TX_OFF = 0; -assign CHNL_TX_DATA = rData; -assign CHNL_TX_DATA_VALID = (rState == 2'd3); +assign CHNL_TX_DATA = tData; + + +always @(posedge CLK) begin -always @(posedge CLK or posedge RST) begin if (RST) begin - rLen <= #1 0; - rCount <= #1 0; rState <= #1 0; - rData <= #1 0; + rCount <= #1 0; end + else begin case (rState) - 2'd0: begin // Wait for start of RX, save length - if (CHNL_RX) begin - rLen <= #1 CHNL_RX_LEN; + 2'd0: begin // Wait for start of RX, save length + if (CHNL_RX) begin + rCount <= #1 0; + rState <= #1 2'd1; + end + end + + 2'd1: begin // Wait for last data in RX, save value + if (CHNL_RX_DATA_VALID) begin + rCount <= #1 rCount + (C_PCI_DATA_WIDTH/32); + end + + if (rCount >= CHNL_RX_LEN) begin + rState <= #1 2'd0; + rCount <= #1 0; + end + end + + default: begin + rState <= #1 2'd0; rCount <= #1 0; - rState <= #1 2'd1; end - end - 2'd1: begin // Wait for last data in RX, save value - if (CHNL_RX_DATA_VALID) begin - rData <= #1 CHNL_RX_DATA; - rCount <= #1 rCount + (C_PCI_DATA_WIDTH/32); + endcase + end +end + +reg rValid, valid_reg; + +always @(posedge CLK) begin // have to modify the logic flow for this always block for non-loopback case + + // for invalidating Tx data when CHNL_RX_DATA_VALID goes low + rValid <= CHNL_RX_DATA_VALID; + valid_reg <= rValid; + CHNL_TX_DATA_VALID <= 
valid_reg; + + // for timing synchronization of loopback between Rx and Tx due to three clock cycle delay incurred in https://github.com/KastnerRG/riffa/blob/master/fpga/riffa_hdl/tx_port_channel_gate_128.v#L148-L186 + rData <= CHNL_RX_DATA; + data_reg <= rData; + tData <= data_reg; +end + +always @(posedge CLK) tCount_prev <= tCount; + +always @(posedge CLK) begin + + if (RST) begin + tState <= #1 0; + tCount <= #1 0; + TX_IN_PROGRESS <= #1 0; + end + + else begin + case (tState) + + 2'd0: begin // Prepare for TX + if(CHNL_TX) begin // linux driver replied that it is ready for the first piece of data again after acknowledging it can receive the first piece of data (this piece of data is not consumed by linux driver yet until next state). Please refer to Tx timing diagram at http://riffa.ucsd.edu/node/3 + tState <= #1 2'd1; + tCount <= #1 0; //(C_PCI_DATA_WIDTH/32); + TX_IN_PROGRESS <=#1 1; // continues to assert "CHNL_TX" signal until the assertion of "CHNL_TX_ACK"signal + end end - if (rCount >= rLen) - rState <= #1 2'd2; - end - - 2'd2: begin // Prepare for TX - rCount <= #1 (C_PCI_DATA_WIDTH/32); - rState <= #1 2'd3; - end - - 2'd3: begin // Start TX with save length and data value - if (CHNL_TX_DATA_REN & CHNL_TX_DATA_VALID) begin - rData <= #1 {rCount + 4, rCount + 3, rCount + 2, rCount + 1}; - rCount <= #1 rCount + (C_PCI_DATA_WIDTH/32); - if (rCount >= rLen) - rState <= #1 2'd0; + + 2'd1: begin // Start TX with save length and data value + if (CHNL_TX_DATA_REN & CHNL_TX_DATA_VALID) begin + tCount <= #1 tCount + (C_PCI_DATA_WIDTH/32); + TX_IN_PROGRESS <= #1 1; // extends "CHNL_TX_DATA_VALID" asserted signal for another "CHNL_TX_LEN" clock cycles AFTER a single "CHNL_TX_ACK" positive pulse + end + + if (tCount >= CHNL_TX_LEN) begin + tState <= #1 2'd0; + tCount <= #1 0; + TX_IN_PROGRESS <= #1 0; + end end - end - + + default: begin + tState <= #1 2'd0; + tCount <= #1 0; + TX_IN_PROGRESS <= #1 0; + end + endcase end end diff --git 
a/fpga/riffa_hdl/tx_port_channel_gate_128.v b/fpga/riffa_hdl/tx_port_channel_gate_128.v index 42b4649..6a608ba 100644 --- a/fpga/riffa_hdl/tx_port_channel_gate_128.v +++ b/fpga/riffa_hdl/tx_port_channel_gate_128.v @@ -89,7 +89,7 @@ module tx_port_channel_gate_128 reg rOpen=0, _rOpen=0; assign CHNL_TX_ACK = rAck; - assign CHNL_TX_DATA_REN = (rOpen & !wFifoFull); // S_TXPORTGATE128_OPEN + assign CHNL_TX_DATA_REN = ((rState == `S_TXPORTGATE128_OPEN) && (!wFifoFull)); // (rOpen & !wFifoFull); // Buffer the input signals that come from outside the tx_port. always @ (posedge CHNL_CLK) begin