Full (AXI-)stream ahead! – Using AXI-stream with floating point numbers in HLS
Tim Fernandez-Hart
A more technical blog this week. Recently, I had to build an IP core in a baremetal environment that outputs a set of floating point numbers. Crucially, the number of floats was not known at synthesis, which meant I could not use an AXI-full or AXI-lite interface. The only option was an AXI-stream interface. I looked around for an example project or tutorial but none quite fit the bill. They all involved integers with AXI-stream or else an AXI-full interface with floats. So I thought I would write one to fill in the gaps because both floats and AXI-stream have their quirks.
Why use a DMA?
Without a DMA, all the read and write operations would have to be performed by a CPU over the system bus. This is a blocking operation and ties the processor up performing menial read and write tasks, rather than the higher-level tasks a processor is better suited to. Offloading the DDR read/write operations to a DMA makes the system faster and more efficient.

Our IP block
We will make a simple proof-of-principle IP block using HLS. It will take two parameters: an integer n and a float f. The IP will then stream the given float f out n times. Here is the HLS code:
dma_test.h
#include <hls_stream.h>
#include <ap_fixed.h>
#include “ap_axi_sdata.h” // ← This is required for side-channels i.e. TLAST
// AXI-stream packet type used on the output port: a 32-bit data word plus the
// side-channel signals (keep, strb, last) that the function body fills in.
// The three zero template arguments are presumably the user/id/dest widths
// (i.e. those side-channels are unused) -- confirm against ap_axi_sdata.h.
typedef ap_axis<32,0 ,0 ,0> out_pkt;
// Overlay of an int and a float on the same storage. Writing `f` and then
// reading `i` yields the float's raw bit pattern, which is what gets placed
// on the integer-typed AXI-stream data field.
// Assumes int and float have the same size on the target.
union fp_int {
    int   i;  // raw bit-pattern view
    float f;  // floating-point view
};
// Top-level HLS function: writes the float `num` to the AXI-stream `output`
// `n` times, one packet per value, with the last packet flagged via TLAST.
//   n      - number of packets to emit
//   num    - value to send in every packet
//   output - AXI-stream the packets are pushed onto
void dmaFloatTransfer(int n, float num, hls::stream<out_pkt> &output);
dma_test.cpp
#include <hls_stream.h>
#include <ap_axi_sdata.h>
#include “dma_test.h”
// Streams the float `num` out of the IP `n` times over AXI-stream.
//
//   n      - number of 32-bit words to emit (set over the AXI-lite CTRL bundle).
//            If n <= 0 nothing is written and no TLAST is generated.
//   num    - value to send in every word (set over the AXI-lite CTRL bundle).
//   output - AXI-stream master port; each packet carries one float as raw bits.
//
// The stream element type must be `out_pkt` (the typedef from dma_test.h) so
// that this definition matches the prototype; `pkt` is only the name of the
// local packet variable, not a type.
void dmaFloatTransfer(int n, float num, hls::stream<out_pkt> &output) {
#pragma HLS INTERFACE axis port=output
#pragma HLS INTERFACE s_axilite port=n bundle=CTRL
#pragma HLS INTERFACE s_axilite port=num bundle=CTRL
#pragma HLS INTERFACE s_axilite port=return bundle=CTRL
    // The payload is identical for every beat, so build it once:
    // reinterpret the float as its 32-bit pattern for the integer data field.
    fp_int out_data;
    out_data.f = num;

    out_pkt pkt;
    pkt.data = out_data.i;
    pkt.strb = 0xf;  // all four bytes are data bytes
    pkt.keep = 0xf;  // all four bytes are valid

    for (int i = 0; i < n; i++) {
        // TLAST must be asserted on the final beat only, so the receiver
        // knows where the transfer ends.
        pkt.last = (i == (n - 1)) ? 1 : 0;
        output << pkt;
    }
}

Vivado
Let's configure the DMA. First, disable scatter gather. Scatter gather can be used to collect data from different memory addresses and uses a pool of reusable ring buffers, which makes things rather more complicated. We shall save scatter gather for another time, so disable it for now. Our IP is only a producer of data, so we also do not need the DMA's read channel (the one that reads from memory and streams data out to an IP); only the write channel is required.


#include <stdio.h>
#include “platform.h”
#include <xparameters.h>
#include “xdmafloattransfer.h”
#include “xaxidma.h”
/* Overlay of an int and a float on the same storage, used to move a float's
 * raw bit pattern through integer-typed interfaces and back again.
 * Assumes int and float have the same size on the target. */
union fp_int {
    int   i;  /* raw bit-pattern view */
    float f;  /* floating-point view */
};
/* Driver instance and configuration pointer for the dmaFloatTransfer IP core;
 * both are filled in by initPeripherals(). */
XDmafloattransfer dmaFloatTransfer;
XDmafloattransfer_Config *dmaFloatTransfer_cfg;
/* Driver instance and configuration pointer for the AXI DMA engine;
 * both are filled in by initPeripherals(). */
XAxiDma axiDMA;
XAxiDma_Config *axiDMA_cfg;
//DMA Addresses
/* RX_BUFFER_BASE (0x01300000) is the address main() hands to the DMA as the
 * destination of the receive transfer.
 * NOTE(review): assumes this range is DMA-reachable memory not used by the
 * program image, heap or stack -- confirm against the linker script. */
#define MEM_BASE_ADDR 0x01000000
#define RX_BUFFER_BASE (MEM_BASE_ADDR + 0x00300000)
/*
 * Looks up and initialises the dmaFloatTransfer IP core and the AXI DMA
 * driver instances (the file-scope globals), then disables all DMA interrupts
 * in both directions so the application can poll for completion.
 *
 * Failures are reported on stdout. If the DMA cannot be configured the
 * function returns before touching the uninitialised DMA instance.
 */
void initPeripherals()
{
    int status;

    printf("Initialising FloatTransferIP...\n");
    dmaFloatTransfer_cfg = XDmafloattransfer_LookupConfig(XPAR_DMAFLOATTRANSFER_0_DEVICE_ID);
    if (!dmaFloatTransfer_cfg)
    {
        /* Previously a failed lookup was silently ignored. */
        printf("Error: no configuration found for IP core\n");
    }
    else
    {
        status = XDmafloattransfer_CfgInitialize(&dmaFloatTransfer, dmaFloatTransfer_cfg);
        if (status != XST_SUCCESS)
        {
            printf("Error Initialising IP core\n");
        }
    }

    printf("Initialising DMA...\n");
    axiDMA_cfg = XAxiDma_LookupConfig(XPAR_AXI_DMA_0_DEVICE_ID);
    if (!axiDMA_cfg)
    {
        printf("Error: no configuration found for DMA core\n");
        return;
    }

    status = XAxiDma_CfgInitialize(&axiDMA, axiDMA_cfg);
    if (status != XST_SUCCESS)
    {
        printf("Error Initialising DMA core\n");
        return;
    }

    //Disable Interrupts
    XAxiDma_IntrDisable(&axiDMA, XAXIDMA_IRQ_ALL_MASK, XAXIDMA_DEVICE_TO_DMA);
    XAxiDma_IntrDisable(&axiDMA, XAXIDMA_IRQ_ALL_MASK, XAXIDMA_DMA_TO_DEVICE);
}
int main()
{
union fp_int {
int i;
float f;
} converter;
int *m_dma_buffer_RX = (int*) RX_BUFFER_BASE;
initPeripherals();
int n = 2;
float num = 3.2f;
XDmafloattransfer_Set_n(&dmaFloatTransfer, n);
XDmafloattransfer_Set_num(&dmaFloatTransfer, *((u32*)&num));
XDmafloattransfer_Start(&dmaFloatTransfer);
//Flush the cache of the buffers
Xil_DCacheFlushRange((u32)m_dma_buffer_RX, n*sizeof(u32));
printf(“n: %d\\\\n”, n);
printf(“Get The Data\\\\n”);
XAxiDma_SimpleTransfer(&axiDMA,(u32)m_dma_buffer_RX,n*sizeof(u32),XAXIDMA_DEVICE_TO_DMA);
while(XAxiDma_Busy(&axiDMA,XAXIDMA_DEVICE_TO_DMA));
printf(“XAxiDma no longer busy.\\\\n”);
//Invalidate
Xil_DCacheInvalidateRange((u32)m_dma_buffer_RX,n*sizeof(u32));
while(!XDmafloattransfer_IsDone(&dmaFloatTransfer));
printf(“Calculation is Complete\\\\n”);
//Display Data
for (int idx = 0; idx < n; idx++)
{
num = (int) m_dma_buffer_RX[idx];
converter.i = *((float*)&num);
printf(“Recv[%d]=%f\\\\n”, idx , converter.f);
}
return 0;
}