Keyboard shortcuts

Press or to navigate between chapters

Press S or / to search in the book

Press ? to show this help

Press Esc to hide this help

C++ Quick Reference

Common patterns for writing TAPA kernels. For full API details see C++ API.


Task structure

// Upper-level task: declare streams, invoke leaf tasks. No computation.
void Top(tapa::mmap<const float> in, tapa::mmap<float> out, uint64_t n) {
  tapa::stream<float, 16> q("q");
  tapa::task()
      .invoke(Load, in, n, q)
      .invoke(Store, q, out, n);
}

// Leaf task: contains all computation.
void Load(tapa::mmap<const float> mem, uint64_t n, tapa::ostream<float>& q) {
  for (uint64_t i = 0; i < n; ++i) q.write(mem[i]);
}

void Store(tapa::istream<float>& q, tapa::mmap<float> mem, uint64_t n) {
  for (uint64_t i = 0; i < n; ++i) mem[i] = q.read();
}

Host code

#include <gflags/gflags.h>
#include <tapa.h>

DEFINE_string(bitstream, "", "XO or xclbin path. Empty = software simulation.");

int main(int argc, char* argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  std::vector<float, tapa::aligned_allocator<float>> a(n), b(n);

  tapa::invoke(Top, FLAGS_bitstream,
               tapa::read_only_mmap<const float>(a),
               tapa::write_only_mmap<float>(b),
               (uint64_t)n);
}
FLAGS_bitstream valueBackend
(empty)Software simulation
kernel.xoFast cosimulation
kernel.hw.xclbinOn-board execution

Stream types

TypeUse in signatureDirection
tapa::stream<T, Depth>local variable in upper taskowner
tapa::istream<T>&leaf task parameterread only
tapa::ostream<T>&leaf task parameterwrite only
tapa::streams<T, N>local variablearray owner
tapa::istreams<T, N>&leaf task parameterarray read
tapa::ostreams<T, N>&leaf task parameterarray write
// Read
T val = in.read();             // blocking
bool ok = in.try_read(val);    // non-blocking, returns true on success

// Write
out.write(val);                // blocking
bool ok = out.try_write(val);  // non-blocking

// State checks
bool e = in.empty();
bool f = out.full();

// End-of-transaction
out.close();                   // send EoT marker
in.open();                     // consume EoT marker
TAPA_WHILE_NOT_EOT(in) { ... } // loop until EoT

Stream depth and FPGA resource:

DepthResource
< 128SRL shift-register (no BRAM)
≥ 128BRAM
≥ 4096 and element width ≥ 36 bURAM

Memory types

TypeSignatureAccess style
tapa::mmap<T>by valuesynchronous, pointer-like
tapa::async_mmap<T>by reference &decoupled AXI channels
// mmap — simple loop
for (int i = 0; i < n; ++i) out[i] = in[i];

// async_mmap — overlapping reads (two-counter loop)
for (int64_t i_req = 0, i_resp = 0; i_resp < n;) {
#pragma HLS pipeline II=1
  if (i_req < n) mem.read_addr.try_write(i_req++);
  T val;
  if (mem.read_data.try_read(val)) result[i_resp++] = val;
}

// async_mmap — writes with response drain
for (int64_t i_req = 0, i_resp = 0; i_resp < n;) {
#pragma HLS pipeline II=1
  if (i_req < n && !src.empty() &&
      !mem.write_addr.full() && !mem.write_data.full()) {
    mem.write_addr.try_write(i_req);
    mem.write_data.try_write(src.read(nullptr));
    ++i_req;
  }
  uint8_t ack;
  if (mem.write_resp.try_read(ack)) i_resp += unsigned(ack) + 1;
}

Parallel task instances

// Invoke N instances; each gets a unique index via tapa::seq
tapa::streams<float, 4> ch("ch");
tapa::task().invoke<tapa::join, 4>(Worker, ch, tapa::seq{});

void Worker(tapa::istream<float>& in, int idx) { /* ... */ }

Useful pragmas

#pragma HLS pipeline II=1      // pipeline loop with II=1
#pragma HLS unroll factor=4    // partially unroll loop

// C++ attribute equivalents
[[tapa::pipeline(1)]]
[[tapa::unroll(4)]]
[[tapa::target("ignore")]]     // mark task for custom RTL replacement

End-of-transaction macros

TAPA_WHILE_NOT_EOT(in)          { out.write(in.read(nullptr)); }
TAPA_WHILE_NEITHER_EOT(in1,in2) { /* both have data */ }
TAPA_WHILE_NONE_EOT(a, b, c)    { /* all three have data */ }

Build and run

# Software simulation
tapa g++ -- kernel.cpp host.cpp -o app
./app

# RTL synthesis
tapa compile --top Top --part-num xcu250-figd2104-2L-e \
  --clock-period 3.33 -f kernel.cpp -o kernel.xo

# Fast cosimulation
./app --bitstream=kernel.xo

# Bitstream link (v++)
v++ -o app.hw.xclbin --link --target hw --kernel Top \
  --platform xilinx_u250_gen3x16_xdma_4_1_202210_1 kernel.xo

# On-board run
./app --bitstream=app.hw.xclbin