Refer to the TAPA GitHub repo for the full code of each example.
Mini Examples
Vector Add
// Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors.
// All rights reserved. The contributor(s) of this file has/have agreed to the
// RapidStream Contributor License Agreement.
#include <cstdint>
#include <tapa.h>
void Add(tapa::istream<float>& a, tapa::istream<float>& b,
tapa::ostream<float>& c, uint64_t n) {
for (uint64_t i = 0; i < n; ++i) {
c << (a.read() + b.read());
}
}
void Mmap2Stream(tapa::mmap<const float> mmap, uint64_t n,
tapa::ostream<float>& stream) {
for (uint64_t i = 0; i < n; ++i) {
stream << mmap[i];
}
}
void Stream2Mmap(tapa::istream<float>& stream, tapa::mmap<float> mmap,
uint64_t n) {
for (uint64_t i = 0; i < n; ++i) {
stream >> mmap[i];
}
}
void VecAdd(tapa::mmap<const float> a, tapa::mmap<const float> b,
tapa::mmap<float> c, uint64_t n) {
tapa::stream<float> a_q("a");
tapa::stream<float> b_q("b");
tapa::stream<float> c_q("c");
tapa::task()
.invoke(Mmap2Stream, a, n, a_q)
.invoke(Mmap2Stream, b, n, b_q)
.invoke(Add, a_q, b_q, c_q, n)
.invoke(Stream2Mmap, c_q, c, n);
}
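For reference, a host program launches this kernel through tapa::invoke, following the pattern used by TAPA host drivers. The sketch below is a minimal illustration only: the empty bitstream string (which selects software simulation) and the problem size are placeholders, and the full test bench lives next to the kernel in the repo.
// A minimal host-side sketch (not part of the kernel sources above).
#include <cstdint>
#include <vector>

#include <tapa.h>

void VecAdd(tapa::mmap<const float> a, tapa::mmap<const float> b,
            tapa::mmap<float> c, uint64_t n);

int main() {
  const uint64_t n = 1024;  // placeholder problem size
  std::vector<float> a(n, 1.f), b(n, 2.f), c(n, 0.f);
  tapa::invoke(VecAdd, /*bitstream=*/"",  // empty path -> software simulation
               tapa::read_only_mmap<const float>(a),
               tapa::read_only_mmap<const float>(b),
               tapa::write_only_mmap<float>(c), n);
  // c[i] should now equal a[i] + b[i] for all i.
  return 0;
}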
Vector Add with Multiple Task Hierarchy
// Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors.
// All rights reserved. The contributor(s) of this file has/have agreed to the
// RapidStream Contributor License Agreement.
#include <cstdint>
#include <tapa.h>
void Add(uint64_t n_int, tapa::istream<float>& a_int,
tapa::istream<float>& b_int, tapa::ostream<float>& c_int) {
float a, b;
bool a_succeed = false, b_succeed = false;
uint64_t read = 0;
[[tapa::pipeline(1)]] while (read < n_int) {
if (!a_succeed) {
a = a_int.read(a_succeed);
}
if (!b_succeed) {
b = b_int.read(b_succeed);
}
if (a_succeed && b_succeed) {
c_int.write(a + b);
a_succeed = b_succeed = false;
read += 1;
}
}
c_int.close();
// Clear the eot tokens.
a_int.open();
b_int.open();
}
void Compute(uint64_t n_ext, tapa::istream<float>& a_ext,
tapa::istream<float>& b_ext, tapa::ostream<float>& c_ext) {
tapa::task().invoke(Add, n_ext, a_ext, b_ext, c_ext);
}
void Mmap2Stream_internal(tapa::async_mmap<float>& mmap_int, uint64_t n_int,
tapa::ostream<float>& stream_int) {
[[tapa::pipeline(1)]] for (uint64_t rq_i = 0, rs_i = 0; rs_i < n_int;) {
float elem;
if (rq_i < n_int &&
rq_i < rs_i + 50 && // TODO: resolve the DRAM lock issue
mmap_int.read_addr.try_write(rq_i))
rq_i++;
if (mmap_int.read_data.try_read(elem)) {
stream_int.write(elem);
rs_i++;
}
}
stream_int.close();
}
void Mmap2Stream(tapa::mmap<float> mmap_ext, uint64_t n_ext,
tapa::ostream<float>& stream_ext) {
tapa::task().invoke(Mmap2Stream_internal, mmap_ext, n_ext, stream_ext);
}
void Load(tapa::mmap<float> a_array, tapa::mmap<float> b_array,
tapa::ostream<float>& a_stream, tapa::ostream<float>& b_stream,
uint64_t n) {
tapa::task()
.invoke(Mmap2Stream, a_array, n, a_stream)
.invoke(Mmap2Stream, b_array, n, b_stream);
}
void Store(tapa::istream<float>& stream, tapa::mmap<float> mmap, uint64_t n) {
for (uint64_t i = 0; i < n; ++i) {
mmap[i] = stream.read();
}
// Clear the eot token.
stream.open();
}
void VecAddNested(tapa::mmap<float> a_array, tapa::mmap<float> b_array,
tapa::mmap<float> c_array, uint64_t n) {
tapa::stream<float, 8> a_stream("a");
tapa::stream<float, 8> b_stream("b");
tapa::stream<float, 8> c_stream("c");
tapa::task()
.invoke(Load, a_array, b_array, a_stream, b_stream, n)
.invoke(Compute, n, a_stream, b_stream, c_stream)
.invoke(Store, c_stream, c_array, n);
}
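The Add and Store tasks above use TAPA's end-of-transaction (EoT) protocol: a producer appends an EoT token with close(), and a consumer that has read all of the data calls open() to pop the token (or detects it with try_eot() when the length is not known in advance). The sketch below, with hypothetical task names, shows the same handshake in isolation using only calls that already appear in these examples.
// Minimal EoT handshake sketch; EotProducer/EotConsumer are hypothetical names.
#include <cstdint>

#include <tapa.h>

void EotProducer(uint64_t n, tapa::ostream<float>& out) {
  for (uint64_t i = 0; i < n; ++i) out.write(float(i));
  out.close();  // append the EoT token so the consumer knows the stream ended
}

void EotConsumer(tapa::istream<float>& in, tapa::mmap<float> mem) {
  for (uint64_t i = 0;;) {
    bool eot;
    if (in.try_eot(eot)) {          // non-blocking check of the stream head
      if (eot) break;               // head is the EoT token: stop reading
      mem[i++] = in.read(nullptr);  // head is data: read it unchecked
    }
  }
  in.open();  // pop the EoT token so the stream can carry another transaction
}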
Bandwidth Test (using async_mmap)
// Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors.
// All rights reserved. The contributor(s) of this file has/have agreed to the
// RapidStream Contributor License Agreement.
#include <cstdint>
#include <iomanip>
#include <tapa.h>
#include "bandwidth.h"
#include "lfsr.h"
void Copy(tapa::async_mmap<Elem>& mem, uint64_t n, uint64_t flags) {
const bool random = flags & kRandom;
const bool read = flags & kRead;
const bool write = flags & kWrite;
if (!read && !write) return;
uint16_t mask = 0xffffu;
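// Shrink the mask so random addresses (lfsr & mask) stay within the n-element
// buffer: after this loop the mask covers the largest power-of-two range <= n.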
[[tapa::unroll]] //
for (int i = 16; i > 0; --i) {
if (n < (1ULL << i)) {
mask >>= 1;
}
}
Lfsr<16> lfsr_rd = 0xbeefu;
Lfsr<16> lfsr_wr = 0xbeefu;
Elem elem;
[[tapa::pipeline(1)]] //
for (uint64_t i_rd_req = 0, i_rd_resp = 0, i_wr_req = 0, i_wr_resp = 0;
write ? (i_wr_resp < n) : (i_rd_resp < n);) {
bool can_read = !mem.read_data.empty();
bool can_write = !mem.write_addr.full() && !mem.write_data.full();
int64_t read_addr = random ? uint64_t(lfsr_rd & mask) : i_rd_req;
int64_t write_addr = random ? uint64_t(lfsr_wr & mask) : i_wr_req;
if (read
// `i_rd_req < i_rd_resp + 50` is required for Vitis cosim on some
// platforms. Without it, cosim might end up stuck because the AXI
// interface from the Vitis platform never responds.
&& i_rd_req < i_rd_resp + 50 && i_rd_req < n &&
mem.read_addr.try_write(read_addr)) {
++i_rd_req;
++lfsr_rd;
VLOG(3) << "RD REQ [" << std::setw(5) << read_addr << "]";
}
if (read && can_read && (!write || can_write)) {
mem.read_data.try_read(elem);
++i_rd_resp;
VLOG(3) << "RD RSP #" << std::setw(5) << i_rd_resp - 1;
}
if (((read && can_read && write) || (!read && i_wr_req < n)) && can_write) {
mem.write_addr.write(write_addr);
mem.write_data.write(elem);
++i_wr_req;
++lfsr_wr;
VLOG(3) << "WR REQ [" << std::setw(5) << write_addr << "]";
}
if (write && !mem.write_resp.empty()) {
i_wr_resp += mem.write_resp.read(nullptr) + 1;
VLOG(3) << "WR RSP #" << std::setw(5) << i_wr_resp - 1;
}
}
}
void Bandwidth(tapa::mmaps<Elem, kBankCount> chan, uint64_t n, uint64_t flags) {
tapa::task().invoke<tapa::join, kBankCount>(Copy, chan, n, flags);
}
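The kernel above relies on bandwidth.h and lfsr.h from the same directory, which are not reproduced here. The following stand-in only sketches the kind of definitions Copy expects (a wide element type, flag bits, and a small LFSR); every width, value, and tap below is an assumption, so refer to the headers in the repo for the real definitions.
// Illustrative stand-in for bandwidth.h / lfsr.h; widths, values, and LFSR taps
// below are assumptions, not the repo's definitions.
#include <cstdint>

#include <tapa.h>

using Elem = tapa::vec_t<float, 16>;  // assumed: one wide element per AXI beat
constexpr int kBankCount = 4;         // assumed number of memory channels

// Flag bits decoded by Copy(); the bit assignment is an assumption.
constexpr uint64_t kRandom = 1 << 0;
constexpr uint64_t kRead = 1 << 1;
constexpr uint64_t kWrite = 1 << 2;

// Minimal 16-bit LFSR sketch supporting the operations Copy() uses
// (initialization from a seed, operator&, and prefix ++).
template <int kWidth>
struct Lfsr {
  uint16_t state;  // fixed at 16 bits for this sketch
  Lfsr(uint16_t seed) : state(seed) {}
  uint16_t operator&(uint16_t mask) const { return state & mask; }
  Lfsr& operator++() {  // advance one step; the taps here are arbitrary
    uint16_t bit =
        ((state >> 0) ^ (state >> 2) ^ (state >> 3) ^ (state >> 5)) & 1u;
    state = static_cast<uint16_t>((state >> 1) | (bit << (kWidth - 1)));
    return *this;
  }
};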
Network
// Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors.
// All rights reserved. The contributor(s) of this file has/have agreed to the
// RapidStream Contributor License Agreement.
#include <cstdint>
#include <tapa.h>
using tapa::detach;
using tapa::istream;
using tapa::istreams;
using tapa::mmap;
using tapa::ostreams;
using tapa::streams;
using tapa::task;
using tapa::vec_t;
using pkt_t = uint64_t;
constexpr int kN = 8; // kN x kN network
constexpr int kStageCount = 3; // log2(kN)
void Switch2x2(int b, istream<pkt_t>& pkt_in_q0, istream<pkt_t>& pkt_in_q1,
ostreams<pkt_t, 2>& pkt_out_q) {
uint8_t priority = 0;
b = kStageCount - 1 - b;
[[tapa::pipeline(1)]] for (bool valid_0, valid_1;;) {
#pragma HLS latency max = 0
auto pkt_0 = pkt_in_q0.peek(valid_0);
auto pkt_1 = pkt_in_q1.peek(valid_1);
bool fwd_0_0 = valid_0 && (pkt_0 & (1 << b)) == 0;
bool fwd_0_1 = valid_0 && (pkt_0 & (1 << b)) != 0;
bool fwd_1_0 = valid_1 && (pkt_1 & (1 << b)) == 0;
bool fwd_1_1 = valid_1 && (pkt_1 & (1 << b)) != 0;
bool conflict =
valid_0 && valid_1 && fwd_0_0 == fwd_1_0 && fwd_0_1 == fwd_1_1;
bool prioritize_1 = priority & 1;
bool read_0 = !((!fwd_0_0 && !fwd_0_1) || (prioritize_1 && conflict));
bool read_1 = !((!fwd_1_0 && !fwd_1_1) || (!prioritize_1 && conflict));
bool write_0 = fwd_0_0 || fwd_1_0;
bool write_1 = fwd_1_1 || fwd_0_1;
bool write_0_0 = fwd_0_0 && (!fwd_1_0 || !prioritize_1);
bool write_1_1 = fwd_1_1 && (!fwd_0_1 || prioritize_1);
// if can forward through (0->0 or 1->1), do it
// otherwise, check for conflict
const bool written_0 =
write_0 && pkt_out_q[0].try_write(write_0_0 ? pkt_0 : pkt_1);
const bool written_1 =
write_1 && pkt_out_q[1].try_write(write_1_1 ? pkt_1 : pkt_0);
// if can forward through (0->0 or 1->1), do it
// otherwise, round robin priority of both ins
if (read_0 && (write_0_0 ? written_0 : written_1)) {
pkt_in_q0.read(nullptr);
}
if (read_1 && (write_1_1 ? written_1 : written_0)) {
pkt_in_q1.read(nullptr);
}
if (conflict) ++priority;
}
}
void InnerStage(int b, istreams<pkt_t, kN / 2>& in_q0,
istreams<pkt_t, kN / 2>& in_q1, ostreams<pkt_t, kN> out_q) {
task().invoke<detach, kN / 2>(Switch2x2, b, in_q0, in_q1, out_q);
}
void Stage(int b, istreams<pkt_t, kN>& in_q, ostreams<pkt_t, kN> out_q) {
task().invoke<detach>(InnerStage, b, in_q, in_q, out_q);
}
void Produce(mmap<vec_t<pkt_t, kN>> mmap_in, uint64_t n,
ostreams<pkt_t, kN>& out_q) {
produce:
[[tapa::pipeline(1)]] for (uint64_t i = 0; i < n; ++i) {
auto buf = mmap_in[i];
for (int j = 0; j < kN; ++j) {
out_q[j].write(buf[j]);
}
}
}
void Consume(mmap<vec_t<pkt_t, kN>> mmap_out, uint64_t n,
istreams<pkt_t, kN> in_q) {
consume:
[[tapa::pipeline(1)]] for (uint64_t i = 0; i < n; ++i) {
vec_t<pkt_t, kN> buf;
for (int j = 0; j < kN; ++j) {
buf.set(j, in_q[j].read());
CHECK_EQ(buf[j] % kN, j);
}
mmap_out[i] = buf;
}
}
void Network(mmap<vec_t<pkt_t, kN>> mmap_in, mmap<vec_t<pkt_t, kN>> mmap_out,
uint64_t n) {
streams<pkt_t, kN*(kStageCount + 1), 4096> qs("qs");
task()
.invoke(Produce, mmap_in, n, qs)
.invoke<tapa::join, kStageCount>(Stage, tapa::seq(), qs, qs)
.invoke(Consume, mmap_out, n, qs);
}
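Routing in this network is carried by the packet value itself: stage s inspects bit kStageCount - 1 - s, so the low log2(kN) bits of each pkt_t encode the destination port, which is what the CHECK_EQ(buf[j] % kN, j) in Consume verifies. A hypothetical host-side sketch of one way to encode such packets follows; the payload layout is an assumption for illustration only.
// Host-side packet-encoding sketch (hypothetical layout, not from the repo):
// the low log2(kN) bits carry the destination port; the rest is payload.
#include <cstdint>
#include <vector>

using pkt_t = uint64_t;
constexpr int kN = 8;  // must match the kernel

std::vector<pkt_t> MakePackets(uint64_t n_vecs) {
  std::vector<pkt_t> pkts;
  pkts.reserve(n_vecs * kN);
  for (uint64_t i = 0; i < n_vecs; ++i) {
    for (int j = 0; j < kN; ++j) {         // j = input port the packet enters on
      pkt_t dst = (j + i) % kN;            // vary the destination per vector
      pkt_t payload = i;                   // arbitrary payload for illustration
      pkts.push_back(payload * kN + dst);  // pkt % kN recovers the destination
    }
  }
  return pkts;
}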
Cannon
// Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors.
// All rights reserved. The contributor(s) of this file has/have agreed to the
// RapidStream Contributor License Agreement.
#include <cassert>
#include <cstdint>
#include <tapa.h>
// p x p PEs
const int p = 2;
// Handles kN x kN matrices maximum.
const int kN = 32; // Use fixed value for efficient hardware generation.
// Scatter an n x n matrix into p x p blocks, sending one block per stream.
void Scatter(tapa::mmap<const float> matrix_ptr, tapa::ostream<float>& block_00,
tapa::ostream<float>& block_01, tapa::ostream<float>& block_10,
tapa::ostream<float>& block_11) {
const uint64_t kNumElems = (kN / p) * (kN / p);
for (uint64_t i = 0; i < kNumElems; ++i) {
block_00.write(*matrix_ptr);
++matrix_ptr;
}
for (uint64_t i = 0; i < kNumElems; ++i) {
block_01.write(*matrix_ptr);
++matrix_ptr;
}
for (uint64_t i = 0; i < kNumElems; ++i) {
block_10.write(*matrix_ptr);
++matrix_ptr;
}
for (uint64_t i = 0; i < kNumElems; ++i) {
block_11.write(*matrix_ptr);
++matrix_ptr;
}
}
void Gather(tapa::mmap<float> matrix_ptr, tapa::istream<float>& block_00,
tapa::istream<float>& block_01, tapa::istream<float>& block_10,
tapa::istream<float>& block_11) {
const uint64_t kNumElems = (kN / p) * (kN / p);
for (uint64_t i = 0; i < kNumElems; ++i) {
*matrix_ptr = block_00.read();
++matrix_ptr;
}
for (uint64_t i = 0; i < kNumElems; ++i) {
*matrix_ptr = block_01.read();
++matrix_ptr;
}
for (uint64_t i = 0; i < kNumElems; ++i) {
*matrix_ptr = block_10.read();
++matrix_ptr;
}
for (uint64_t i = 0; i < kNumElems; ++i) {
*matrix_ptr = block_11.read();
++matrix_ptr;
}
}
// Each PE processes an (n/p) x (n/p) block of the matrix.
void ProcElem(tapa::istream<float>& a_fifo, tapa::istream<float>& b_fifo,
tapa::ostream<float>& c_fifo, tapa::ostream<float>& i_prev,
tapa::istream<float>& i_next, tapa::ostream<float>& j_prev,
tapa::istream<float>& j_next) {
const uint64_t kNumElems = (kN / p) * (kN / p);
float a[kN / p * kN / p];
float b[kN / p * kN / p];
float c[kN / p * kN / p];
#pragma HLS array_partition variable = a cyclic factor = 32
#pragma HLS array_partition variable = b block factor = 32
#pragma HLS array_partition variable = c cyclic factor = 32
// Initialize local a, b, and c.
for (uint64_t i = 0; i < kNumElems; ++i) {
a[i] = a_fifo.read();
b[i] = b_fifo.read();
c[i] = 0.f;
}
for (int l = 0; l < p; ++l) {
[[tapa::pipeline(1)]] for (int ij = 0; ij < kNumElems; ++ij) {
#pragma HLS dependence false variable = c
float tmp = 0.f;
const int i = ij / (kN / p);
const int j = ij % (kN / p);
for (int k = 0; k < kN / p; ++k) {
tmp += a[i * (kN / p) + k] * b[k * (kN / p) + j];
}
c[ij] += tmp;
}
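// Cannon rotation: stream the local B block to the column neighbor (i_prev) and
// the local A block to the row neighbor (j_prev), while refilling both arrays
// in place from i_next / j_next; reads trail writes so entries are forwarded
// before they are overwritten.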
[[tapa::pipeline(1)]] for (uint64_t a_wr = 0, b_wr = 0, a_rd = 0, b_rd = 0;
a_wr < kNumElems || b_wr < kNumElems ||
a_rd < kNumElems || b_rd < kNumElems;) {
#pragma HLS loop_tripcount min = kNumElems max = kNumElems
#pragma HLS dependence false variable = a
#pragma HLS dependence false variable = b
if (b_wr < kNumElems && i_prev.try_write(b[b_wr])) ++b_wr;
if (a_wr < kNumElems && j_prev.try_write(a[a_wr])) ++a_wr;
if (b_rd < b_wr && i_next.try_read(b[b_rd])) ++b_rd;
if (a_rd < a_wr && j_next.try_read(a[a_rd])) ++a_rd;
}
}
for (uint64_t i = 0; i < kNumElems; ++i) {
c_fifo.write(c[i]);
}
}
void Cannon(tapa::mmap<const float> a_vec, tapa::mmap<const float> b_vec,
tapa::mmap<float> c_vec, uint64_t n) {
assert(kN % p == 0);
assert(n <= kN);
tapa::stream<float, 2> a_00("a->PE00");
tapa::stream<float, 2> a_01("a->PE01");
tapa::stream<float, 2> a_10("a->PE10");
tapa::stream<float, 2> a_11("a->PE11");
tapa::stream<float, 2> b_00("b->PE00");
tapa::stream<float, 2> b_01("b->PE01");
tapa::stream<float, 2> b_10("b->PE10");
tapa::stream<float, 2> b_11("b->PE11");
tapa::stream<float, 2> c_00("c->PE00");
tapa::stream<float, 2> c_01("c->PE01");
tapa::stream<float, 2> c_10("c->PE10");
tapa::stream<float, 2> c_11("c->PE11");
tapa::stream<float, 8> fifo_00_01("PE00->PE01");
tapa::stream<float, 8> fifo_01_00("PE01->PE00");
tapa::stream<float, 8> fifo_10_11("PE10->PE11");
tapa::stream<float, 8> fifo_11_10("PE11->PE10");
tapa::stream<float, 8> fifo_00_10("PE00->PE10");
tapa::stream<float, 8> fifo_10_00("PE10->PE00");
tapa::stream<float, 8> fifo_01_11("PE01->PE11");
tapa::stream<float, 8> fifo_11_01("PE11->PE01");
tapa::task()
.invoke(Scatter, a_vec, a_00, a_01, a_10, a_11)
.invoke(Scatter, b_vec, b_00, b_01, b_10, b_11)
.invoke(ProcElem, a_00, b_00, c_00, fifo_00_10, fifo_10_00, fifo_00_01,
fifo_01_00)
.invoke(ProcElem, a_01, b_01, c_01, fifo_01_11, fifo_11_01, fifo_01_00,
fifo_00_01)
.invoke(ProcElem, a_10, b_10, c_10, fifo_10_00, fifo_00_10, fifo_10_11,
fifo_11_10)
.invoke(ProcElem, a_11, b_11, c_11, fifo_11_01, fifo_01_11, fifo_11_10,
fifo_10_11)
.invoke(Gather, c_vec, c_00, c_01, c_10, c_11);
}
Jacobi
// Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors.
// All rights reserved. The contributor(s) of this file has/have agreed to the
// RapidStream Contributor License Agreement.
#include <tapa.h>
void Mmap2Stream(tapa::mmap<const float> mmap, uint64_t n,
tapa::ostream<tapa::vec_t<float, 2>>& stream) {
[[tapa::pipeline(2)]] for (uint64_t i = 0; i < n; ++i) {
tapa::vec_t<float, 2> tmp;
tmp.set(0, mmap[i * 2]);
tmp.set(1, mmap[i * 2 + 1]);
stream.write(tmp);
}
stream.close();
}
void Stream2Mmap(tapa::istream<tapa::vec_t<float, 2>>& stream,
tapa::mmap<float> mmap) {
[[tapa::pipeline(2)]] for (uint64_t i = 0;;) {
bool eot;
if (stream.try_eot(eot)) {
if (eot) break;
auto packed = stream.read(nullptr);
mmap[i * 2] = packed[0];
mmap[i * 2 + 1] = packed[1];
++i;
}
}
}
void Module0Func(tapa::ostream<float>& fifo_st_0,
tapa::ostream<float>& fifo_st_1,
tapa::istream<tapa::vec_t<float, 2>>& dram_t1_bank_0_fifo) {
module_0_epoch:
[[tapa::pipeline(1)]] TAPA_WHILE_NOT_EOT(dram_t1_bank_0_fifo) {
auto dram_t1_bank_0_buf = dram_t1_bank_0_fifo.read(nullptr);
fifo_st_0.write(dram_t1_bank_0_buf[1]);
fifo_st_1.write(dram_t1_bank_0_buf[0]);
}
fifo_st_0.close();
fifo_st_1.close();
}
void Module1Func(tapa::ostream<float>& fifo_st_0,
tapa::ostream<float>& fifo_st_1,
tapa::istream<float>& fifo_ld_0) {
module_1_epoch:
[[tapa::pipeline(1)]] TAPA_WHILE_NOT_EOT(fifo_ld_0) {
auto fifo_ref_0 = fifo_ld_0.read(nullptr);
fifo_st_0.write(fifo_ref_0);
fifo_st_1.write(fifo_ref_0);
}
fifo_st_0.close();
fifo_st_1.close();
}
void Module3Func1(tapa::ostream<float>& fifo_st_0,
tapa::istream<float>& fifo_ld_0,
tapa::istream<float>& fifo_ld_1) {
const int delay_0 = 50;
int count = 0;
module_3_1_epoch:
[[tapa::pipeline(1)]] TAPA_WHILE_NEITHER_EOT(fifo_ld_0, fifo_ld_1) {
float fifo_ref_0 = 0.f;
bool do_ld_0 = count >= delay_0;
if (do_ld_0) {
fifo_ref_0 = fifo_ld_0.read(nullptr);
}
float fifo_ref_1 = fifo_ld_1.read(nullptr);
fifo_st_0.write(fifo_ref_0 + fifo_ref_1);
if (!do_ld_0) {
++count;
}
}
fifo_st_0.close();
}
void Module3Func2(tapa::ostream<float>& fifo_st_0,
tapa::istream<float>& fifo_ld_0,
tapa::istream<float>& fifo_ld_1) {
const int delay_0 = 51;
int count = 0;
module_3_2_epoch:
[[tapa::pipeline(1)]] TAPA_WHILE_NEITHER_EOT(fifo_ld_0, fifo_ld_1) {
float fifo_ref_0 = 0.f;
bool do_ld_0 = count >= delay_0;
if (do_ld_0) {
fifo_ref_0 = fifo_ld_0.read(nullptr);
}
float fifo_ref_1 = fifo_ld_1.read(nullptr);
fifo_st_0.write(fifo_ref_0 + fifo_ref_1);
if (!do_ld_0) {
++count;
}
}
fifo_st_0.close();
}
void Module6Func1(tapa::ostream<float>& fifo_st_0,
tapa::istream<float>& fifo_ld_0,
tapa::istream<float>& fifo_ld_1,
tapa::istream<float>& fifo_ld_2) {
const int delay_0 = 50;
const int delay_2 = 50;
int count = 0;
module_6_1_epoch:
[[tapa::pipeline(1)]] TAPA_WHILE_NONE_EOT(fifo_ld_0, fifo_ld_1, fifo_ld_2) {
float fifo_ref_0 = 0.f;
bool do_ld_0 = count >= delay_0;
if (do_ld_0) {
fifo_ref_0 = fifo_ld_0.read(nullptr);
}
auto fifo_ref_1 = fifo_ld_1.read(nullptr);
float fifo_ref_2 = 0.f;
bool do_ld_2 = count >= delay_2;
if (do_ld_2) {
fifo_ref_2 = fifo_ld_2.read(nullptr);
}
fifo_st_0.write((fifo_ref_0 + fifo_ref_1 + fifo_ref_2) * 0.2f);
if (!do_ld_0 || !do_ld_2) {
++count;
}
}
fifo_st_0.close();
}
void Module6Func2(tapa::ostream<float>& fifo_st_0,
tapa::istream<float>& fifo_ld_0,
tapa::istream<float>& fifo_ld_1,
tapa::istream<float>& fifo_ld_2) {
const int delay_0 = 49;
const int delay_2 = 50;
int count = 0;
module_6_2_epoch:
[[tapa::pipeline(1)]] TAPA_WHILE_NONE_EOT(fifo_ld_0, fifo_ld_1, fifo_ld_2) {
float fifo_ref_0 = 0.f;
bool do_ld_0 = count >= delay_0;
if (do_ld_0) {
fifo_ref_0 = fifo_ld_0.read(nullptr);
}
auto fifo_ref_1 = fifo_ld_1.read(nullptr);
float fifo_ref_2 = 0.f;
bool do_ld_2 = count >= delay_2;
if (do_ld_2) {
fifo_ref_2 = fifo_ld_2.read(nullptr);
}
fifo_st_0.write((fifo_ref_0 + fifo_ref_1 + fifo_ref_2) * 0.2f);
if (!do_ld_0 || !do_ld_2) {
++count;
}
}
fifo_st_0.close();
}
void Module8Func(tapa::ostream<tapa::vec_t<float, 2>>& dram_t0_bank_0_fifo,
tapa::istream<float>& fifo_ld_0,
tapa::istream<float>& fifo_ld_1) {
module_8_epoch:
[[tapa::pipeline(1)]] TAPA_WHILE_NEITHER_EOT(fifo_ld_0, fifo_ld_1) {
tapa::vec_t<float, 2> tmp;
tmp.set(0, fifo_ld_0.read(nullptr));
tmp.set(1, fifo_ld_1.read(nullptr));
dram_t0_bank_0_fifo.write(tmp);
}
dram_t0_bank_0_fifo.close();
}
void Jacobi(tapa::mmap<float> bank_0_t0, tapa::mmap<const float> bank_0_t1,
uint64_t coalesced_data_num) {
tapa::stream<tapa::vec_t<float, 2>, 32> bank_0_t1_buf("bank_0_t1_buf");
tapa::stream<tapa::vec_t<float, 2>, 32> bank_0_t0_buf("bank_0_t0_buf");
tapa::stream<float, 2> from_super_source_to_t1_offset_0(
"from_super_source_to_t1_offset_0");
tapa::stream<float, 2> from_super_source_to_t1_offset_1(
"from_super_source_to_t1_offset_1");
tapa::stream<float, 2> from_t1_offset_0_to_t1_offset_2000(
"from_t1_offset_0_to_t1_offset_2000");
tapa::stream<float, 4> from_t1_offset_0_to_tcse_var_0_pe_1(
"from_t1_offset_0_to_tcse_var_0_pe_1");
tapa::stream<float, 2> from_t1_offset_1_to_t1_offset_2001(
"from_t1_offset_1_to_t1_offset_2001");
tapa::stream<float, 6> from_t1_offset_1_to_tcse_var_0_pe_0(
"from_t1_offset_1_to_tcse_var_0_pe_0");
tapa::stream<float, 58> from_t1_offset_2000_to_t0_pe_1(
"from_t1_offset_2000_to_t0_pe_1");
tapa::stream<float, 52> from_t1_offset_2001_to_tcse_var_0_pe_1(
"from_t1_offset_2001_to_tcse_var_0_pe_1");
tapa::stream<float, 56> from_t1_offset_2001_to_t0_pe_0(
"from_t1_offset_2001_to_t0_pe_0");
tapa::stream<float, 2> from_tcse_var_0_pe_1_to_tcse_var_0_offset_0(
"from_tcse_var_0_pe_1_to_tcse_var_0_offset_0");
tapa::stream<float, 53> from_t1_offset_2000_to_tcse_var_0_pe_0(
"from_t1_offset_2000_to_tcse_var_0_pe_0");
tapa::stream<float, 2> from_tcse_var_0_pe_0_to_tcse_var_0_offset_1(
"from_tcse_var_0_pe_0_to_tcse_var_0_offset_1");
tapa::stream<float, 6> from_tcse_var_0_offset_0_to_t0_pe_1(
"from_tcse_var_0_offset_0_to_t0_pe_1");
tapa::stream<float, 2> from_tcse_var_0_offset_1_to_t0_pe_0(
"from_tcse_var_0_offset_1_to_t0_pe_0");
tapa::stream<float, 52> from_tcse_var_0_offset_0_to_t0_pe_0(
"from_tcse_var_0_offset_0_to_t0_pe_0");
tapa::stream<float, 4> from_t0_pe_0_to_super_sink(
"from_t0_pe_0_to_super_sink");
tapa::stream<float, 51> from_tcse_var_0_offset_1_to_t0_pe_1(
"from_tcse_var_0_offset_1_to_t0_pe_1");
tapa::stream<float, 2> from_t0_pe_1_to_super_sink(
"from_t0_pe_1_to_super_sink");
tapa::task()
.invoke(Mmap2Stream, "Mmap2Stream", bank_0_t1, coalesced_data_num,
bank_0_t1_buf)
.invoke(Module0Func, "Module0Func",
/*output*/ from_super_source_to_t1_offset_0,
/*output*/ from_super_source_to_t1_offset_1,
/* input*/ bank_0_t1_buf)
.invoke(Module1Func, "Module1Func#1",
/*output*/ from_t1_offset_0_to_t1_offset_2000,
/*output*/ from_t1_offset_0_to_tcse_var_0_pe_1,
/* input*/ from_super_source_to_t1_offset_0)
.invoke(Module1Func, "Module1Func#2",
/*output*/ from_t1_offset_1_to_t1_offset_2001,
/*output*/ from_t1_offset_1_to_tcse_var_0_pe_0,
/* input*/ from_super_source_to_t1_offset_1)
.invoke(Module1Func, "Module2Func#1",
/*output*/ from_t1_offset_2000_to_tcse_var_0_pe_0,
/*output*/ from_t1_offset_2000_to_t0_pe_1,
/* input*/ from_t1_offset_0_to_t1_offset_2000)
.invoke(Module1Func, "Module2Func#2",
/*output*/ from_t1_offset_2001_to_tcse_var_0_pe_1,
/*output*/ from_t1_offset_2001_to_t0_pe_0,
/* input*/ from_t1_offset_1_to_t1_offset_2001)
.invoke(Module3Func1, "Module3Func#1",
/*output*/ from_tcse_var_0_pe_1_to_tcse_var_0_offset_0,
/* input*/ from_t1_offset_2001_to_tcse_var_0_pe_1,
/* input*/ from_t1_offset_0_to_tcse_var_0_pe_1)
.invoke(Module3Func2, "Module3Func#2",
/*output*/ from_tcse_var_0_pe_0_to_tcse_var_0_offset_1,
/* input*/ from_t1_offset_2000_to_tcse_var_0_pe_0,
/* input*/ from_t1_offset_1_to_tcse_var_0_pe_0)
.invoke(Module1Func, "Module1Func#3",
/*output*/ from_tcse_var_0_offset_0_to_t0_pe_0,
/*output*/ from_tcse_var_0_offset_0_to_t0_pe_1,
/* input*/ from_tcse_var_0_pe_1_to_tcse_var_0_offset_0)
.invoke(Module1Func, "Module1Func#4",
/*output*/ from_tcse_var_0_offset_1_to_t0_pe_1,
/*output*/ from_tcse_var_0_offset_1_to_t0_pe_0,
/* input*/ from_tcse_var_0_pe_0_to_tcse_var_0_offset_1)
.invoke(Module6Func1, "Module6Func#1",
/*output*/ from_t0_pe_0_to_super_sink,
/* input*/ from_tcse_var_0_offset_0_to_t0_pe_0,
/* input*/ from_tcse_var_0_offset_1_to_t0_pe_0,
/* input*/ from_t1_offset_2001_to_t0_pe_0)
.invoke(Module6Func2, "Module6Func#2",
/*output*/ from_t0_pe_1_to_super_sink,
/* input*/ from_tcse_var_0_offset_1_to_t0_pe_1,
/* input*/ from_tcse_var_0_offset_0_to_t0_pe_1,
/* input*/ from_t1_offset_2000_to_t0_pe_1)
.invoke(Module8Func, "Module8Func",
/*output*/ bank_0_t0_buf,
/* input*/ from_t0_pe_0_to_super_sink,
/* input*/ from_t0_pe_1_to_super_sink)
.invoke(Stream2Mmap, "Stream2Mmap", bank_0_t0_buf, bank_0_t0);
}
Real-World Examples
The TAPA repo also includes a set of large-scale designs under the tests/regression directory. This directory is under active development, and we keep adding more sophisticated TAPA designs to it.
- cnn and lu_decomposition are both systolic arrays (of different shapes) originally published in FPGA’21.
- hbm-bandwidth can be used to profile HBM bandwidth. It reads from and writes back to all 32 channels of HBM and is a good demonstration of the expressiveness and the optimized area of async_mmap.
- hbm-bandwidth-1-ch only reads from and writes to one HBM channel.
- serpens is a sparse matrix-vector multiplication accelerator published in DAC’22. We provide different versions that share the same architecture but differ in parallelism.
- spmm is a sparse matrix-matrix multiplication accelerator published in FPGA’22.
- spmv-hisparse-mmap is another sparse matrix-vector multiplication accelerator, published as part of HiSparse at FPGA’22.
- knn is a K-nearest-neighbor accelerator originally published in FPT’20.
- page_rank is an accelerator for the PageRank algorithm published in FCCM’21.