From da9726aa93b3aaa90a65e03d30742f6262d760c0 Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Thu, 12 Feb 2026 22:48:55 +0100 Subject: [PATCH 1/9] dpdk: backport rte_hash RCU fixes When rte_rcu_qsbr_dq_enqueue() fails in DQ mode, the deleted key slot is never freed and becomes permanently leaked. Also, when rte_hash_add_key_data() overwrites an existing key, the old data pointer is silently lost. With RCU-protected readers still potentially accessing the old data, there is no safe way to free it. Add two patches from an upstream series [1]: - Fall back to synchronous reclamation instead of only logging an error when the RCU defer queue enqueue fails on key deletion. - When RCU is configured with a free_key_data_func callback, automatically defer-free the old data pointer on overwrite. The third patch from that series (adding a new rte_hash_replace API) is not needed since the free_key_data_func callback is sufficient. [1] https://patches.dpdk.org/project/dpdk/list/?series=37352 Signed-off-by: Robin Jarry Reviewed-by: Christophe Fontaine --- subprojects/dpdk.wrap | 5 +- ...ng-entries-on-RCU-defer-queue-failur.patch | 46 +++ ...ed-data-on-overwrite-when-RCU-is-con.patch | 323 ++++++++++++++++++ 3 files changed, 373 insertions(+), 1 deletion(-) create mode 100644 subprojects/packagefiles/dpdk/hash-avoid-leaking-entries-on-RCU-defer-queue-failur.patch create mode 100644 subprojects/packagefiles/dpdk/hash-free-replaced-data-on-overwrite-when-RCU-is-con.patch diff --git a/subprojects/dpdk.wrap b/subprojects/dpdk.wrap index 828812103..8827d3fd4 100644 --- a/subprojects/dpdk.wrap +++ b/subprojects/dpdk.wrap @@ -2,7 +2,10 @@ url = https://github.com/DPDK/dpdk-stable revision = v25.11 depth = 1 -diff_files = dpdk/iavf-fix-reported-max-TX-and-RX-queues-in-ethdev-inf.patch +diff_files = + dpdk/iavf-fix-reported-max-TX-and-RX-queues-in-ethdev-inf.patch, + dpdk/hash-avoid-leaking-entries-on-RCU-defer-queue-failur.patch, + 
dpdk/hash-free-replaced-data-on-overwrite-when-RCU-is-con.patch [provide] dependency_names = libdpdk diff --git a/subprojects/packagefiles/dpdk/hash-avoid-leaking-entries-on-RCU-defer-queue-failur.patch b/subprojects/packagefiles/dpdk/hash-avoid-leaking-entries-on-RCU-defer-queue-failur.patch new file mode 100644 index 000000000..62f266876 --- /dev/null +++ b/subprojects/packagefiles/dpdk/hash-avoid-leaking-entries-on-RCU-defer-queue-failur.patch @@ -0,0 +1,46 @@ +From 182f9223b9d407db31de3e54833572be554f08dc Mon Sep 17 00:00:00 2001 +From: Robin Jarry +Date: Fri, 13 Feb 2026 11:20:19 +0100 +Subject: [PATCH dpdk] hash: avoid leaking entries on RCU defer queue failure + +When rte_rcu_qsbr_dq_enqueue() fails in DQ mode, the deleted key slot +is never freed and becomes permanently leaked. Fall back to synchronous +reclamation instead of only logging an error. + +Cc: stable@dpdk.org +Fixes: 769b2de7fb52 ("hash: implement RCU resources reclamation") + +Signed-off-by: Robin Jarry +--- + lib/hash/rte_cuckoo_hash.c | 9 +++------ + 1 file changed, 3 insertions(+), 6 deletions(-) + +diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c +index da12825c6ed2..8189bde024be 100644 +--- a/lib/hash/rte_cuckoo_hash.c ++++ b/lib/hash/rte_cuckoo_hash.c +@@ -1870,18 +1870,15 @@ __rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key, + /* Key index where key is stored, adding the first dummy index */ + rcu_dq_entry.key_idx = ret + 1; + rcu_dq_entry.ext_bkt_idx = index; +- if (h->dq == NULL) { ++ if (h->dq == NULL || rte_rcu_qsbr_dq_enqueue(h->dq, &rcu_dq_entry) != 0) { + /* Wait for quiescent state change if using +- * RTE_HASH_QSBR_MODE_SYNC ++ * RTE_HASH_QSBR_MODE_SYNC or if RCU enqueue failed. 
+ */ + rte_rcu_qsbr_synchronize(h->hash_rcu_cfg->v, + RTE_QSBR_THRID_INVALID); + __hash_rcu_qsbr_free_resource((void *)((uintptr_t)h), + &rcu_dq_entry, 1); +- } else if (h->dq) +- /* Push into QSBR FIFO if using RTE_HASH_QSBR_MODE_DQ */ +- if (rte_rcu_qsbr_dq_enqueue(h->dq, &rcu_dq_entry) != 0) +- HASH_LOG(ERR, "Failed to push QSBR FIFO"); ++ } + } + __hash_rw_writer_unlock(h); + return ret; +-- +2.53.0 + diff --git a/subprojects/packagefiles/dpdk/hash-free-replaced-data-on-overwrite-when-RCU-is-con.patch b/subprojects/packagefiles/dpdk/hash-free-replaced-data-on-overwrite-when-RCU-is-con.patch new file mode 100644 index 000000000..968eceaac --- /dev/null +++ b/subprojects/packagefiles/dpdk/hash-free-replaced-data-on-overwrite-when-RCU-is-con.patch @@ -0,0 +1,323 @@ +From ec82d13d198825f2d49d9fe24568dee5905554e9 Mon Sep 17 00:00:00 2001 +From: Robin Jarry +Date: Thu, 12 Feb 2026 21:25:53 +0100 +Subject: [PATCH dpdk] hash: free replaced data on overwrite when RCU is + configured + +When rte_hash_add_key_data() overwrites an existing key, the old data +pointer is silently lost. With RCU-protected readers still potentially +accessing the old data, the application has no safe way to free it. + +When RCU is configured with a free_key_data_func callback, automatically +enqueue the old data for deferred freeing via the RCU defer queue on +overwrite. In SYNC mode, synchronize and call free_key_data_func +directly. 
+ +Cc: stable@dpdk.org +Fixes: 769b2de7fb52 ("hash: implement RCU resources reclamation") + +Signed-off-by: Robin Jarry +--- + lib/hash/rte_cuckoo_hash.c | 101 ++++++++++++++++++++++++++++--------- + lib/hash/rte_hash.h | 8 ++- + 2 files changed, 84 insertions(+), 25 deletions(-) + +diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c +index 8189bde024be..f487b3b725dd 100644 +--- a/lib/hash/rte_cuckoo_hash.c ++++ b/lib/hash/rte_cuckoo_hash.c +@@ -75,6 +75,7 @@ EAL_REGISTER_TAILQ(rte_hash_tailq) + struct __rte_hash_rcu_dq_entry { + uint32_t key_idx; + uint32_t ext_bkt_idx; ++ void *old_data; + }; + + RTE_EXPORT_SYMBOL(rte_hash_find_existing) +@@ -763,10 +764,11 @@ enqueue_slot_back(const struct rte_hash *h, + + /* Search a key from bucket and update its data. + * Writer holds the lock before calling this. ++ * If old_data is non-NULL, save the previous data pointer before overwriting. + */ + static inline int32_t + search_and_update(const struct rte_hash *h, void *data, const void *key, +- struct rte_hash_bucket *bkt, uint16_t sig) ++ struct rte_hash_bucket *bkt, uint16_t sig, void **old_data) + { + int i; + struct rte_hash_key *k, *keys = h->key_store; +@@ -776,6 +778,8 @@ search_and_update(const struct rte_hash *h, void *data, const void *key, + k = (struct rte_hash_key *) ((char *)keys + + bkt->key_idx[i] * h->key_entry_size); + if (rte_hash_cmp_eq(key, k->key, h) == 0) { ++ if (old_data != NULL) ++ *old_data = k->pdata; + /* The store to application data at *data + * should not leak after the store to pdata + * in the key store. i.e. 
pdata is the guard +@@ -807,7 +811,7 @@ rte_hash_cuckoo_insert_mw(const struct rte_hash *h, + struct rte_hash_bucket *sec_bkt, + const struct rte_hash_key *key, void *data, + uint16_t sig, uint32_t new_idx, +- int32_t *ret_val) ++ int32_t *ret_val, void **old_data) + { + unsigned int i; + struct rte_hash_bucket *cur_bkt; +@@ -817,7 +821,7 @@ rte_hash_cuckoo_insert_mw(const struct rte_hash *h, + /* Check if key was inserted after last check but before this + * protected region in case of inserting duplicated keys. + */ +- ret = search_and_update(h, data, key, prim_bkt, sig); ++ ret = search_and_update(h, data, key, prim_bkt, sig, old_data); + if (ret != -1) { + __hash_rw_writer_unlock(h); + *ret_val = ret; +@@ -825,7 +829,7 @@ rte_hash_cuckoo_insert_mw(const struct rte_hash *h, + } + + FOR_EACH_BUCKET(cur_bkt, sec_bkt) { +- ret = search_and_update(h, data, key, cur_bkt, sig); ++ ret = search_and_update(h, data, key, cur_bkt, sig, old_data); + if (ret != -1) { + __hash_rw_writer_unlock(h); + *ret_val = ret; +@@ -872,7 +876,7 @@ rte_hash_cuckoo_move_insert_mw(const struct rte_hash *h, + const struct rte_hash_key *key, void *data, + struct queue_node *leaf, uint32_t leaf_slot, + uint16_t sig, uint32_t new_idx, +- int32_t *ret_val) ++ int32_t *ret_val, void **old_data) + { + uint32_t prev_alt_bkt_idx; + struct rte_hash_bucket *cur_bkt; +@@ -892,7 +896,7 @@ rte_hash_cuckoo_move_insert_mw(const struct rte_hash *h, + /* Check if key was inserted after last check but before this + * protected region. 
+ */ +- ret = search_and_update(h, data, key, bkt, sig); ++ ret = search_and_update(h, data, key, bkt, sig, old_data); + if (ret != -1) { + __hash_rw_writer_unlock(h); + *ret_val = ret; +@@ -900,7 +904,7 @@ rte_hash_cuckoo_move_insert_mw(const struct rte_hash *h, + } + + FOR_EACH_BUCKET(cur_bkt, alt_bkt) { +- ret = search_and_update(h, data, key, cur_bkt, sig); ++ ret = search_and_update(h, data, key, cur_bkt, sig, old_data); + if (ret != -1) { + __hash_rw_writer_unlock(h); + *ret_val = ret; +@@ -997,7 +1001,8 @@ rte_hash_cuckoo_make_space_mw(const struct rte_hash *h, + struct rte_hash_bucket *sec_bkt, + const struct rte_hash_key *key, void *data, + uint16_t sig, uint32_t bucket_idx, +- uint32_t new_idx, int32_t *ret_val) ++ uint32_t new_idx, int32_t *ret_val, ++ void **old_data) + { + unsigned int i; + struct queue_node queue[RTE_HASH_BFS_QUEUE_MAX_LEN]; +@@ -1023,7 +1028,7 @@ rte_hash_cuckoo_make_space_mw(const struct rte_hash *h, + int32_t ret = rte_hash_cuckoo_move_insert_mw(h, + bkt, sec_bkt, key, data, + tail, i, sig, +- new_idx, ret_val); ++ new_idx, ret_val, old_data); + if (likely(ret != -1)) + return ret; + } +@@ -1076,6 +1081,29 @@ alloc_slot(const struct rte_hash *h, struct lcore_cache *cached_free_slots) + return slot_id; + } + ++/* ++ * When RCU is configured with a free function, auto-free the overwritten ++ * data pointer via RCU. 
++ */ ++static inline void ++__hash_rcu_auto_free_old_data(const struct rte_hash *h, void *old_data_val) ++{ ++ struct __rte_hash_rcu_dq_entry rcu_dq_entry = { ++ .key_idx = EMPTY_SLOT, /* sentinel value for __hash_rcu_qsbr_free_resource */ ++ .old_data = old_data_val, ++ }; ++ ++ if (h->hash_rcu_cfg == NULL || h->hash_rcu_cfg->free_key_data_func == NULL) ++ return; ++ ++ if (h->dq == NULL || rte_rcu_qsbr_dq_enqueue(h->dq, &rcu_dq_entry) != 0) { ++ /* SYNC mode or enqueue failed in DQ mode */ ++ rte_rcu_qsbr_synchronize(h->hash_rcu_cfg->v, RTE_QSBR_THRID_INVALID); ++ h->hash_rcu_cfg->free_key_data_func(h->hash_rcu_cfg->key_data_ptr, ++ old_data_val); ++ } ++} ++ + static inline int32_t + __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, + hash_sig_t sig, void *data) +@@ -1092,6 +1120,7 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, + struct lcore_cache *cached_free_slots = NULL; + int32_t ret_val; + struct rte_hash_bucket *last; ++ void *saved_old_data = NULL; + + short_sig = get_short_sig(sig); + prim_bucket_idx = get_prim_bucket_index(h, sig); +@@ -1103,18 +1132,20 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, + + /* Check if key is already inserted in primary location */ + __hash_rw_writer_lock(h); +- ret = search_and_update(h, data, key, prim_bkt, short_sig); ++ ret = search_and_update(h, data, key, prim_bkt, short_sig, ++ &saved_old_data); + if (ret != -1) { + __hash_rw_writer_unlock(h); +- return ret; ++ goto overwrite; + } + + /* Check if key is already inserted in secondary location */ + FOR_EACH_BUCKET(cur_bkt, sec_bkt) { +- ret = search_and_update(h, data, key, cur_bkt, short_sig); ++ ret = search_and_update(h, data, key, cur_bkt, short_sig, ++ &saved_old_data); + if (ret != -1) { + __hash_rw_writer_unlock(h); +- return ret; ++ goto overwrite; + } + } + +@@ -1153,33 +1184,39 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, + + /* Find an empty slot and 
insert */ + ret = rte_hash_cuckoo_insert_mw(h, prim_bkt, sec_bkt, key, data, +- short_sig, slot_id, &ret_val); ++ short_sig, slot_id, &ret_val, ++ &saved_old_data); + if (ret == 0) + return slot_id - 1; + else if (ret == 1) { + enqueue_slot_back(h, cached_free_slots, slot_id); +- return ret_val; ++ ret = ret_val; ++ goto overwrite; + } + + /* Primary bucket full, need to make space for new entry */ + ret = rte_hash_cuckoo_make_space_mw(h, prim_bkt, sec_bkt, key, data, +- short_sig, prim_bucket_idx, slot_id, &ret_val); ++ short_sig, prim_bucket_idx, slot_id, &ret_val, ++ &saved_old_data); + if (ret == 0) + return slot_id - 1; + else if (ret == 1) { + enqueue_slot_back(h, cached_free_slots, slot_id); +- return ret_val; ++ ret = ret_val; ++ goto overwrite; + } + + /* Also search secondary bucket to get better occupancy */ + ret = rte_hash_cuckoo_make_space_mw(h, sec_bkt, prim_bkt, key, data, +- short_sig, sec_bucket_idx, slot_id, &ret_val); ++ short_sig, sec_bucket_idx, slot_id, &ret_val, ++ &saved_old_data); + + if (ret == 0) + return slot_id - 1; + else if (ret == 1) { + enqueue_slot_back(h, cached_free_slots, slot_id); +- return ret_val; ++ ret = ret_val; ++ goto overwrite; + } + + /* if ext table not enabled, we failed the insertion */ +@@ -1193,17 +1230,21 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, + */ + __hash_rw_writer_lock(h); + /* We check for duplicates again since could be inserted before the lock */ +- ret = search_and_update(h, data, key, prim_bkt, short_sig); ++ ret = search_and_update(h, data, key, prim_bkt, short_sig, ++ &saved_old_data); + if (ret != -1) { + enqueue_slot_back(h, cached_free_slots, slot_id); +- goto failure; ++ __hash_rw_writer_unlock(h); ++ goto overwrite; + } + + FOR_EACH_BUCKET(cur_bkt, sec_bkt) { +- ret = search_and_update(h, data, key, cur_bkt, short_sig); ++ ret = search_and_update(h, data, key, cur_bkt, short_sig, ++ &saved_old_data); + if (ret != -1) { + enqueue_slot_back(h, cached_free_slots, 
slot_id); +- goto failure; ++ __hash_rw_writer_unlock(h); ++ goto overwrite; + } + } + +@@ -1263,6 +1304,11 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, + __hash_rw_writer_unlock(h); + return slot_id - 1; + ++overwrite: ++ if (saved_old_data != NULL) ++ __hash_rcu_auto_free_old_data(h, saved_old_data); ++ return ret; ++ + failure: + __hash_rw_writer_unlock(h); + return ret; +@@ -1566,6 +1612,15 @@ __hash_rcu_qsbr_free_resource(void *p, void *e, unsigned int n) + *((struct __rte_hash_rcu_dq_entry *)e); + + RTE_SET_USED(n); ++ ++ if (rcu_dq_entry.key_idx == EMPTY_SLOT) { ++ /* Overwrite case: free old data only, do not recycle slot */ ++ RTE_ASSERT(h->hash_rcu_cfg->free_key_data_func != NULL); ++ h->hash_rcu_cfg->free_key_data_func(h->hash_rcu_cfg->key_data_ptr, ++ rcu_dq_entry.old_data); ++ return; ++ } ++ + keys = h->key_store; + + k = (struct rte_hash_key *) ((char *)keys + +diff --git a/lib/hash/rte_hash.h b/lib/hash/rte_hash.h +index f692e0868dcf..e33f0aea0f5e 100644 +--- a/lib/hash/rte_hash.h ++++ b/lib/hash/rte_hash.h +@@ -226,7 +226,9 @@ rte_hash_max_key_id(const struct rte_hash *h); + * Thread safety can be enabled by setting flag during + * table creation. + * If the key exists already in the table, this API updates its value +- * with 'data' passed in this API. It is the responsibility of ++ * with 'data' passed in this API. If RCU is configured with a ++ * free_key_data_func callback, the old data is automatically ++ * deferred-freed via RCU. Otherwise, it is the responsibility of + * the application to manage any memory associated with the old value. + * The readers might still be using the old value even after this API + * has returned. +@@ -253,7 +255,9 @@ rte_hash_add_key_data(const struct rte_hash *h, const void *key, void *data); + * Thread safety can be enabled by setting flag during + * table creation. + * If the key exists already in the table, this API updates its value +- * with 'data' passed in this API. 
It is the responsibility of ++ * with 'data' passed in this API. If RCU is configured with a ++ * free_key_data_func callback, the old data is automatically ++ * deferred-freed via RCU. Otherwise, it is the responsibility of + * the application to manage any memory associated with the old value. + * The readers might still be using the old value even after this API + * has returned. +-- +2.53.0 + From e1b2aacc52778cb57b6481ed5754837818d77d50 Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Fri, 13 Feb 2026 20:17:58 +0100 Subject: [PATCH 2/9] output: fix stats and status check for VLAN interfaces When outputting on a VLAN interface, the local iface variable is reassigned to the parent interface after VLAN tag insertion. The subsequent UP status check and TX stats increment then use this reassigned pointer, accounting them on the parent instead of the original VLAN interface. Use d->iface which still references the original VLAN interface for the status check and stats increment. Fixes: 770168529e5c ("port: add dedicated port_tx functions") Signed-off-by: Robin Jarry Reviewed-by: Christophe Fontaine --- modules/infra/datapath/iface_output.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/modules/infra/datapath/iface_output.c b/modules/infra/datapath/iface_output.c index 8a20afae8..8a2a72378 100644 --- a/modules/infra/datapath/iface_output.c +++ b/modules/infra/datapath/iface_output.c @@ -58,6 +58,7 @@ static uint16_t iface_output_process( ) { uint16_t iface_id, vlan_id; const struct iface *iface; + struct iface_mbuf_data *d; struct rte_mbuf *m; rte_edge_t edge; @@ -65,8 +66,9 @@ static uint16_t iface_output_process( for (uint16_t i = 0; i < nb_objs; i++) { m = objs[i]; - iface = mbuf_data(m)->iface; - iface_id = iface->id; + d = iface_mbuf_data(m); + iface = d->iface; + iface_id = d->iface->id; if (iface->type == GR_IFACE_TYPE_VLAN) { const struct iface_info_vlan *vlan = iface_info_vlan(iface); @@ -86,15 +88,15 @@ static uint16_t 
iface_output_process( edge = NO_PARENT; goto next; } - if (!(iface->flags & GR_IFACE_F_UP)) { + if (!(d->iface->flags & GR_IFACE_F_UP)) { edge = IFACE_DOWN; goto next; } - IFACE_STATS_INC(tx, m, iface); + IFACE_STATS_INC(tx, m, d->iface); - iface_mbuf_data(m)->iface = iface; - iface_mbuf_data(m)->vlan_id = vlan_id; + d->iface = iface; + d->vlan_id = vlan_id; edge = iface_type_edges[iface->type]; next: rte_node_enqueue_x1(graph, node, edge, m); From 5ccc76ace2be8720597f95c0bd0e8ccea8a537ac Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Fri, 13 Feb 2026 20:19:36 +0100 Subject: [PATCH 3/9] output: preserve VLAN ID from input path Bridge members that are not VLAN interfaces (trunk ports) need to carry the VLAN ID through the output path so that the Ethernet header can be built with the correct 802.1Q tag. iface_output unconditionally clears d->vlan_id to zero for non-VLAN interfaces, discarding the VLAN ID set during input processing. Only set d->vlan_id when the output interface is actually a VLAN type. Clear it instead at the points where it is no longer needed: in eth_output after the Ethernet header has been built, and in the control plane injection path where no VLAN context exists. 
Signed-off-by: Robin Jarry Reviewed-by: Christophe Fontaine --- modules/infra/control/ctlplane.c | 3 ++- modules/infra/datapath/eth_output.c | 2 ++ modules/infra/datapath/iface_output.c | 11 +++-------- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/modules/infra/control/ctlplane.c b/modules/infra/control/ctlplane.c index a42b59a49..134d4a377 100644 --- a/modules/infra/control/ctlplane.c +++ b/modules/infra/control/ctlplane.c @@ -206,7 +206,8 @@ static void iface_cp_poll(evutil_socket_t, short reason, void *ev_iface) { } } - mbuf_data(mbuf)->iface = iface; + iface_mbuf_data(mbuf)->iface = iface; + iface_mbuf_data(mbuf)->vlan_id = 0; if (post_to_stack(iface_output, mbuf) < 0) { LOG(ERR, "post_to_stack: %s", strerror(errno)); diff --git a/modules/infra/datapath/eth_output.c b/modules/infra/datapath/eth_output.c index 8e073192e..3b93ddc07 100644 --- a/modules/infra/datapath/eth_output.c +++ b/modules/infra/datapath/eth_output.c @@ -51,6 +51,8 @@ eth_output_process(struct rte_graph *graph, struct rte_node *node, void **objs, eth->src_addr = src_mac; eth->ether_type = priv->ether_type; + iface_mbuf_data(mbuf)->vlan_id = 0; + edge = OUTPUT; next: if (gr_mbuf_is_traced(mbuf)) { diff --git a/modules/infra/datapath/iface_output.c b/modules/infra/datapath/iface_output.c index 8a2a72378..146419ebb 100644 --- a/modules/infra/datapath/iface_output.c +++ b/modules/infra/datapath/iface_output.c @@ -56,7 +56,6 @@ static uint16_t iface_output_process( void **objs, uint16_t nb_objs ) { - uint16_t iface_id, vlan_id; const struct iface *iface; struct iface_mbuf_data *d; struct rte_mbuf *m; @@ -68,20 +67,17 @@ static uint16_t iface_output_process( m = objs[i]; d = iface_mbuf_data(m); iface = d->iface; - iface_id = d->iface->id; if (iface->type == GR_IFACE_TYPE_VLAN) { const struct iface_info_vlan *vlan = iface_info_vlan(iface); - vlan_id = vlan->vlan_id; + d->vlan_id = vlan->vlan_id; iface = iface_from_id(vlan->parent_id); - } else { - vlan_id = 0; } if 
(gr_mbuf_is_traced(m)) { struct iface_output_trace_data *t = gr_mbuf_trace_add(m, node, sizeof(*t)); - t->iface_id = iface_id; - t->vlan_id = vlan_id; + t->iface_id = d->iface->id; + t->vlan_id = d->vlan_id; } if (iface == NULL) { @@ -96,7 +92,6 @@ static uint16_t iface_output_process( IFACE_STATS_INC(tx, m, d->iface); d->iface = iface; - d->vlan_id = vlan_id; edge = iface_type_edges[iface->type]; next: rte_node_enqueue_x1(graph, node, edge, m); From f190d7f0a8cc74053ecd648efeb7e1e881148418 Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Thu, 22 Jan 2026 18:56:44 +0100 Subject: [PATCH 4/9] main: move control queue from infra module A future change will require calling control_queue_push() from gr_event_push() which lives in main/. If control_queue stays in the infra module, this would create a circular dependency between main and infra. Move control_queue.c and gr_control_queue.h to main/ and replace the event-based drain mechanism with explicit control_queue_drain() calls from iface_destroy() and nexthop_destroy() after the RCU sync. 
Signed-off-by: Robin Jarry Reviewed-by: Christophe Fontaine --- .../infra/control => main}/control_queue.c | 19 ++----------------- .../infra/control => main}/gr_control_queue.h | 4 ++++ main/meson.build | 1 + modules/infra/control/iface.c | 8 +++++--- modules/infra/control/meson.build | 1 - modules/infra/control/nexthop.c | 7 ++++--- 6 files changed, 16 insertions(+), 24 deletions(-) rename {modules/infra/control => main}/control_queue.c (86%) rename {modules/infra/control => main}/gr_control_queue.h (88%) diff --git a/modules/infra/control/control_queue.c b/main/control_queue.c similarity index 86% rename from modules/infra/control/control_queue.c rename to main/control_queue.c index 4922a8901..24a62b86d 100644 --- a/modules/infra/control/control_queue.c +++ b/main/control_queue.c @@ -2,14 +2,12 @@ // Copyright (c) 2024 Christophe Fontaine #include -#include -#include #include #include #include -#include #include +#include #include #include @@ -88,23 +86,11 @@ static void *sem_wait_to_event(void *) { return NULL; } -// When interfaces or nexthops are deleted, drain the control queue -// to free any packets that reference the deleted object. This prevents -// callbacks from being invoked with dangling pointers. 
-static void event_handler(uint32_t event, const void *obj) { +void control_queue_drain(uint32_t event, const void *obj) { struct control_queue_drain drain = {event, obj}; control_queue_poll(0, 0, &drain); } -static struct gr_event_subscription event_sub = { - .callback = event_handler, - .ev_count = 2, - .ev_types = { - GR_EVENT_IFACE_REMOVE, - GR_EVENT_NEXTHOP_DELETE, - }, -}; - static void control_queue_init(struct event_base *ev_base) { atomic_init(&thread_shutdown, false); @@ -146,5 +132,4 @@ static struct gr_module module = { RTE_INIT(control_queue_module_init) { gr_register_module(&module); - gr_event_subscribe(&event_sub); } diff --git a/modules/infra/control/gr_control_queue.h b/main/gr_control_queue.h similarity index 88% rename from modules/infra/control/gr_control_queue.h rename to main/gr_control_queue.h index 81cab3446..6c701f4da 100644 --- a/modules/infra/control/gr_control_queue.h +++ b/main/gr_control_queue.h @@ -13,6 +13,10 @@ struct control_queue_drain { const void *obj; // Object being deleted }; +// Force drain the control queue from all items. +// Pass ev_type and deleted_obj to item callbacks so that they can ignore/free references. +void control_queue_drain(uint32_t ev_type, const void *deleted_obj); + // Callback definition to pass arbitrary data to be processed by the control plane event loop. // It is up to the function to free any data referenced by the pointer if necessary. 
// diff --git a/main/meson.build b/main/meson.build index 8086eb988..e02364094 100644 --- a/main/meson.build +++ b/main/meson.build @@ -3,6 +3,7 @@ src += files( 'api.c', + 'control_queue.c', 'dpdk.c', 'event.c', 'main.c', diff --git a/modules/infra/control/iface.c b/modules/infra/control/iface.c index 93311f08e..81380481a 100644 --- a/modules/infra/control/iface.c +++ b/modules/infra/control/iface.c @@ -2,6 +2,7 @@ // Copyright (c) 2024 Robin Jarry #include +#include #include #include #include @@ -586,10 +587,11 @@ int iface_destroy(struct iface *iface) { rte_rcu_qsbr_synchronize(gr_datapath_rcu(), RTE_QSBR_THRID_INVALID); - // Push IFACE_REMOVE event after RCU sync to ensure all datapath threads + // Drain the control queue after RCU sync to ensure all datapath threads // have seen that this iface is gone. At this point, only packets already - // in the control queue may still reference it. The event triggers - // a drain that frees those packets before type->fini() frees the iface. + // in the control queue may still reference it. 
+ control_queue_drain(GR_EVENT_IFACE_REMOVE, iface); + gr_event_push(GR_EVENT_IFACE_REMOVE, iface); type = iface_type_get(iface->type); diff --git a/modules/infra/control/meson.build b/modules/infra/control/meson.build index 34642ecb9..15cd88d04 100644 --- a/modules/infra/control/meson.build +++ b/modules/infra/control/meson.build @@ -3,7 +3,6 @@ src += files( 'bond.c', - 'control_queue.c', 'ctlplane.c', 'graph.c', 'group_nexthop.c', diff --git a/modules/infra/control/nexthop.c b/modules/infra/control/nexthop.c index 32c274fe0..227c1b1cb 100644 --- a/modules/infra/control/nexthop.c +++ b/modules/infra/control/nexthop.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: BSD-3-Clause // Copyright (c) 2024 Robin Jarry +#include #include #include #include @@ -478,11 +479,11 @@ void nexthop_destroy(struct nexthop *nh) { rte_rcu_qsbr_synchronize(gr_datapath_rcu(), RTE_QSBR_THRID_INVALID); - // Push NEXTHOP_DELETE event after RCU sync to ensure all datapath + // Drain the control queue after RCU sync to ensure all datapath // threads have seen that this nexthop is gone. At this point, only // packets already in the control queue may still reference it. - // The event triggers a drain that frees those packets before we free - // the nexthop memory. + control_queue_drain(GR_EVENT_NEXTHOP_DELETE, nh); + if (nh->origin != GR_NH_ORIGIN_INTERNAL) gr_event_push(GR_EVENT_NEXTHOP_DELETE, nh); From be44fa73364f84caf8937e2780ad92c14dde551a Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Fri, 23 Jan 2026 22:58:14 +0100 Subject: [PATCH 5/9] events: allow sending from datapath workers Event notifications must be processed on the control plane thread. Modify gr_event_push() to detect when it is called from a datapath worker and use the control queue to defer the notification to the control plane event loop. This enables datapath nodes (such as bridge MAC learning) to create MAC entries on the fly without blocking the control plane. 
Signed-off-by: Robin Jarry Reviewed-by: Christophe Fontaine --- main/event.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/main/event.c b/main/event.c index 7f6aabe42..ded4dbed5 100644 --- a/main/event.c +++ b/main/event.c @@ -4,10 +4,13 @@ #include "api.h" #include +#include #include #include #include +#include + #include STAILQ_HEAD(subscribers, gr_event_subscription); @@ -17,7 +20,7 @@ void gr_event_subscribe(struct gr_event_subscription *sub) { STAILQ_INSERT_TAIL(&subscribers, sub, next); } -void gr_event_push(uint32_t ev_type, const void *obj) { +static void notify_subscribers(void *obj, uintptr_t ev_type, const struct control_queue_drain *) { const struct gr_event_subscription *sub; STAILQ_FOREACH (sub, &subscribers, next) { @@ -28,9 +31,24 @@ void gr_event_push(uint32_t ev_type, const void *obj) { } } } + api_send_notifications(ev_type, obj); } +void gr_event_push(uint32_t ev_type, const void *obj) { + if (rte_lcore_has_role(rte_lcore_id(), ROLE_NON_EAL)) { + // Called from a dataplane worker thread. + // Defer the notification to the control plane thread. + if (control_queue_push(notify_subscribers, (void *)obj, ev_type) < 0) { + // XXX: add error stat if push fails? + } + } else { + // Called from the control plane thread. + // Notify subscribers immediately. + notify_subscribers((void *)obj, ev_type, NULL); + } +} + STAILQ_HEAD(serializers, gr_event_serializer); static struct serializers serializers = STAILQ_HEAD_INITIALIZER(serializers); From 3d484753239593110fa3a5b5d5c54b935e79f1bd Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Fri, 13 Feb 2026 00:05:52 +0100 Subject: [PATCH 6/9] l2: add bridge interface type Introduce a new l2 module with a bridge interface type that allows grouping multiple member interfaces (ports, VLANs, bonds) into a single L2 broadcast domain. 
The bridge maintains a list of members and supports configurable MAC learning, BUM traffic flooding, per-bridge ageing timer and a custom MAC address. Members are switched to GR_IFACE_MODE_BRIDGE when attached and restored to the default VRF when the bridge is destroyed. FDB management and datapath nodes for actual packet forwarding will follow in subsequent commits. Signed-off-by: Robin Jarry Reviewed-by: Christophe Fontaine --- modules/infra/api/gr_infra.h | 9 +- modules/infra/control/ctlplane.c | 1 + modules/infra/control/iface.c | 1 + modules/l2/api/gr_l2.h | 42 ++++++ modules/l2/api/meson.build | 6 + modules/l2/cli/bridge.c | 210 +++++++++++++++++++++++++++++ modules/l2/cli/meson.build | 6 + modules/l2/control/bridge.c | 153 +++++++++++++++++++++ modules/l2/control/gr_l2_control.h | 16 +++ modules/l2/control/meson.build | 8 ++ modules/l2/meson.build | 6 + modules/meson.build | 1 + 12 files changed, 457 insertions(+), 2 deletions(-) create mode 100644 modules/l2/api/gr_l2.h create mode 100644 modules/l2/api/meson.build create mode 100644 modules/l2/cli/bridge.c create mode 100644 modules/l2/cli/meson.build create mode 100644 modules/l2/control/bridge.c create mode 100644 modules/l2/control/gr_l2_control.h create mode 100644 modules/l2/control/meson.build create mode 100644 modules/l2/meson.build diff --git a/modules/infra/api/gr_infra.h b/modules/infra/api/gr_infra.h index 90ecb247c..88c6df8c1 100644 --- a/modules/infra/api/gr_infra.h +++ b/modules/infra/api/gr_infra.h @@ -21,6 +21,7 @@ typedef enum : uint8_t { GR_IFACE_TYPE_VLAN, GR_IFACE_TYPE_IPIP, GR_IFACE_TYPE_BOND, + GR_IFACE_TYPE_BRIDGE, GR_IFACE_TYPE_COUNT } gr_iface_type_t; @@ -56,6 +57,7 @@ typedef enum : uint8_t { GR_IFACE_MODE_VRF = 0, GR_IFACE_MODE_XC, GR_IFACE_MODE_BOND, + GR_IFACE_MODE_BRIDGE, GR_IFACE_MODE_COUNT } gr_iface_mode_t; @@ -433,8 +435,6 @@ struct gr_infra_cpu_affinity_set_req { // Helper function to convert iface type enum to string static inline const char 
*gr_iface_type_name(gr_iface_type_t type) { switch (type) { - case GR_IFACE_TYPE_UNDEF: - return "undef"; case GR_IFACE_TYPE_VRF: return "vrf"; case GR_IFACE_TYPE_PORT: @@ -445,6 +445,9 @@ static inline const char *gr_iface_type_name(gr_iface_type_t type) { return "ipip"; case GR_IFACE_TYPE_BOND: return "bond"; + case GR_IFACE_TYPE_BRIDGE: + return "bridge"; + case GR_IFACE_TYPE_UNDEF: case GR_IFACE_TYPE_COUNT: break; } @@ -460,6 +463,8 @@ static inline const char *gr_iface_mode_name(gr_iface_mode_t mode) { return "XC"; case GR_IFACE_MODE_BOND: return "bond"; + case GR_IFACE_MODE_BRIDGE: + return "bridge"; case GR_IFACE_MODE_COUNT: break; } diff --git a/modules/infra/control/ctlplane.c b/modules/infra/control/ctlplane.c index 134d4a377..2d9bcf41c 100644 --- a/modules/infra/control/ctlplane.c +++ b/modules/infra/control/ctlplane.c @@ -397,6 +397,7 @@ static void iface_event(uint32_t event, const void *obj) { case GR_IFACE_TYPE_PORT: case GR_IFACE_TYPE_VLAN: case GR_IFACE_TYPE_BOND: + case GR_IFACE_TYPE_BRIDGE: break; default: return; diff --git a/modules/infra/control/iface.c b/modules/infra/control/iface.c index 81380481a..4e2eed60d 100644 --- a/modules/infra/control/iface.c +++ b/modules/infra/control/iface.c @@ -34,6 +34,7 @@ static bool iface_type_valid(gr_iface_type_t type) { case GR_IFACE_TYPE_VLAN: case GR_IFACE_TYPE_IPIP: case GR_IFACE_TYPE_BOND: + case GR_IFACE_TYPE_BRIDGE: return true; case GR_IFACE_TYPE_UNDEF: case GR_IFACE_TYPE_COUNT: diff --git a/modules/l2/api/gr_l2.h b/modules/l2/api/gr_l2.h new file mode 100644 index 000000000..c79066704 --- /dev/null +++ b/modules/l2/api/gr_l2.h @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026 Robin Jarry + +#pragma once + +#include +#include +#include +#include + +#include + +#define GR_L2_MODULE 0xbabe + +// Bridge configuration flags. 
+typedef enum : uint16_t { + GR_BRIDGE_F_NO_FLOOD = GR_BIT16(0), + GR_BRIDGE_F_NO_LEARN = GR_BIT16(1), +} gr_bridge_flags_t; + +#define GR_BRIDGE_MAX_MEMBERS 64 +#define GR_BRIDGE_DEFAULT_AGEING 300 + +// Bridge reconfiguration attribute flags. +#define GR_BRIDGE_SET_AGEING_TIME GR_BIT64(32) +#define GR_BRIDGE_SET_FLAGS GR_BIT64(33) +#define GR_BRIDGE_SET_MAC GR_BIT64(34) + +struct __gr_iface_info_bridge_base { + uint16_t ageing_time; // Learned MAC ageing time in seconds (0 = default) + gr_bridge_flags_t flags; + struct rte_ether_addr mac; // Randomly generated if not set explicitly. + uint16_t n_members; +}; + +// Info structure for GR_IFACE_TYPE_BRIDGE interfaces. +// Only port, VLAN and bond interfaces can be members. +// Members are reassigned to the default VRF when the bridge is destroyed. +struct gr_iface_info_bridge { + BASE(__gr_iface_info_bridge_base); + uint16_t members[GR_BRIDGE_MAX_MEMBERS]; // Interface IDs of bridge members. +}; diff --git a/modules/l2/api/meson.build b/modules/l2/api/meson.build new file mode 100644 index 000000000..98416c3c9 --- /dev/null +++ b/modules/l2/api/meson.build @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2026 Robin Jarry + +api_headers += files('gr_l2.h') + +api_inc += include_directories('.') diff --git a/modules/l2/cli/bridge.c b/modules/l2/cli/bridge.c new file mode 100644 index 000000000..af03fc6ba --- /dev/null +++ b/modules/l2/cli/bridge.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026 Robin Jarry + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +static void bridge_show(struct gr_api_client *c, const struct gr_iface *iface) { + const struct gr_iface_info_bridge *bridge = PAYLOAD(iface); + + printf("flags: %sflood %slearn\n", + (bridge->flags & GR_BRIDGE_F_NO_FLOOD) ? "no_" : "", + (bridge->flags & GR_BRIDGE_F_NO_LEARN) ? 
"no_" : ""); + + printf("ageing_time: %u seconds\n", bridge->ageing_time); + printf("mac: " ETH_F "\n", &bridge->mac); + printf("members:\n"); + + for (uint8_t i = 0; i < bridge->n_members; i++) { + struct gr_iface *member = iface_from_id(c, bridge->members[i]); + if (member != NULL) + printf("- %s\n", member->name); + free(member); + } +} + +static void +bridge_list_info(struct gr_api_client *, const struct gr_iface *iface, char *buf, size_t len) { + const struct gr_iface_info_bridge *bridge = PAYLOAD(iface); + snprintf( + buf, + len, + "members=%u %sflood %slearn", + bridge->n_members, + (bridge->flags & GR_BRIDGE_F_NO_FLOOD) ? "no_" : "", + (bridge->flags & GR_BRIDGE_F_NO_LEARN) ? "no_" : "" + ); +} + +static struct cli_iface_type bridge_type = { + .type_id = GR_IFACE_TYPE_BRIDGE, + .show = bridge_show, + .list_info = bridge_list_info, +}; + +static uint64_t parse_bridge_args( + struct gr_api_client *c, + const struct ec_pnode *p, + struct gr_iface *iface, + bool update +) { + struct gr_iface_info_bridge *bridge = PAYLOAD(iface); + uint64_t set_attrs; + + set_attrs = parse_iface_args(c, p, iface, sizeof(*bridge), update); + + if (arg_str(p, "flood")) { + bridge->flags &= ~GR_BRIDGE_F_NO_FLOOD; + set_attrs |= GR_BRIDGE_SET_FLAGS; + } else if (arg_str(p, "no_flood")) { + bridge->flags |= GR_BRIDGE_F_NO_FLOOD; + set_attrs |= GR_BRIDGE_SET_FLAGS; + } + if (arg_str(p, "learn")) { + bridge->flags &= ~GR_BRIDGE_F_NO_LEARN; + set_attrs |= GR_BRIDGE_SET_FLAGS; + } else if (arg_str(p, "no_learn")) { + bridge->flags |= GR_BRIDGE_F_NO_LEARN; + set_attrs |= GR_BRIDGE_SET_FLAGS; + } + + if (arg_u16(p, "AGE", &bridge->ageing_time) == 0) + set_attrs |= GR_BRIDGE_SET_AGEING_TIME; + else if (errno != ENOENT) + return 0; + + if (arg_eth_addr(p, "MAC", &bridge->mac) == 0) + set_attrs |= GR_BRIDGE_SET_MAC; + else if (errno != ENOENT) + return 0; + + if (set_attrs == 0) + errno = EINVAL; + + return set_attrs; +} + +static cmd_status_t bridge_add(struct gr_api_client *c, const struct 
ec_pnode *p) { + const struct gr_infra_iface_add_resp *resp; + struct gr_infra_iface_add_req *req = NULL; + void *resp_ptr = NULL; + size_t len; + + len = sizeof(*req) + sizeof(struct gr_iface_info_bridge); + if ((req = calloc(1, len)) == NULL) + goto err; + + req->iface.type = GR_IFACE_TYPE_BRIDGE; + req->iface.flags = GR_IFACE_F_UP; + + if (parse_bridge_args(c, p, &req->iface, false) == 0) + goto err; + + if (gr_api_client_send_recv(c, GR_INFRA_IFACE_ADD, len, req, &resp_ptr) < 0) + goto err; + + free(req); + resp = resp_ptr; + printf("Created interface %u\n", resp->iface_id); + free(resp_ptr); + return CMD_SUCCESS; +err: + free(req); + return CMD_ERROR; +} + +static cmd_status_t bridge_set(struct gr_api_client *c, const struct ec_pnode *p) { + struct gr_infra_iface_set_req *req = NULL; + cmd_status_t ret = CMD_ERROR; + size_t len; + + len = sizeof(*req) + sizeof(struct gr_iface_info_bridge); + if ((req = calloc(1, len)) == NULL) + goto out; + + if ((req->set_attrs = parse_bridge_args(c, p, &req->iface, true)) == 0) + goto out; + + if (gr_api_client_send_recv(c, GR_INFRA_IFACE_SET, len, req, NULL) < 0) + goto out; + + ret = CMD_SUCCESS; +out: + free(req); + return ret; +} + +#define BRIDGE_ATTRS_CMD IFACE_ATTRS_CMD ",(ageing_time AGE),(mac MAC),FLOOD,LEARN" + +#define BRIDGE_ATTRS_ARGS \ + IFACE_ATTRS_ARGS, \ + with_help( \ + "Expiration time for learned MAC addresses.", \ + ec_node_uint("AGE", 0, UINT16_MAX, 10) \ + ), \ + with_help("Bridge ethernet address.", ec_node_re("MAC", ETH_ADDR_RE)), \ + EC_NODE_OR( \ + "FLOOD", \ + with_help( \ + "Enable flooding of BUM traffic.", ec_node_str("flood", "flood") \ + ), \ + with_help( \ + "Disable flooding of BUM traffic.", \ + ec_node_str("no_flood", "no_flood") \ + ) \ + ), \ + EC_NODE_OR( \ + "LEARN", \ + with_help("Enable MAC learning.", ec_node_str("learn", "learn")), \ + with_help("Disable MAC learning.", ec_node_str("no_learn", "no_learn")) \ + ) + +static int ctx_init(struct ec_node *root) { + int ret; + + ret = 
CLI_COMMAND( + INTERFACE_ADD_CTX(root), + "bridge NAME [" BRIDGE_ATTRS_CMD "]", + bridge_add, + "Create a new bridge interface.", + with_help("Interface name.", ec_node("any", "NAME")), + BRIDGE_ATTRS_ARGS + ); + if (ret < 0) + return ret; + ret = CLI_COMMAND( + INTERFACE_SET_CTX(root), + "bridge NAME (name NEW_NAME)," BRIDGE_ATTRS_CMD, + bridge_set, + "Modify bridge parameters.", + with_help( + "Interface name.", + ec_node_dyn("NAME", complete_iface_names, INT2PTR(GR_IFACE_TYPE_BRIDGE)) + ), + with_help("New interface name.", ec_node("any", "NEW_NAME")), + BRIDGE_ATTRS_ARGS + ); + if (ret < 0) + return ret; + + return 0; +} + +static struct cli_context ctx = { + .name = "bridge", + .init = ctx_init, +}; + +static void __attribute__((constructor, used)) init(void) { + cli_context_register(&ctx); + register_iface_type(&bridge_type); +} diff --git a/modules/l2/cli/meson.build b/modules/l2/cli/meson.build new file mode 100644 index 000000000..7401bdfae --- /dev/null +++ b/modules/l2/cli/meson.build @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2026 Robin Jarry + +cli_src += files( + 'bridge.c', +) diff --git a/modules/l2/control/bridge.c b/modules/l2/control/bridge.c new file mode 100644 index 000000000..a8bce94b1 --- /dev/null +++ b/modules/l2/control/bridge.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026 Robin Jarry + +#include +#include +#include +#include + +#include +#include + +static int bridge_reconfig( + struct iface *iface, + uint64_t set_attrs, + const struct gr_iface *, + const void *api_info +) { + struct iface_info_bridge *cur = iface_info_bridge(iface); + const struct gr_iface_info_bridge *next = api_info; + + if (set_attrs & GR_BRIDGE_SET_MAC) + iface_set_eth_addr(iface, &next->mac); + if (set_attrs & GR_BRIDGE_SET_FLAGS) + cur->flags = next->flags; + if (set_attrs & GR_BRIDGE_SET_AGEING_TIME) + cur->ageing_time = next->ageing_time ?: GR_BRIDGE_DEFAULT_AGEING; + + return 0; +} + +static 
int bridge_attach_member(struct iface *bridge, struct iface *member) { + struct iface_info_bridge *br = iface_info_bridge(bridge); + + switch (member->type) { + case GR_IFACE_TYPE_PORT: + case GR_IFACE_TYPE_VLAN: + case GR_IFACE_TYPE_BOND: + break; + default: + return errno_set(EMEDIUMTYPE); + } + + for (unsigned i = 0; i < br->n_members; i++) { + if (br->members[i] == member) + return 0; // already a member + } + + if (br->n_members == ARRAY_DIM(br->members)) + return errno_set(EUSERS); + + br->members[br->n_members++] = member; + member->domain_id = bridge->id; + member->vrf_id = GR_VRF_ID_UNDEF; + member->mode = GR_IFACE_MODE_BRIDGE; + + return 0; +} + +static int bridge_detach_member(struct iface *bridge, struct iface *member) { + struct iface_info_bridge *br = iface_info_bridge(bridge); + + for (unsigned i = 0; i < br->n_members; i++) { + if (br->members[i] == member) { + unsigned last = br->n_members - 1; + if (i < last) + br->members[i] = br->members[last]; + br->n_members--; + member->domain_id = GR_IFACE_ID_UNDEF; + member->mode = GR_IFACE_MODE_VRF; + break; + } + } + + return 0; +} + +static int bridge_fini(struct iface *iface) { + struct iface_info_bridge *bridge = iface_info_bridge(iface); + + for (unsigned i = 0; i < bridge->n_members; i++) { + struct iface *member = bridge->members[i]; + member->vrf_id = vrf_default_get_or_create(); + if (member->vrf_id != GR_VRF_ID_UNDEF) + vrf_incref(member->vrf_id); + member->domain_id = GR_IFACE_ID_UNDEF; + member->mode = GR_IFACE_MODE_VRF; + gr_event_push(GR_EVENT_IFACE_POST_RECONFIG, member); + } + + return 0; +} + +static int bridge_init(struct iface *iface, const void *api_info) { + int ret; + + iface->domain_id = iface->id; // for convenience, bridges are in their own domain + + ret = bridge_reconfig(iface, IFACE_SET_ALL, NULL, api_info); + if (ret < 0) { + bridge_fini(iface); + errno = -ret; + } + + return ret; +} + +static int bridge_get_eth_addr(const struct iface *iface, struct rte_ether_addr *mac) { + 
const struct iface_info_bridge *bridge = iface_info_bridge(iface); + *mac = bridge->mac; + return 0; +} + +static int bridge_set_eth_addr(struct iface *iface, const struct rte_ether_addr *mac) { + struct iface_info_bridge *bridge = iface_info_bridge(iface); + + if (rte_is_zero_ether_addr(mac)) { + rte_eth_random_addr(bridge->mac.addr_bytes); + } else { + bridge->mac = *mac; + } + + return 0; +} + +static void bridge_to_api(void *info, const struct iface *iface) { + const struct iface_info_bridge *bridge = iface_info_bridge(iface); + struct gr_iface_info_bridge *api = info; + + api->ageing_time = bridge->ageing_time; + api->flags = bridge->flags; + api->mac = bridge->mac; + api->n_members = bridge->n_members; + for (unsigned i = 0; i < bridge->n_members; i++) + api->members[i] = bridge->members[i]->id; +} + +static struct iface_type iface_type_bridge = { + .id = GR_IFACE_TYPE_BRIDGE, + .pub_size = sizeof(struct gr_iface_info_bridge), + .priv_size = sizeof(struct iface_info_bridge), + .init = bridge_init, + .reconfig = bridge_reconfig, + .fini = bridge_fini, + .attach_domain = bridge_attach_member, + .detach_domain = bridge_detach_member, + .get_eth_addr = bridge_get_eth_addr, + .set_eth_addr = bridge_set_eth_addr, + .to_api = bridge_to_api, +}; + +RTE_INIT(bridge_constructor) { + iface_type_register(&iface_type_bridge); +} diff --git a/modules/l2/control/gr_l2_control.h b/modules/l2/control/gr_l2_control.h new file mode 100644 index 000000000..e9a16239d --- /dev/null +++ b/modules/l2/control/gr_l2_control.h @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026 Robin Jarry + +#pragma once + +#include +#include + +#include + +// Internal bridge info structure. 
+GR_IFACE_INFO(GR_IFACE_TYPE_BRIDGE, iface_info_bridge, { + BASE(__gr_iface_info_bridge_base); + + struct iface *members[GR_BRIDGE_MAX_MEMBERS]; +}); diff --git a/modules/l2/control/meson.build b/modules/l2/control/meson.build new file mode 100644 index 000000000..28731231b --- /dev/null +++ b/modules/l2/control/meson.build @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2026 Robin Jarry + +src += files( + 'bridge.c', +) + +inc += include_directories('.') diff --git a/modules/l2/meson.build b/modules/l2/meson.build new file mode 100644 index 000000000..a696792d8 --- /dev/null +++ b/modules/l2/meson.build @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2026 Robin Jarry + +subdir('api') +subdir('cli') +subdir('control') diff --git a/modules/meson.build b/modules/meson.build index 1859be3d3..0b978a62b 100644 --- a/modules/meson.build +++ b/modules/meson.build @@ -5,6 +5,7 @@ subdir('infra') subdir('ip') subdir('ip6') subdir('ipip') +subdir('l2') subdir('l4') subdir('policy') subdir('srv6') From 1d7c7c713522f82755f825be5e13bb18e86f0a5a Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Fri, 13 Feb 2026 00:08:51 +0100 Subject: [PATCH 7/9] l2: add fdb support Implement a forwarding database backed by an RCU-protected rte_hash with a dedicated rte_mempool for entries. The hash is configured with a free_key_data_func callback so that deleted entries are automatically returned to the pool after RCU synchronization. The hash table uses RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF which provides fully lock-free concurrency for both readers and writers. This allows multiple datapath workers to call fdb_learn() concurrently without any serialization. If two workers (or a worker and the control plane) race to insert the same MAC+VLAN key, one will overwrite the other. 
With the RCU fix from commit 9b43d4469e75 ("dpdk: backport rte_hash RCU fixes"), the overwritten data pointer is properly defer-freed back to the pool instead of being leaked. Entries can be added/deleted/flushed via the API and can also be dynamically learned from the datapath via fdb_learn(). A periodic ageing timer evicts learned entries that have not been refreshed within the bridge ageing_time. Static entries configured by the user are never aged out. FDB entries associated with a member or bridge are automatically purged on detach or bridge destruction. The FDB table size defaults to 4096 entries and can be changed at runtime via the config set/get API, provided the table is empty. Signed-off-by: Robin Jarry Reviewed-by: Christophe Fontaine --- docs/meson.build | 7 +- modules/l2/api/gr_l2.h | 88 ++++++ modules/l2/cli/fdb.c | 352 ++++++++++++++++++++++ modules/l2/cli/meson.build | 1 + modules/l2/control/bridge.c | 3 + modules/l2/control/fdb.c | 452 +++++++++++++++++++++++++++++ modules/l2/control/gr_l2_control.h | 18 ++ modules/l2/control/meson.build | 1 + 8 files changed, 919 insertions(+), 3 deletions(-) create mode 100644 modules/l2/cli/fdb.c create mode 100644 modules/l2/control/fdb.c diff --git a/docs/meson.build b/docs/meson.build index 35f3a2ba5..478f5ef41 100644 --- a/docs/meson.build +++ b/docs/meson.build @@ -74,9 +74,10 @@ custom_target( # Individual command man pages # The list is hardcoded since we can't run grcli during meson configuration. 
grcli_commands = [ - 'address', 'affinity', 'conntrack', 'dnat44', 'events', 'graph', 'interface', - 'logging', 'nexthop', 'ping', 'ping6', 'route', 'router-advert', 'snat44', - 'stats', 'trace', 'traceroute', 'traceroute6', 'tunsrc', + 'address', 'affinity', 'conntrack', 'dnat44', 'events', 'fdb', 'graph', + 'interface', 'logging', 'nexthop', 'ping', 'ping6', 'route', + 'router-advert', 'snat44', 'stats', 'trace', 'traceroute', 'traceroute6', + 'tunsrc', ] foreach cmd : grcli_commands diff --git a/modules/l2/api/gr_l2.h b/modules/l2/api/gr_l2.h index c79066704..84f0c6dba 100644 --- a/modules/l2/api/gr_l2.h +++ b/modules/l2/api/gr_l2.h @@ -40,3 +40,91 @@ struct gr_iface_info_bridge { BASE(__gr_iface_info_bridge_base); uint16_t members[GR_BRIDGE_MAX_MEMBERS]; // Interface IDs of bridge members. }; + +// FDB (L2 Forwarding Database) management ///////////////////////////////////// + +// FDB entry flags. +typedef enum : uint8_t { + GR_FDB_F_STATIC = GR_BIT8(0), // User-configured, never aged out. + GR_FDB_F_LEARN = GR_BIT8(1), // Learned via local bridge. +} gr_fdb_flags_t; + +// Forwarding database entry associating a MAC+VLAN to a bridge member interface. +struct gr_fdb_entry { + uint16_t bridge_id; + struct rte_ether_addr mac; + uint16_t vlan_id; + uint16_t iface_id; // Updated automatically when a MAC moves between members. + gr_fdb_flags_t flags; + clock_t last_seen; // Refreshed on each datapath hit for learned entries. +}; + +enum { + GR_EVENT_FDB_ADD = EVENT_TYPE(GR_L2_MODULE, 0x0001), + GR_EVENT_FDB_DEL = EVENT_TYPE(GR_L2_MODULE, 0x0002), + GR_EVENT_FDB_UPDATE = EVENT_TYPE(GR_L2_MODULE, 0x0003), +}; + +// Add an FDB entry. The bridge_id is resolved from the member interface's domain. +// Entries without GR_FDB_F_STATIC are subject to ageing like learned entries. +#define GR_FDB_ADD REQUEST_TYPE(GR_L2_MODULE, 0x0001) + +struct gr_fdb_add_req { + struct gr_fdb_entry fdb; + bool exist_ok; // If true, update existing entry instead of returning EEXIST. 
+}; + +// struct gr_fdb_add_resp { }; + +// Delete an FDB entry by key. +#define GR_FDB_DEL REQUEST_TYPE(GR_L2_MODULE, 0x0002) + +struct gr_fdb_del_req { + uint16_t bridge_id; + struct rte_ether_addr mac; + uint16_t vlan_id; + bool missing_ok; // If true, ignore ENOENT. +}; + +// Flush FDB entries. All non-zero fields are ANDed as filters. +#define GR_FDB_FLUSH REQUEST_TYPE(GR_L2_MODULE, 0x0003) + +struct gr_fdb_flush_req { + uint16_t bridge_id; // GR_IFACE_ID_UNDEF to match all bridges. + struct rte_ether_addr mac; // Zero address to match all MACs. + uint16_t iface_id; // GR_IFACE_ID_UNDEF to match all interfaces. + gr_fdb_flags_t flags; // GR_FDB_F_STATIC: flush all. Otherwise, only dynamic entries. +}; + +// struct gr_fdb_flush_resp { }; + +// List FDB entries with optional filtering. +#define GR_FDB_LIST REQUEST_TYPE(GR_L2_MODULE, 0x0004) + +struct gr_fdb_list_req { + uint16_t bridge_id; // GR_IFACE_ID_UNDEF to list all bridges. + uint16_t iface_id; // GR_IFACE_ID_UNDEF to match all interfaces. + gr_fdb_flags_t flags; // GR_FDB_F_STATIC: only static. GR_FDB_F_LEARN: only learned. 0: list all entries. +}; + +STREAM_RESP(struct gr_fdb_entry); + +// Get FDB subsystem configuration and usage. +#define GR_FDB_CONFIG_GET REQUEST_TYPE(GR_L2_MODULE, 0x0005) + +// struct gr_fdb_config_get_req { }; + +struct gr_fdb_config_get_resp { + uint32_t max_entries; + uint32_t used_entries; +}; + +// Set FDB subsystem configuration. +// Changing max_entries requires the FDB to be empty (returns EBUSY otherwise).
+#define GR_FDB_CONFIG_SET REQUEST_TYPE(GR_L2_MODULE, 0x0006) + +struct gr_fdb_config_set_req { + uint32_t max_entries; +}; + +// struct gr_fdb_config_set_resp { }; diff --git a/modules/l2/cli/fdb.c b/modules/l2/cli/fdb.c new file mode 100644 index 000000000..4ab652086 --- /dev/null +++ b/modules/l2/cli/fdb.c @@ -0,0 +1,352 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026 Robin Jarry + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +static int arg_iface( + struct gr_api_client *c, + const struct ec_pnode *p, + const char *id, + gr_iface_type_t type, + uint16_t *iface_id +) { + const char *name = arg_str(p, id); + if (name == NULL) + return -errno; + + struct gr_iface *iface = iface_from_name(c, name); + if (iface == NULL) + return -errno; + + if (type != GR_IFACE_TYPE_UNDEF && iface->type != type) { + free(iface); + return errno_set(EMEDIUMTYPE); + } + + *iface_id = iface->id; + free(iface); + return 0; +} + +static cmd_status_t fdb_add(struct gr_api_client *c, const struct ec_pnode *p) { + struct gr_fdb_add_req req = {.exist_ok = true}; + + if (arg_iface(c, p, "IFACE", GR_IFACE_TYPE_UNDEF, &req.fdb.iface_id) < 0) + return CMD_ERROR; + if (arg_eth_addr(p, "MAC", &req.fdb.mac) < 0) + return CMD_ERROR; + if (arg_u16(p, "VLAN", &req.fdb.vlan_id) < 0 && errno != ENOENT) + return CMD_ERROR; + + req.fdb.flags = GR_FDB_F_STATIC; + + if (gr_api_client_send_recv(c, GR_FDB_ADD, sizeof(req), &req, NULL) < 0) + return CMD_ERROR; + + return CMD_SUCCESS; +} + +static cmd_status_t fdb_del(struct gr_api_client *c, const struct ec_pnode *p) { + struct gr_fdb_del_req req = {.missing_ok = true}; + + if (arg_iface(c, p, "BRIDGE", GR_IFACE_TYPE_BRIDGE, &req.bridge_id) < 0) + return CMD_ERROR; + if (arg_eth_addr(p, "MAC", &req.mac) < 0) + return CMD_ERROR; + if (arg_u16(p, "VLAN", &req.vlan_id) < 0 && errno != ENOENT) + return CMD_ERROR; + + if (gr_api_client_send_recv(c, GR_FDB_DEL, sizeof(req), 
&req, NULL) < 0) + return CMD_ERROR; + + return CMD_SUCCESS; +} + +static cmd_status_t fdb_flush(struct gr_api_client *c, const struct ec_pnode *p) { + struct gr_fdb_flush_req req = { + .bridge_id = GR_IFACE_ID_UNDEF, + .iface_id = GR_IFACE_ID_UNDEF, + .flags = GR_FDB_F_LEARN, + }; + + if (arg_str(p, "BRIDGE") != NULL) { + if (arg_iface(c, p, "BRIDGE", GR_IFACE_TYPE_BRIDGE, &req.bridge_id) < 0) + return CMD_ERROR; + } + if (arg_str(p, "IFACE") != NULL) { + if (arg_iface(c, p, "IFACE", GR_IFACE_TYPE_UNDEF, &req.iface_id) < 0) + return CMD_ERROR; + } + if (arg_eth_addr(p, "MAC", &req.mac) < 0 && errno != ENOENT) + return CMD_ERROR; + + if (arg_str(p, "all") != NULL) + req.flags |= GR_FDB_F_STATIC; + + if (gr_api_client_send_recv(c, GR_FDB_FLUSH, sizeof(req), &req, NULL) < 0) + return CMD_ERROR; + + return CMD_SUCCESS; +} + +static size_t fdb_format_flags(char *buf, size_t len, gr_fdb_flags_t flags) { + size_t n = 0; + buf[0] = 0; + if (flags & GR_FDB_F_LEARN) + SAFE_BUF(snprintf, len, "%slearn", n ? " " : ""); + if (flags & GR_FDB_F_STATIC) + SAFE_BUF(snprintf, len, "%sstatic", n ? 
" " : ""); +err: + return n; +} + +static cmd_status_t fdb_show(struct gr_api_client *c, const struct ec_pnode *p) { + struct gr_fdb_list_req req = { + .bridge_id = GR_IFACE_ID_UNDEF, + .iface_id = GR_IFACE_ID_UNDEF, + .flags = 0, + }; + const struct gr_fdb_entry *fdb; + char flags[128]; + int ret; + + if (arg_str(p, "BRIDGE") != NULL) { + if (arg_iface(c, p, "BRIDGE", GR_IFACE_TYPE_BRIDGE, &req.bridge_id) < 0) + return CMD_ERROR; + } + if (arg_str(p, "IFACE") != NULL) { + if (arg_iface(c, p, "IFACE", GR_IFACE_TYPE_UNDEF, &req.iface_id) < 0) + return CMD_ERROR; + } + if (arg_str(p, "static") != NULL) + req.flags |= GR_FDB_F_STATIC; + if (arg_str(p, "learn") != NULL) + req.flags |= GR_FDB_F_LEARN; + + struct libscols_table *table = scols_new_table(); + scols_table_new_column(table, "BRIDGE", 0, 0); + scols_table_new_column(table, "MAC", 0, 0); + scols_table_new_column(table, "VLAN", 0, 0); + scols_table_new_column(table, "IFACE", 0, 0); + scols_table_new_column(table, "FLAGS", 0, 0); + scols_table_new_column(table, "AGE", 0, SCOLS_FL_RIGHT); + scols_table_set_column_separator(table, " "); + + gr_api_client_stream_foreach (fdb, ret, c, GR_FDB_LIST, sizeof(req), &req) { + struct libscols_line *line = scols_table_new_line(table, NULL); + + struct gr_iface *bridge = iface_from_id(c, fdb->bridge_id); + scols_line_sprintf(line, 0, "%s", bridge ? bridge->name : "[deleted]"); + free(bridge); + + scols_line_sprintf(line, 1, ETH_F, &fdb->mac); + + if (fdb->vlan_id != 0) + scols_line_sprintf(line, 2, "%u", fdb->vlan_id); + + struct gr_iface *iface = iface_from_id(c, fdb->iface_id); + scols_line_sprintf(line, 3, "%s", iface ? iface->name : "[deleted]"); + free(iface); + + if (fdb_format_flags(flags, sizeof(flags), fdb->flags)) + scols_line_set_data(line, 4, flags); + + scols_line_sprintf( + line, 5, "%lds", (gr_clock_us() - fdb->last_seen) / CLOCKS_PER_SEC + ); + } + + scols_print_table(table); + scols_unref_table(table); + + return ret < 0 ? 
CMD_ERROR : CMD_SUCCESS; +} + +static cmd_status_t fdb_config_set(struct gr_api_client *c, const struct ec_pnode *p) { + struct gr_fdb_config_set_req req; + + if (arg_u32(p, "MAX", &req.max_entries) < 0) + return CMD_ERROR; + + if (gr_api_client_send_recv(c, GR_FDB_CONFIG_SET, sizeof(req), &req, NULL) < 0) + return CMD_ERROR; + + return CMD_SUCCESS; +} + +static cmd_status_t fdb_config_show(struct gr_api_client *c, const struct ec_pnode *) { + const struct gr_fdb_config_get_resp *resp; + void *resp_ptr = NULL; + float used = 0.0; + + if (gr_api_client_send_recv(c, GR_FDB_CONFIG_GET, 0, NULL, &resp_ptr) < 0) + return CMD_ERROR; + + resp = resp_ptr; + if (resp->max_entries != 0) + used = (100.0 * (float)resp->used_entries) / (float)resp->max_entries; + printf("used %u (%.01f%%)\n", resp->used_entries, used); + printf("max %u\n", resp->max_entries); + free(resp_ptr); + + return CMD_SUCCESS; +} + +#define FDB_CTX(root) CLI_CONTEXT(root, CTX_ARG("fdb", "Forwarding database.")) + +static int ctx_init(struct ec_node *root) { + int ret; + + ret = CLI_COMMAND( + FDB_CTX(root), + "add MAC iface IFACE [vlan VLAN]", + fdb_add, + "Add a static FDB entry.", + with_help("MAC address.", ec_node_re("MAC", ETH_ADDR_RE)), + with_help( + "Bridge member interface.", + ec_node_dyn("IFACE", complete_iface_names, INT2PTR(GR_IFACE_TYPE_UNDEF)) + ), + with_help("VLAN ID.", ec_node_uint("VLAN", 1, 4094, 10)) + ); + if (ret < 0) + return ret; + + ret = CLI_COMMAND( + FDB_CTX(root), + "del bridge BRIDGE MAC [vlan VLAN]", + fdb_del, + "Delete an FDB entry.", + with_help( + "Bridge interface.", + ec_node_dyn("BRIDGE", complete_iface_names, INT2PTR(GR_IFACE_TYPE_BRIDGE)) + ), + with_help("MAC address.", ec_node_re("MAC", ETH_ADDR_RE)), + with_help("VLAN ID.", ec_node_uint("VLAN", 1, 4094, 10)) + ); + if (ret < 0) + return ret; + + ret = CLI_COMMAND( + FDB_CTX(root), + "flush [(bridge BRIDGE),(iface IFACE),(mac MAC),(all)]", + fdb_flush, + "Flush dynamic FDB entries.", + with_help( + "Flush only 
entries on this bridge.", + ec_node_dyn("BRIDGE", complete_iface_names, INT2PTR(GR_IFACE_TYPE_BRIDGE)) + ), + with_help( + "Flush only entries on this interface.", + ec_node_dyn("IFACE", complete_iface_names, INT2PTR(GR_IFACE_TYPE_UNDEF)) + ), + with_help( + "Flush only entries matching this MAC address.", + ec_node_re("MAC", ETH_ADDR_RE) + ), + with_help("Flush all entries including static.", ec_node_str("all", "all")) + ); + if (ret < 0) + return ret; + + ret = CLI_COMMAND( + FDB_CTX(root), + "config set max MAX", + fdb_config_set, + "Change the FDB configuration.", + with_help("Maximum number of FDB entries.", ec_node_uint("MAX", 1, UINT32_MAX, 10)) + ); + if (ret < 0) + return ret; + + ret = CLI_COMMAND( + FDB_CTX(root), + "config [show]", + fdb_config_show, + "Show the current FDB configuration." + ); + if (ret < 0) + return ret; + + ret = CLI_COMMAND( + FDB_CTX(root), + "[show] [(bridge BRIDGE),(iface IFACE),(static|learn)]", + fdb_show, + "Show FDB entries.", + with_help( + "Show only entries on this bridge.", + ec_node_dyn("BRIDGE", complete_iface_names, INT2PTR(GR_IFACE_TYPE_BRIDGE)) + ), + with_help( + "Show only entries on this interface.", + ec_node_dyn("IFACE", complete_iface_names, INT2PTR(GR_IFACE_TYPE_UNDEF)) + ), + with_help("Show only static entries.", ec_node_str("static", "static")), + with_help("Show only learned entries.", ec_node_str("learn", "learn")) + ); + if (ret < 0) + return ret; + + return 0; +} + +static struct cli_context ctx = { + .name = "fdb", + .init = ctx_init, +}; + +static void fdb_event_print(uint32_t event, const void *obj) { + const struct gr_fdb_entry *fdb = obj; + const char *action; + char flags[128]; + + switch (event) { + case GR_EVENT_FDB_ADD: + action = "add"; + break; + case GR_EVENT_FDB_DEL: + action = "del"; + break; + case GR_EVENT_FDB_UPDATE: + action = "update"; + break; + default: + action = "?"; + break; + } + + printf("fdb %s: bridge=%u " ETH_F, action, fdb->bridge_id, &fdb->mac); + if (fdb->vlan_id != 0) + 
printf(" vlan=%u", fdb->vlan_id); + printf(" iface=%u", fdb->iface_id); + if (fdb_format_flags(flags, sizeof(flags), fdb->flags)) + printf(" %s", flags); + printf("\n"); +} + +static struct cli_event_printer printer = { + .print = fdb_event_print, + .ev_count = 3, + .ev_types = { + GR_EVENT_FDB_ADD, + GR_EVENT_FDB_DEL, + GR_EVENT_FDB_UPDATE, + }, +}; + +static void __attribute__((constructor, used)) init(void) { + cli_context_register(&ctx); + cli_event_printer_register(&printer); +} diff --git a/modules/l2/cli/meson.build b/modules/l2/cli/meson.build index 7401bdfae..53b9e5699 100644 --- a/modules/l2/cli/meson.build +++ b/modules/l2/cli/meson.build @@ -3,4 +3,5 @@ cli_src += files( 'bridge.c', + 'fdb.c', ) diff --git a/modules/l2/control/bridge.c b/modules/l2/control/bridge.c index a8bce94b1..208f37112 100644 --- a/modules/l2/control/bridge.c +++ b/modules/l2/control/bridge.c @@ -67,6 +67,7 @@ static int bridge_detach_member(struct iface *bridge, struct iface *member) { br->n_members--; member->domain_id = GR_IFACE_ID_UNDEF; member->mode = GR_IFACE_MODE_VRF; + fdb_purge_iface(member->id); break; } } @@ -87,6 +88,8 @@ static int bridge_fini(struct iface *iface) { gr_event_push(GR_EVENT_IFACE_POST_RECONFIG, member); } + fdb_purge_bridge(iface->id); + return 0; } diff --git a/modules/l2/control/fdb.c b/modules/l2/control/fdb.c new file mode 100644 index 000000000..dfea4339c --- /dev/null +++ b/modules/l2/control/fdb.c @@ -0,0 +1,452 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026 Robin Jarry + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct fdb_key { + uint16_t bridge_id; + uint16_t vlan_id; + struct rte_ether_addr mac; +}; + +static unsigned fdb_max_entries; +static struct rte_hash *fdb_hash; +static struct rte_mempool *fdb_pool; + +static void fdb_free_entry(void *pool, void *fdb) { + gr_event_push(GR_EVENT_FDB_DEL, fdb); + rte_mempool_put(pool, fdb); +} + +static int fdb_reconfig(unsigned 
max_entries) { + char name[64]; + snprintf(name, sizeof(name), "fdb-%u", max_entries); + + struct rte_hash_parameters params = { + .name = name, + .socket_id = SOCKET_ID_ANY, + .key_len = sizeof(struct fdb_key), + .entries = max_entries, + .extra_flag = RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF + | RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT, + }; + + struct rte_hash *h = rte_hash_create(¶ms); + if (h == NULL) + return errno_log(rte_errno, "rte_hash_create"); + + struct rte_mempool *p = rte_mempool_create( + name, + rte_align32pow2(max_entries) - 1, + sizeof(struct gr_fdb_entry), + 0, // cache size + 0, // priv size + NULL, // mp_init + NULL, // mp_init_arg + NULL, // obj_init + NULL, // obj_init_arg + SOCKET_ID_ANY, + 0 // flags + ); + if (p == NULL) { + rte_hash_free(h); + return errno_log(rte_errno, "rte_mempool_create"); + } + + struct rte_hash_rcu_config conf = { + .v = gr_datapath_rcu(), + .mode = RTE_HASH_QSBR_MODE_SYNC, + .free_key_data_func = fdb_free_entry, + .key_data_ptr = p, + }; + if (rte_hash_rcu_qsbr_add(h, &conf) < 0) { + rte_hash_free(h); + rte_mempool_free(p); + return errno_log(rte_errno, "rte_hash_rcu_qsbr_add"); + } + + struct rte_hash *tmp_h = fdb_hash; + struct rte_mempool *tmp_p = fdb_pool; + fdb_hash = h; + fdb_pool = p; + + rte_rcu_qsbr_synchronize(gr_datapath_rcu(), rte_lcore_id()); + + rte_hash_free(tmp_h); + rte_mempool_free(tmp_p); + + fdb_max_entries = max_entries; + + return 0; +} + +const struct gr_fdb_entry * +fdb_lookup(uint16_t bridge_id, const struct rte_ether_addr *mac, uint16_t vlan_id) { + const struct fdb_key key = {bridge_id, vlan_id, *mac}; + void *data; + + if (rte_hash_lookup_data(fdb_hash, &key, &data) < 0) + return errno_set_null(ENOENT); + + return data; +} + +// Learn a new FDB entry or refresh its last_seen timestamp. 
+// Learn the source address of a frame received on iface_id, or refresh the
+// entry already recorded for (bridge_id, vlan_id, mac).
+//
+// Unknown addresses get a new GR_FDB_F_LEARN entry taken from fdb_pool and
+// announced with GR_EVENT_FDB_ADD; the address is simply not learned when the
+// pool or the hash table is full. Known addresses get their last_seen
+// timestamp refreshed and, for learned entries only, follow the address to its
+// new interface (GR_EVENT_FDB_UPDATE).
+void fdb_learn(
+	uint16_t bridge_id,
+	uint16_t iface_id,
+	const struct rte_ether_addr *mac,
+	uint16_t vlan_id
+) {
+	const struct fdb_key key = {bridge_id, vlan_id, *mac};
+	struct gr_fdb_entry *fdb;
+	void *data;
+
+	if (rte_hash_lookup_data(fdb_hash, &key, &data) < 0) {
+		if (rte_mempool_get(fdb_pool, &data) < 0)
+			return; // pool exhausted
+
+		fdb = data;
+		fdb->bridge_id = bridge_id;
+		fdb->vlan_id = vlan_id;
+		fdb->mac = *mac;
+		fdb->flags = GR_FDB_F_LEARN;
+		fdb->iface_id = iface_id;
+		// Mempool objects are handed out as they were returned: set the
+		// timestamp before the entry is inserted and announced so that
+		// nobody reads the one left by a previous user of the object.
+		fdb->last_seen = gr_clock_us();
+
+		if (rte_hash_add_key_data(fdb_hash, &key, fdb) < 0) {
+			// no space left in hash
+			rte_mempool_put(fdb_pool, fdb);
+			return;
+		}
+
+		gr_event_push(GR_EVENT_FDB_ADD, fdb);
+		return;
+	}
+
+	fdb = data;
+	fdb->last_seen = gr_clock_us();
+
+	if ((fdb->flags & GR_FDB_F_LEARN) && fdb->iface_id != iface_id) {
+		// update in case the mac address has moved
+		fdb->iface_id = iface_id;
+		gr_event_push(GR_EVENT_FDB_UPDATE, fdb);
+	}
+}
+
+// Delete every FDB entry whose output interface is iface_id.
+void fdb_purge_iface(uint16_t iface_id) {
+	struct gr_fdb_entry *fdb;
+	uint32_t next = 0;
+	const void *key;
+	void *data;
+
+	while (rte_hash_iterate(fdb_hash, &key, &data, &next) >= 0) {
+		fdb = data;
+		if (fdb->iface_id == iface_id) {
+			rte_hash_del_key(fdb_hash, key);
+		}
+	}
+}
+
+// Delete every FDB entry that belongs to bridge_id.
+void fdb_purge_bridge(uint16_t bridge_id) {
+	struct gr_fdb_entry *fdb;
+	uint32_t next = 0;
+	const void *key;
+	void *data;
+
+	while (rte_hash_iterate(fdb_hash, &key, &data, &next) >= 0) {
+		fdb = data;
+		if (fdb->bridge_id == bridge_id) {
+			rte_hash_del_key(fdb_hash, key);
+		}
+	}
+}
+
+// API handler: add an FDB entry, or replace an existing one when the request
+// has exist_ok set.
+//
+// Only GR_FDB_F_STATIC may be set in the request flags (EINVAL otherwise).
+// The bridge is resolved from the domain of the requested interface;
+// EMEDIUMTYPE is returned when that domain is missing or is not a bridge.
+static struct api_out fdb_add(const void *request, struct api_ctx *) {
+	const struct gr_fdb_add_req *req = request;
+	const struct iface *iface;
+	struct gr_fdb_entry *e;
+	void *data;
+	int ret;
+
+	if (req->fdb.flags & ~GR_FDB_F_STATIC)
+		return api_out(EINVAL, 0, NULL);
+
+	iface = iface_from_id(req->fdb.iface_id);
+	if (iface == NULL)
+		return api_out(errno, 0, NULL);
+
+	// from here on, iface is the domain (bridge) of the requested interface
+	iface = iface_from_id(iface->domain_id);
+	if (iface == NULL)
+		return api_out(EMEDIUMTYPE, 0, NULL);
+
+	if (iface->type != GR_IFACE_TYPE_BRIDGE)
+		return api_out(EMEDIUMTYPE, 0, NULL);
+
+	const struct fdb_key key = {iface->id, req->fdb.vlan_id, req->fdb.mac};
+
+	if (rte_hash_lookup_data(fdb_hash, &key, &data) < 0) {
+		if ((ret = rte_mempool_get(fdb_pool, &data)) < 0)
+			return api_out(-ret, 0, NULL);
+
+		e = data;
+		*e = req->fdb;
+		e->bridge_id = iface->id;
+		e->last_seen = gr_clock_us();
+
+		if ((ret = rte_hash_add_key_data(fdb_hash, &key, data)) < 0) {
+			rte_mempool_put(fdb_pool, e);
+			return api_out(-ret, 0, NULL);
+		}
+
+		gr_event_push(GR_EVENT_FDB_ADD, e);
+	} else if (req->exist_ok) {
+		// overwrite the existing entry in place
+		e = data;
+		*e = req->fdb;
+		e->bridge_id = iface->id;
+		e->last_seen = gr_clock_us();
+
+		gr_event_push(GR_EVENT_FDB_UPDATE, e);
+	} else {
+		return api_out(EEXIST, 0, NULL);
+	}
+
+	return api_out(0, 0, NULL);
+}
+
+static struct gr_api_handler add_handler = {
+	.name = "fdb add",
+	.request_type = GR_FDB_ADD,
+	.callback = fdb_add,
+};
+
+// API handler: delete the entry matching (bridge_id, vlan_id, mac).
+// A missing entry is not an error when the request has missing_ok set.
+static struct api_out fdb_del(const void *request, struct api_ctx *) {
+	const struct gr_fdb_del_req *req = request;
+	const struct fdb_key key = {req->bridge_id, req->vlan_id, req->mac};
+	int ret;
+
+	ret = rte_hash_del_key(fdb_hash, &key);
+	if (ret == -ENOENT && req->missing_ok)
+		ret = 0;
+	else if (ret > 0)
+		ret = 0; // positive values are key positions, not errors
+
+	return api_out(-ret, 0, NULL);
+}
+
+static struct gr_api_handler del_handler = {
+	.name = "fdb del",
+	.request_type = GR_FDB_DEL,
+	.callback = fdb_del,
+};
+
+// Tell whether an entry passes the filters of a list/flush request.
+// A flag set in flags must also be set on the entry. GR_IFACE_ID_UNDEF
+// disables the bridge/interface filters; a NULL or all-zero mac disables the
+// address filter.
+static inline bool fdb_match(
+	const struct gr_fdb_entry *e,
+	gr_fdb_flags_t flags,
+	uint16_t bridge_id,
+	uint16_t iface_id,
+	const struct rte_ether_addr *mac
+) {
+	if ((flags & GR_FDB_F_STATIC) && !(e->flags & GR_FDB_F_STATIC))
+		return false;
+	if ((flags & GR_FDB_F_LEARN) && !(e->flags & GR_FDB_F_LEARN))
+		return false;
+	if (bridge_id != GR_IFACE_ID_UNDEF && e->bridge_id != bridge_id)
+		return false;
+	if (iface_id != GR_IFACE_ID_UNDEF && e->iface_id != iface_id)
+		return false;
+	if (mac != NULL && !rte_is_zero_ether_addr(mac) && !rte_is_same_ether_addr(&e->mac, mac))
+		return false;
+	return true;
+}
+
+// API handler: delete all entries matching the request filters.
+// The walk stops at the first deletion error, which is returned to the client.
+static struct api_out fdb_flush(const void *request, struct api_ctx *) {
+	const struct gr_fdb_flush_req *req = request;
+	uint32_t next = 0;
+	const void *key;
+	void *data;
+	int ret;
+
+	while (rte_hash_iterate(fdb_hash, &key, &data, &next) >= 0) {
+		if (!fdb_match(data, req->flags, req->bridge_id, req->iface_id, &req->mac))
+			continue;
+
+		ret = rte_hash_del_key(fdb_hash, key);
+		if (ret < 0)
+			return api_out(-ret, 0, NULL);
+	}
+
+	return api_out(0, 0, NULL);
+}
+
+static struct gr_api_handler flush_handler = {
+	.name = "fdb flush",
+	.request_type = GR_FDB_FLUSH,
+	.callback = fdb_flush,
+};
+
+// API handler: send every entry matching the request filters to the client,
+// one api_send() per entry. The address filter is not used for listing.
+static struct api_out fdb_list(const void *request, struct api_ctx *ctx) {
+	const struct gr_fdb_list_req *req = request;
+	struct gr_fdb_entry *fdb;
+	uint32_t next = 0;
+	const void *key;
+	void *data;
+
+	while (rte_hash_iterate(fdb_hash, &key, &data, &next) >= 0) {
+		if (!fdb_match(data, req->flags, req->bridge_id, req->iface_id, NULL))
+			continue;
+
+		fdb = data;
+		api_send(ctx, sizeof(*fdb), fdb);
+	}
+
+	return api_out(0, 0, NULL);
+}
+
+static struct gr_api_handler list_handler = {
+	.name = "fdb list",
+	.request_type = GR_FDB_LIST,
+	.callback = fdb_list,
+};
+
+// API handler: report the configured table size and the number of entries
+// currently stored. The response is heap allocated and handed to api_out().
+static struct api_out fdb_config_get(const void * /*request*/, struct api_ctx *) {
+	struct gr_fdb_config_get_resp *resp = malloc(sizeof(*resp));
+
+	if (resp == NULL)
+		return api_out(ENOMEM, 0, NULL);
+
+	resp->max_entries = fdb_max_entries;
+	resp->used_entries = rte_hash_count(fdb_hash);
+
+	return api_out(0, sizeof(*resp), resp);
+}
+
+static struct gr_api_handler config_get_handler = {
+	.name = "fdb config get",
+	.request_type = GR_FDB_CONFIG_GET,
+	.callback = fdb_config_get,
+};
+
+// API handler: change the maximum number of entries.
+// Zero is rejected (EINVAL) and the size can only change while the table is
+// empty (EBUSY otherwise).
+static struct api_out fdb_config_set(const void *request, struct api_ctx *) {
+	const struct gr_fdb_config_set_req *req = request;
+
+	if (req->max_entries == 0)
+		return api_out(EINVAL, 0, NULL);
+
+	if (req->max_entries != fdb_max_entries) {
+		if (rte_hash_count(fdb_hash) > 0)
+			return api_out(EBUSY, 0, NULL);
+
+		if (fdb_reconfig(req->max_entries) < 0)
+			return api_out(errno, 0, NULL);
+
+		fdb_max_entries = req->max_entries;
+	}
+
+	return api_out(0, 0, NULL);
+}
+
+static struct gr_api_handler config_set_handler = {
+	.name = "fdb config set",
+	.request_type = GR_FDB_CONFIG_SET,
+	.callback = fdb_config_set,
+};
+
+// All three FDB events carry a struct gr_fdb_entry.
+static struct gr_event_serializer serializer = {
+	.size = sizeof(struct gr_fdb_entry),
+	.ev_count = 3,
+	.ev_types = {
+		GR_EVENT_FDB_ADD,
+		GR_EVENT_FDB_DEL,
+		GR_EVENT_FDB_UPDATE,
+	},
+};
+
+// Timer callback: delete learned, non static entries that have not been seen
+// for longer than the ageing time of their bridge. GR_BRIDGE_DEFAULT_AGEING is
+// used when the bridge cannot be found.
+static void fdb_ageing_cb(evutil_socket_t, short /*what*/, void * /*priv*/) {
+	const struct iface *bridge;
+	struct gr_fdb_entry *fdb;
+	uint32_t next = 0;
+	uint16_t max_age;
+	const void *key;
+	clock_t now;
+	void *data;
+	time_t age;
+
+	now = gr_clock_us();
+
+	while (rte_hash_iterate(fdb_hash, &key, &data, &next) >= 0) {
+		fdb = data;
+
+		if ((fdb->flags & GR_FDB_F_STATIC) || !(fdb->flags & GR_FDB_F_LEARN))
+			continue;
+
+		age = (now - fdb->last_seen) / CLOCKS_PER_SEC;
+
+		bridge = iface_from_id(fdb->bridge_id);
+		if (bridge != NULL)
+			max_age = iface_info_bridge(bridge)->ageing_time;
+		else
+			max_age = GR_BRIDGE_DEFAULT_AGEING;
+
+		if (age > max_age) {
+			LOG(DEBUG,
+			    ETH_F " vlan=%u bridge=%u iface=%u: aged out (%ld sec)",
+			    &fdb->mac,
+			    fdb->vlan_id,
+			    fdb->bridge_id,
+			    fdb->iface_id,
+			    age);
+			rte_hash_del_key(fdb_hash, key);
+		}
+	}
+}
+
+static struct event *ageing_timer;
+
+#define FDB_DEFAULT_MAX_ENTRIES 4096
+
+// Module init: size the table with the default capacity and arm the ageing
+// timer with a one second period. Any failure aborts.
+static void fdb_init(struct event_base *base) {
+	if (fdb_reconfig(FDB_DEFAULT_MAX_ENTRIES) < 0)
+		ABORT("fdb_reconfig failed");
+
+	ageing_timer = event_new(base, -1, EV_PERSIST | EV_FINALIZE, fdb_ageing_cb, NULL);
+	if (ageing_timer == NULL)
+		ABORT("event_new() failed");
+
+	if (event_add(ageing_timer, &(struct timeval) {.tv_sec = 1}) < 0)
+		ABORT("event_add() failed");
+}
+
+// Module teardown: stop the ageing timer, then release the hash and the pool.
+static void fdb_fini(struct event_base *) {
+	if (ageing_timer != NULL)
+		event_free(ageing_timer);
+
+	rte_hash_free(fdb_hash);
+	rte_mempool_free(fdb_pool);
+}
+
+static struct gr_module module = {
+	.name = "fdb",
+	.depends_on = "rcu",
+	.init = fdb_init,
+	.fini = fdb_fini,
+};
+
+RTE_INIT(init) {
+	gr_register_api_handler(&add_handler);
+	gr_register_api_handler(&del_handler);
+	gr_register_api_handler(&flush_handler);
+	gr_register_api_handler(&list_handler);
+	gr_register_api_handler(&config_get_handler);
+	gr_register_api_handler(&config_set_handler);
+	gr_event_register_serializer(&serializer);
+	gr_register_module(&module);
+}
diff --git a/modules/l2/control/gr_l2_control.h b/modules/l2/control/gr_l2_control.h
index e9a16239d..89284ee87 100644
--- a/modules/l2/control/gr_l2_control.h
+++ b/modules/l2/control/gr_l2_control.h
@@ -14,3 +14,21 @@ GR_IFACE_INFO(GR_IFACE_TYPE_BRIDGE, iface_info_bridge, {
 
 	struct iface *members[GR_BRIDGE_MAX_MEMBERS];
 });
+
+// Lookup a FDB entry from a MAC address and VLAN
+const struct gr_fdb_entry *
+fdb_lookup(uint16_t bridge_id, const struct rte_ether_addr *, uint16_t vlan_id);
+
+// Learn a new FDB entry or refresh its last_seen timestamp.
+void fdb_learn(
+	uint16_t bridge_id,
+	uint16_t iface_id,
+	const struct rte_ether_addr *,
+	uint16_t vlan_id
+);
+
+// Delete all FDB entries referencing the provided interface.
+void fdb_purge_iface(uint16_t iface_id);
+
+// Delete all FDB entries referencing the provided bridge.
+void fdb_purge_bridge(uint16_t bridge_id);
diff --git a/modules/l2/control/meson.build b/modules/l2/control/meson.build
index 28731231b..e98d2892b 100644
--- a/modules/l2/control/meson.build
+++ b/modules/l2/control/meson.build
@@ -3,6 +3,7 @@
 
 src += files(
 	'bridge.c',
+	'fdb.c',
 )
 
 inc += include_directories('.')
From 611446a21018a2b4cea001c814fd53a9c428df3b Mon Sep 17 00:00:00 2001
From: Robin Jarry
Date: Sat, 24 Jan 2026 11:10:51 +0100
Subject: [PATCH 8/9] l2: add bridge datapath nodes

Add bridge_input and bridge_flood datapath nodes.
bridge_input receives packets from member interfaces via GR_IFACE_MODE_BRIDGE. It learns source MAC addresses into the FDB (unless GR_BRIDGE_F_NO_LEARN is set), then looks up the destination. Known unicast destinations are forwarded to the learned output interface. Unknown unicast, broadcast and multicast are sent to bridge_flood. Hairpin packets (destination is the source interface) are dropped. When the destination is the bridge interface itself, packets are sent to iface_input for local processing. bridge_flood replicates each packet to all bridge members except the ingress interface, and to the bridge interface itself. The first output reuses the original mbuf, subsequent ones are cloned. When GR_BRIDGE_F_NO_FLOOD is set, the packet is dropped instead. Signed-off-by: Robin Jarry Reviewed-by: Christophe Fontaine --- docs/graph.svg | 830 +++++++++++++++-------------- modules/l2/datapath/bridge_flood.c | 126 +++++ modules/l2/datapath/bridge_input.c | 150 ++++++ modules/l2/datapath/meson.build | 7 + modules/l2/meson.build | 1 + 5 files changed, 726 insertions(+), 388 deletions(-) create mode 100644 modules/l2/datapath/bridge_flood.c create mode 100644 modules/l2/datapath/bridge_input.c create mode 100644 modules/l2/datapath/meson.build diff --git a/docs/graph.svg b/docs/graph.svg index 9828f7778..59d85c29c 100644 --- a/docs/graph.svg +++ b/docs/graph.svg @@ -4,831 +4,885 @@ - - - + + + bond_output - -bond_output + +bond_output port_output - -port_output + +port_output bond_output->port_output - - + + iface_input - -iface_input + +iface_input xconnect - -xconnect + +xconnect iface_input->xconnect - - + + eth_input - -eth_input + +eth_input iface_input->eth_input - - + + + + + +bridge_input + +bridge_input + + + +iface_input->bridge_input + + iface_output - -iface_output + +iface_output - + iface_output->bond_output - - + + - + iface_output->port_output - - + + + + + +iface_output->bridge_input + + port_tx - -port_tx + +port_tx - + port_output->port_tx - - + + 
port_rx - -port_rx + +port_rx - + port_rx->iface_input - - + + - + xconnect->port_output - - + + lacp_input - -lacp_input + +lacp_input eth_input->lacp_input - - + + snap_input - -snap_input + +snap_input eth_input->snap_input - - + + arp_input - -arp_input + +arp_input eth_input->arp_input - - + + - + ip_input - -ip_input + +ip_input eth_input->ip_input - - + + - + ip6_input - -ip6_input + +ip6_input eth_input->ip6_input - - + + eth_output - -eth_output + +eth_output eth_output->iface_output - - + + l2_redirect - -l2_redirect + +l2_redirect lacp_output - -lacp_output + +lacp_output - + lacp_output->eth_output - - + + - + snap_input->l2_redirect - - + + arp_input_reply - -arp_input_reply + +arp_input_reply - + arp_input->arp_input_reply - - + + arp_input_request - -arp_input_request + +arp_input_request - + arp_input->arp_input_request - - + + arp_output_reply - -arp_output_reply + +arp_output_reply - + arp_output_reply->eth_output - - + + arp_output_request - -arp_output_request + +arp_output_request - + arp_output_request->eth_output - - + + - + +bridge_flood + +bridge_flood + + + +bridge_flood->iface_input + + + + + +bridge_flood->iface_output + + + + + +bridge_input->iface_input + + + + + +bridge_input->iface_output + + + + + +bridge_input->bridge_flood + + + + + ospf_redirect - -ospf_redirect + +ospf_redirect - + ospf_redirect->l2_redirect - - + + - + loopback_input - -loopback_input + +loopback_input - + loopback_input->ip_input - - + + - + loopback_input->ip6_input - - + + - + loopback_output - -loopback_output + +loopback_output - + xvrf - -xvrf + +xvrf - + xvrf->ip_input - - + + - + xvrf->ip6_input - - + + - + ip_forward - -ip_forward + +ip_forward - + ip_output - -ip_output + +ip_output - + ip_forward->ip_output - - + + - + ip_fragment - -ip_fragment + +ip_fragment - + ip_fragment->ip_output - - + + - + ip_hold - -ip_hold + +ip_hold - + ip_input->ip_forward - - + + - + ip_input_local - -ip_input_local + +ip_input_local - + ip_input->ip_input_local - - + + 
- + ip_input->ip_output - - + + - + dnat44_dynamic - -dnat44_dynamic + +dnat44_dynamic - + ip_input->dnat44_dynamic - - + + - + dnat44_static - -dnat44_static + +dnat44_static - + ip_input->dnat44_static - - + + - + ip_loadbalance - -ip_loadbalance + +ip_loadbalance - + ip_loadbalance->ip_output - - + + - + ip_input_local->ospf_redirect - - + + - + ipip_input - -ipip_input + +ipip_input - + ip_input_local->ipip_input - - + + - + icmp_input - -icmp_input + +icmp_input - + ip_input_local->icmp_input - - + + - + l4_input_local - -l4_input_local + +l4_input_local - + ip_input_local->l4_input_local - - + + - + ip_output->eth_output - - + + - + ip_output->xvrf - - + + - + ip_output->ip_fragment - - + + - + ip_output->ip_hold - - + + - + ip_output->ip_loadbalance - - + + - + ipip_output - -ipip_output + +ipip_output - + ip_output->ipip_output - - + + - + sr6_output - -sr6_output + +sr6_output - + ip_output->sr6_output - - + + - + ip6_forward - -ip6_forward + +ip6_forward - + ip6_output - -ip6_output + +ip6_output - + ip6_forward->ip6_output - - + + - + ip6_hold - -ip6_hold + +ip6_hold - + ip6_input->ip6_forward - - + + - + ip6_input_local - -ip6_input_local + +ip6_input_local - + ip6_input->ip6_input_local - - + + - + ip6_input->ip6_output - - + + - + sr6_local - -sr6_local + +sr6_local - + ip6_input->sr6_local - - + + - + ip6_loadbalance - -ip6_loadbalance + +ip6_loadbalance - + ip6_loadbalance->ip6_output - - + + - + ip6_input_local->ospf_redirect - - + + - + icmp6_input - -icmp6_input + +icmp6_input - + ip6_input_local->icmp6_input - - + + - + ip6_input_local->l4_input_local - - + + - + ip6_output->eth_output - - + + - + ip6_output->xvrf - - + + - + ip6_output->ip6_hold - - + + - + ip6_output->ip6_loadbalance - - + + - + ip6_output->sr6_output - - + + - + ipip_input->ip_input - - + + - + ipip_output->ip_output - - + + - + dnat44_dynamic->ip_forward - - + + - + dnat44_dynamic->ip_input_local - - + + - + dnat44_static->ip_forward - - + + - + dnat44_static->ip_input_local 
- - + + - + sr6_local->ip_input - - + + - + sr6_local->ip6_input - - + + - + sr6_local->ip6_input_local - - + + - + sr6_output->ip6_output - - + + - + icmp_output - -icmp_output + +icmp_output - + icmp_input->icmp_output - - + + - + icmp_local_send - -icmp_local_send + +icmp_local_send - + icmp_local_send->icmp_output - - + + - + icmp_output->ip_output - - + + - + icmp6_output - -icmp6_output + +icmp6_output - + icmp6_input->icmp6_output - - + + - + ndp_na_input - -ndp_na_input + +ndp_na_input - + icmp6_input->ndp_na_input - - + + - + ndp_ns_input - -ndp_ns_input + +ndp_ns_input - + icmp6_input->ndp_ns_input - - + + - + ndp_rs_input - -ndp_rs_input + +ndp_rs_input - + icmp6_input->ndp_rs_input - - + + - + icmp6_local_send - -icmp6_local_send + +icmp6_local_send - + icmp6_local_send->icmp6_output - - + + - + icmp6_output->ip6_output - - + + - + ndp_na_output - -ndp_na_output + +ndp_na_output - + ndp_na_output->icmp6_output - - + + - + ndp_ns_output - -ndp_ns_output + +ndp_ns_output - + ndp_ns_output->icmp6_output - - + + - + l4_loopback_output - -l4_loopback_output + +l4_loopback_output - + l4_input_local->l4_loopback_output - - + + - + dhcp_input - -dhcp_input + +dhcp_input - + l4_input_local->dhcp_input - - + + - + l4_loopback_output->loopback_output - - + +
diff --git a/modules/l2/datapath/bridge_flood.c b/modules/l2/datapath/bridge_flood.c
new file mode 100644
index 000000000..50ae8370b
--- /dev/null
+++ b/modules/l2/datapath/bridge_flood.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: BSD-3-Clause
+// Copyright (c) 2026 Robin Jarry
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+enum edges {
+	OUTPUT = 0,
+	INPUT,
+	DROP,
+	EDGE_COUNT
+};
+
+// Return the mbuf to send to output_iface: the original packet for the first
+// destination (clone_count == 0), a copy of it for every following one.
+// Returns NULL when the copy cannot be allocated.
+static inline struct rte_mbuf *
+clone_packet(struct rte_mbuf *m, uint16_t clone_count, const struct iface *output_iface) {
+	struct rte_mbuf *pkt = m;
+
+	if (clone_count > 0) {
+		pkt = gr_mbuf_copy(m, UINT32_MAX, sizeof(struct mbuf_data));
+		if (pkt == NULL) {
+			// TODO: add xstat
+			return NULL;
+		}
+	}
+
+	mbuf_data(pkt)->iface = output_iface;
+
+	return pkt;
+}
+
+// Replicate one packet to every member of its bridge that is up, except the
+// interface it came from, then to the bridge interface itself.
+// Returns the number of mbufs enqueued; zero means the caller still owns m.
+static inline uint16_t
+flood_packet(struct rte_graph *graph, struct rte_node *node, struct rte_mbuf *m) {
+	const struct iface *src = mbuf_data(m)->iface;
+	const struct iface_info_bridge *info;
+	const struct iface *br, *port;
+	struct rte_mbuf *pkt;
+	uint16_t copies = 0;
+
+	assert(src != NULL);
+
+	br = iface_from_id(src->domain_id);
+	if (br == NULL || br->type != GR_IFACE_TYPE_BRIDGE)
+		return 0;
+
+	info = iface_info_bridge(br);
+
+	for (uint16_t j = 0; j < info->n_members; j++) {
+		port = info->members[j];
+
+		// Never flood back to source, skip down interfaces
+		if (port == src || !(port->flags & GR_IFACE_F_UP))
+			continue;
+
+		pkt = clone_packet(m, copies, port);
+		if (pkt != NULL) {
+			rte_node_enqueue_x1(graph, node, OUTPUT, pkt);
+			copies++;
+		}
+	}
+
+	if (src != br && (br->flags & GR_IFACE_F_UP)) {
+		// also flood to bridge interface
+		pkt = clone_packet(m, copies, br);
+		if (pkt != NULL) {
+			rte_node_enqueue_x1(graph, node, INPUT, pkt);
+			copies++;
+		}
+	}
+
+	return copies;
+}
+
+// Node entry point. Packets that could not be flooded anywhere go to the drop
+// edge. The return value is the total number of mbufs sent to OUTPUT/INPUT.
+static uint16_t bridge_flood_process(
+	struct rte_graph *graph,
+	struct rte_node *node,
+	void **objs,
+	uint16_t nb_objs
+) {
+	uint16_t total = 0;
+
+	for (uint16_t i = 0; i < nb_objs; i++) {
+		struct rte_mbuf *m = objs[i];
+		uint16_t copies;
+
+		if (gr_mbuf_is_traced(m))
+			gr_mbuf_trace_add(m, node, 0);
+
+		copies = flood_packet(graph, node, m);
+		if (copies == 0) {
+			// If no flooding occurred, drop the original packet
+			rte_node_enqueue_x1(graph, node, DROP, m);
+		}
+		total += copies;
+	}
+
+	return total;
+}
+
+static struct rte_node_register node = {
+	.name = "bridge_flood",
+	.process = bridge_flood_process,
+	.nb_edges = EDGE_COUNT,
+	.next_nodes = {
+		[OUTPUT] = "iface_output",
+		[INPUT] = "iface_input",
+		[DROP] = "bridge_flood_drop",
+	},
+};
+
+static struct gr_node_info info = {
+	.node = &node,
+	.type = GR_NODE_T_L2,
+};
+
+GR_NODE_REGISTER(info);
+
+GR_DROP_REGISTER(bridge_flood_drop);
diff --git a/modules/l2/datapath/bridge_input.c b/modules/l2/datapath/bridge_input.c
new file mode 100644
index 000000000..a54f7116c
--- /dev/null
+++ b/modules/l2/datapath/bridge_input.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: BSD-3-Clause
+// Copyright (c) 2026 Robin Jarry
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+enum edges {
+	OUTPUT = 0,
+	INPUT,
+	FLOOD,
+	BRIDGE_INVAL,
+	HAIRPIN,
+	OUT_IFACE_INVAL,
+	FLOOD_DISABLED,
+	EDGE_COUNT
+};
+
+// Recorded for traced packets: where the frame entered and in which bridge.
+struct bridge_input_trace {
+	uint16_t iface_id;
+	uint16_t bridge_id;
+};
+
+// Node entry point: learn the source address, then pick the next node from
+// the destination address.
+//
+// - unknown unicast, broadcast and multicast go to bridge_flood, or are
+//   dropped when the bridge has GR_BRIDGE_F_NO_FLOOD set,
+// - known unicast goes to the learned interface: iface_input when that
+//   interface is a bridge, iface_output otherwise,
+// - frames whose destination was learned on the receiving interface, or whose
+//   bridge or output interface cannot be resolved, are dropped.
+static uint16_t bridge_input_process(
+	struct rte_graph *graph,
+	struct rte_node *node,
+	void **objs,
+	uint16_t nb_objs
+) {
+	const struct iface *bridge, *iface;
+	const struct iface_info_bridge *br;
+	const struct gr_fdb_entry *fdb;
+	struct iface_mbuf_data *d;
+	struct rte_ether_hdr *eth;
+	struct rte_mbuf *m;
+	rte_edge_t edge;
+
+	for (uint16_t i = 0; i < nb_objs; i++) {
+		m = objs[i];
+		d = iface_mbuf_data(m);
+		eth = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
+		fdb = NULL;
+
+		if (gr_mbuf_is_traced(m)) {
+			struct bridge_input_trace *t = gr_mbuf_trace_add(m, node, sizeof(*t));
+			t->iface_id = d->iface->id;
+			t->bridge_id = d->iface->domain_id;
+		}
+
+		bridge = iface_from_id(d->iface->domain_id);
+		if (bridge == NULL || bridge->type != GR_IFACE_TYPE_BRIDGE) {
+			edge = BRIDGE_INVAL;
+			goto next;
+		}
+		br = iface_info_bridge(bridge);
+
+		if (rte_is_unicast_ether_addr(&eth->src_addr)
+		    && !(br->flags & GR_BRIDGE_F_NO_LEARN))
+			fdb_learn(bridge->id, d->iface->id, &eth->src_addr, d->vlan_id);
+
+		if (rte_is_unicast_ether_addr(&eth->dst_addr)) {
+			fdb = fdb_lookup(bridge->id, &eth->dst_addr, d->vlan_id);
+			if (fdb == NULL) {
+				// Unknown unicast
+				edge = FLOOD;
+				goto next;
+			}
+			if (fdb->iface_id == d->iface->id) {
+				// Don't forward back to source interface
+				edge = HAIRPIN;
+				goto next;
+			}
+			iface = iface_from_id(fdb->iface_id);
+			if (iface == NULL) {
+				edge = OUT_IFACE_INVAL;
+				goto next;
+			}
+			// Direct output to learned interface
+			d->iface = iface;
+			if (iface->type == GR_IFACE_TYPE_BRIDGE) {
+				edge = INPUT;
+			} else {
+				edge = OUTPUT;
+			}
+		} else {
+			// Broadcast, multicast
+			edge = FLOOD;
+		}
+next:
+		// br is only read when edge is FLOOD, which is never selected
+		// before br has been assigned.
+		if (edge == FLOOD && (br->flags & GR_BRIDGE_F_NO_FLOOD))
+			edge = FLOOD_DISABLED;
+
+		rte_node_enqueue_x1(graph, node, edge, m);
+	}
+
+	return nb_objs;
+}
+
+// Format a bridge_input_trace record. Interfaces that no longer exist are
+// reported as "[deleted]".
+static int bridge_input_trace_format(char *buf, size_t len, const void *data, size_t /*data_len*/) {
+	const struct bridge_input_trace *t = data;
+	const struct iface *iface = iface_from_id(t->iface_id);
+	const struct iface *bridge = iface_from_id(t->bridge_id);
+	return snprintf(
+		buf,
+		len,
+		"iface=%s bridge=%s",
+		iface ? iface->name : "[deleted]",
+		bridge ? bridge->name : "[deleted]"
+	);
+}
+
+// Plug bridge_input behind iface_input for interfaces in bridge mode and
+// behind iface_output for packets sent through a bridge interface.
+static void bridge_input_register(void) {
+	iface_input_mode_register(GR_IFACE_MODE_BRIDGE, "bridge_input");
+	iface_output_type_register(GR_IFACE_TYPE_BRIDGE, "bridge_input");
+}
+
+static struct rte_node_register node = {
+	.name = "bridge_input",
+	.process = bridge_input_process,
+	.nb_edges = EDGE_COUNT,
+	.next_nodes = {
+		[OUTPUT] = "iface_output",
+		[INPUT] = "iface_input",
+		[FLOOD] = "bridge_flood",
+		[BRIDGE_INVAL] = "bridge_input_invalid_domain",
+		[HAIRPIN] = "bridge_input_hairpin",
+		[OUT_IFACE_INVAL] = "bridge_input_invalid_output",
+		[FLOOD_DISABLED] = "bridge_input_flood_disabled",
+	},
+};
+
+static struct gr_node_info info = {
+	.node = &node,
+	.type = GR_NODE_T_L2,
+	.register_callback = bridge_input_register,
+	.trace_format = bridge_input_trace_format,
+};
+
+GR_NODE_REGISTER(info);
+
+GR_DROP_REGISTER(bridge_input_invalid_domain);
+GR_DROP_REGISTER(bridge_input_hairpin);
+GR_DROP_REGISTER(bridge_input_invalid_output);
+GR_DROP_REGISTER(bridge_input_flood_disabled);
diff --git a/modules/l2/datapath/meson.build b/modules/l2/datapath/meson.build
new file mode 100644
index 000000000..d61132060
--- /dev/null
+++
b/modules/l2/datapath/meson.build
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright (c) 2026 Robin Jarry
+
+src += files(
+	'bridge_flood.c',
+	'bridge_input.c',
+)
diff --git a/modules/l2/meson.build b/modules/l2/meson.build
index a696792d8..53fa9d7b8 100644
--- a/modules/l2/meson.build
+++ b/modules/l2/meson.build
@@ -4,3 +4,4 @@
 subdir('api')
 subdir('cli')
 subdir('control')
+subdir('datapath')
From 16d3df8c90cd2e60c0a5fa90f005c5110840ae31 Mon Sep 17 00:00:00 2001
From: Robin Jarry
Date: Sat, 24 Jan 2026 11:28:03 +0100
Subject: [PATCH 9/9] smoke: add bridge test

Create a bridge with three member ports and verify L2 forwarding between
namespaces, L3 reachability to the bridge interface address, and
overwriting a dynamic FDB entry with a static one. Also check that
detaching a member and deleting the bridge properly clean up FDB entries.

Signed-off-by: Robin Jarry
Reviewed-by: Christophe Fontaine
---
 smoke/bridge_test.sh | 54 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100755 smoke/bridge_test.sh

diff --git a/smoke/bridge_test.sh b/smoke/bridge_test.sh
new file mode 100755
index 000000000..31089fdab
--- /dev/null
+++ b/smoke/bridge_test.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright (c) 2026 Robin Jarry
+
+. $(dirname $0)/_init.sh
+
+# one bridge with three member ports
+grcli interface add bridge br0
+
+port_add p0 domain br0
+port_add p1 domain br0
+port_add p2 domain br0
+
+grcli interface show name br0
+
+# address of the bridge interface itself, used for the L3 checks below
+grcli address add 172.16.0.1/24 iface br0
+
+# one namespace per port: n<N> owns x-p<N> with address 172.16.0.1<N>
+for n in 0 1 2; do
+	p=x-p$n
+	ns=n$n
+	netns_add $ns
+	move_to_netns $p $ns
+	ip -n $ns addr add 172.16.0.1$n/24 dev $p
+	ip -n $ns route add default via 172.16.0.1
+done
+
+# L2 forwarding between namespaces
+ip netns exec n0 ping -i0.01 -c3 -W1 -n 172.16.0.11 || fail "L2 ping n0->n1 failed"
+ip netns exec n1 ping -i0.01 -c3 -W1 -n 172.16.0.12 || fail "L2 ping n1->n2 failed"
+ip netns exec n2 ping -i0.01 -c3 -W1 -n 172.16.0.10 || fail "L2 ping n2->n0 failed"
+
+# the MAC address of each namespace must have been learned on its port
+for n in 0 1 2; do
+	mac=$(ip netns exec n$n cat /sys/class/net/x-p$n/address)
+	grcli fdb show iface p$n learn | grep -F "$mac"
+done
+
+# overwrite dynamic learned fdb entry with static one
+mac=$(ip netns exec n0 cat /sys/class/net/x-p0/address)
+grcli fdb add "$mac" iface p0
+grcli fdb show iface p0 static | grep -F "$mac"
+
+grcli ping 172.16.0.10 count 3 delay 10
+
+# L3 reachability of the bridge interface address
+ip netns exec n0 ping -i0.01 -c3 -W1 -n 172.16.0.1 || fail "L3 ping n0->bridge failed"
+ip netns exec n1 ping -i0.01 -c3 -W1 -n 172.16.0.1 || fail "L3 ping n1->bridge failed"
+ip netns exec n2 ping -i0.01 -c3 -W1 -n 172.16.0.1 || fail "L3 ping n2->bridge failed"
+
+# detaching a member must remove its fdb entries
+grcli interface set port p1 vrf main
+if grcli fdb show iface p1 | grep .; then
+	fail "fdb still contains entries for removed interface"
+fi
+
+# deleting the bridge must leave the fdb empty
+grcli interface del br0
+if grcli fdb show | grep .; then
+	fail "fdb still contains entries"
+fi