When IP tunnel encapsulation rules are offloaded, the kernel can't see
the traffic of the offloaded flow. The neighbour for the IP tunnel
destination of the offloaded flow can mistakenly become STALE and
deleted by the kernel since its 'used' value wasn't changed.
To make sure that a neighbour which is used by the HW won't become
STALE, we proactively update the neighbour 'used' value every
DELAY_PROBE_TIME period, when packets were matched and counted by the HW
for one of the tunnel encap flows related to this neighbour.
The periodic task that updates the used neighbours is scheduled when a
tunnel encap rule is successfully offloaded into HW and keeps re-scheduling
itself as long as the representor's neighbours list isn't empty.
Add, remove, lookup and status change operations done over the
representor's neighbours list or the neighbour hash entry encaps list
are all serialized by RTNL lock.
Signed-off-by: Hadar Hen Zion <[email protected]>
Reviewed-by: Or Gerlitz <[email protected]>
Signed-off-by: Saeed Mahameed <[email protected]>
#include "en.h"
#include "en_rep.h"
#include "en_tc.h"
+#include "fs_core.h"
static const char mlx5e_rep_driver_name[] = "mlx5e_rep";
mlx5_eswitch_sqs2vport_stop(esw, rep);
}
+static void mlx5e_rep_neigh_update_init_interval(struct mlx5e_rep_priv *rpriv)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ unsigned long ipv6_interval = NEIGH_VAR(&ipv6_stub->nd_tbl->parms,
+ DELAY_PROBE_TIME);
+#else
+ unsigned long ipv6_interval = ~0UL;
+#endif
+ unsigned long ipv4_interval = NEIGH_VAR(&arp_tbl.parms,
+ DELAY_PROBE_TIME);
+ struct net_device *netdev = rpriv->rep->netdev;
+ struct mlx5e_priv *priv = netdev_priv(netdev);
+
+ rpriv->neigh_update.min_interval = min_t(unsigned long, ipv6_interval, ipv4_interval);
+ mlx5_fc_update_sampling_interval(priv->mdev, rpriv->neigh_update.min_interval);
+}
+
+void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv)
+{
+ struct mlx5e_rep_priv *rpriv = priv->ppriv;
+ struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+
+ mlx5_fc_queue_stats_work(priv->mdev,
+ &neigh_update->neigh_stats_work,
+ neigh_update->min_interval);
+}
+
+static void mlx5e_rep_neigh_stats_work(struct work_struct *work)
+{
+ struct mlx5e_rep_priv *rpriv = container_of(work, struct mlx5e_rep_priv,
+ neigh_update.neigh_stats_work.work);
+ struct net_device *netdev = rpriv->rep->netdev;
+ struct mlx5e_priv *priv = netdev_priv(netdev);
+ struct mlx5e_neigh_hash_entry *nhe;
+
+ rtnl_lock();
+ if (!list_empty(&rpriv->neigh_update.neigh_list))
+ mlx5e_rep_queue_neigh_stats_work(priv);
+
+ list_for_each_entry(nhe, &rpriv->neigh_update.neigh_list, neigh_list)
+ mlx5e_tc_update_neigh_used_value(nhe);
+
+ rtnl_unlock();
+}
+
static void mlx5e_rep_neigh_entry_hold(struct mlx5e_neigh_hash_entry *nhe)
{
refcount_inc(&nhe->refcnt);
return NOTIFY_DONE;
m_neigh.dev = n->dev;
+ m_neigh.family = n->ops->family;
memcpy(&m_neigh.dst_ip, n->primary_key, n->tbl->key_len);
/* We are in atomic context and can't take RTNL mutex, so use
INIT_LIST_HEAD(&neigh_update->neigh_list);
spin_lock_init(&neigh_update->encap_lock);
+ INIT_DELAYED_WORK(&neigh_update->neigh_stats_work,
+ mlx5e_rep_neigh_stats_work);
+ mlx5e_rep_neigh_update_init_interval(rpriv);
rpriv->neigh_update.netevent_nb.notifier_call = mlx5e_rep_netevent_event;
err = register_netevent_notifier(&rpriv->neigh_update.netevent_nb);
flush_workqueue(priv->wq); /* flush neigh update works */
+ cancel_delayed_work_sync(&rpriv->neigh_update.neigh_stats_work);
+
rhashtable_destroy(&neigh_update->neigh_ht);
}
/* protect lookup/remove operations */
spinlock_t encap_lock;
struct notifier_block netevent_nb;
+ struct delayed_work neigh_stats_work;
+ unsigned long min_interval; /* jiffies */
};
struct mlx5e_rep_priv {
__be32 v4;
struct in6_addr v6;
} dst_ip;
+ int family;
};
struct mlx5e_neigh_hash_entry {
* it's used by the neigh notification call.
*/
refcount_t refcnt;
+
+ /* Save the last reported time offloaded trafic pass over one of the
+ * neigh hash entry flows. Use it to periodically update the neigh
+ * 'used' value and avoid neigh deleting by the kernel.
+ */
+ unsigned long reported_lastuse;
};
enum {
void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv,
struct mlx5e_encap_entry *e);
+void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv);
+
#endif /* __MLX5E_REP_H__ */
#include <net/tc_act/tc_tunnel_key.h>
#include <net/tc_act/tc_pedit.h>
#include <net/vxlan.h>
+#include <net/arp.h>
#include "en.h"
#include "en_rep.h"
#include "en_tc.h"
return;
}
e->flags |= MLX5_ENCAP_ENTRY_VALID;
+ mlx5e_rep_queue_neigh_stats_work(priv);
list_for_each_entry(flow, &e->flows, encap) {
flow->esw_attr->encap_id = e->encap_id;
}
}
+void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
+{
+ struct mlx5e_neigh *m_neigh = &nhe->m_neigh;
+ u64 bytes, packets, lastuse = 0;
+ struct mlx5e_tc_flow *flow;
+ struct mlx5e_encap_entry *e;
+ struct mlx5_fc *counter;
+ struct neigh_table *tbl;
+ bool neigh_used = false;
+ struct neighbour *n;
+
+ if (m_neigh->family == AF_INET)
+ tbl = &arp_tbl;
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (m_neigh->family == AF_INET6)
+ tbl = ipv6_stub->nd_tbl;
+#endif
+ else
+ return;
+
+ list_for_each_entry(e, &nhe->encap_list, encap_list) {
+ if (!(e->flags & MLX5_ENCAP_ENTRY_VALID))
+ continue;
+ list_for_each_entry(flow, &e->flows, encap) {
+ if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
+ counter = mlx5_flow_rule_counter(flow->rule);
+ mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse);
+ if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) {
+ neigh_used = true;
+ break;
+ }
+ }
+ }
+ }
+
+ if (neigh_used) {
+ nhe->reported_lastuse = jiffies;
+
+ /* find the relevant neigh according to the cached device and
+ * dst ip pair
+ */
+ n = neigh_lookup(tbl, &m_neigh->dst_ip, m_neigh->dev);
+ if (!n) {
+ WARN(1, "The neighbour already freed\n");
+ return;
+ }
+
+ neigh_event_send(n, NULL);
+ neigh_release(n);
+ }
+}
+
static void mlx5e_detach_encap(struct mlx5e_priv *priv,
struct mlx5e_tc_flow *flow)
{
* entry in the neigh hash table when a user deletes a rule
*/
e->m_neigh.dev = n->dev;
+ e->m_neigh.family = n->ops->family;
memcpy(&e->m_neigh.dst_ip, n->primary_key, n->tbl->key_len);
e->out_dev = out_dev;
goto destroy_neigh_entry;
e->flags |= MLX5_ENCAP_ENTRY_VALID;
+ mlx5e_rep_queue_neigh_stats_work(netdev_priv(out_dev));
neigh_release(n);
return err;
* entry in the neigh hash table when a user deletes a rule
*/
e->m_neigh.dev = n->dev;
+ e->m_neigh.family = n->ops->family;
memcpy(&e->m_neigh.dst_ip, n->primary_key, n->tbl->key_len);
e->out_dev = out_dev;
goto destroy_neigh_entry;
e->flags |= MLX5_ENCAP_ENTRY_VALID;
+ mlx5e_rep_queue_neigh_stats_work(netdev_priv(out_dev));
neigh_release(n);
return err;
void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
struct mlx5e_encap_entry *e);
+struct mlx5e_neigh_hash_entry;
+void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe);
+
static inline int mlx5e_tc_num_filters(struct mlx5e_priv *priv)
{
return atomic_read(&priv->fs.tc.ht.nelems);
int mlx5_init_fc_stats(struct mlx5_core_dev *dev);
void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev);
+void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev,
+ struct delayed_work *dwork,
+ unsigned long delay);
+void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev,
+ unsigned long interval);
int mlx5_init_fs(struct mlx5_core_dev *dev);
void mlx5_cleanup_fs(struct mlx5_core_dev *dev);
list_splice_tail_init(&fc_stats->addlist, &tmplist);
if (!list_empty(&tmplist) || !RB_EMPTY_ROOT(&fc_stats->counters))
- queue_delayed_work(fc_stats->wq, &fc_stats->work, MLX5_FC_STATS_PERIOD);
+ queue_delayed_work(fc_stats->wq, &fc_stats->work,
+ fc_stats->sampling_interval);
spin_unlock(&fc_stats->addlist_lock);
node = mlx5_fc_stats_query(dev, counter, last->id);
}
- fc_stats->next_query = now + MLX5_FC_STATS_PERIOD;
+ fc_stats->next_query = now + fc_stats->sampling_interval;
}
struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging)
if (!fc_stats->wq)
return -ENOMEM;
+ fc_stats->sampling_interval = MLX5_FC_STATS_PERIOD;
INIT_DELAYED_WORK(&fc_stats->work, mlx5_fc_stats_work);
return 0;
counter->lastbytes = c.bytes;
counter->lastpackets = c.packets;
}
+
+void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev,
+ struct delayed_work *dwork,
+ unsigned long delay)
+{
+ struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+
+ queue_delayed_work(fc_stats->wq, dwork, delay);
+}
+
+void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev,
+ unsigned long interval)
+{
+ struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+
+ fc_stats->sampling_interval = min_t(unsigned long, interval,
+ fc_stats->sampling_interval);
+}
struct workqueue_struct *wq;
struct delayed_work work;
unsigned long next_query;
+ unsigned long sampling_interval; /* jiffies */
};
struct mlx5_eswitch;