You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
169 lines
5.5 KiB
169 lines
5.5 KiB
From: Pablo Neira Ayuso <pablo@netfilter.org>
|
|
Date: Sun, 7 Jan 2018 01:03:56 +0100
|
|
Subject: [PATCH] netfilter: nf_conntrack: add IPS_OFFLOAD status bit
|
|
|
|
This new bit tells us that the conntrack entry is owned by the flow
|
|
table offload infrastructure.
|
|
|
|
# cat /proc/net/nf_conntrack
|
|
ipv4 2 tcp 6 src=10.141.10.2 dst=147.75.205.195 sport=36392 dport=443 src=147.75.205.195 dst=192.168.2.195 sport=443 dport=36392 [OFFLOAD] mark=0 zone=0 use=2
|
|
|
|
Note the [OFFLOAD] tag in the listing.
|
|
|
|
The timer of such conntrack entries appears to be stopped from userspace.
|
|
In practice, to make sure the conntrack entry does not go away, the
|
|
conntrack timer is periodically set to an arbitrary large value that
|
|
gets refreshed on every iteration from the garbage collector, so it
|
|
never expires, and they display no internal state in the case of TCP
|
|
flows. This allows us to save a bit check from the packet path via
|
|
nf_ct_is_expired().
|
|
|
|
Conntrack entries that have been offloaded to the flow table
|
|
infrastructure cannot be deleted/flushed via ctnetlink. The flow table
|
|
infrastructure is also responsible for releasing this conntrack entry.
|
|
|
|
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
|
|
---
|
|
|
|
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
|
|
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
|
|
@@ -101,12 +101,16 @@ enum ip_conntrack_status {
|
|
IPS_HELPER_BIT = 13,
|
|
IPS_HELPER = (1 << IPS_HELPER_BIT),
|
|
|
|
+ /* Conntrack has been offloaded to flow table. */
|
|
+ IPS_OFFLOAD_BIT = 14,
|
|
+ IPS_OFFLOAD = (1 << IPS_OFFLOAD_BIT),
|
|
+
|
|
/* Be careful here, modifying these bits can make things messy,
|
|
* so don't let users modify them directly.
|
|
*/
|
|
IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK |
|
|
IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING |
|
|
- IPS_SEQ_ADJUST | IPS_TEMPLATE),
|
|
+ IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_OFFLOAD),
|
|
|
|
__IPS_MAX_BIT = 14,
|
|
};
|
|
--- a/net/netfilter/nf_conntrack_core.c
|
|
+++ b/net/netfilter/nf_conntrack_core.c
|
|
@@ -901,6 +901,9 @@ static unsigned int early_drop_list(stru
|
|
hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
|
|
tmp = nf_ct_tuplehash_to_ctrack(h);
|
|
|
|
+ if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
|
|
+ continue;
|
|
+
|
|
if (nf_ct_is_expired(tmp)) {
|
|
nf_ct_gc_expired(tmp);
|
|
continue;
|
|
@@ -975,6 +978,18 @@ static bool gc_worker_can_early_drop(con
|
|
return false;
|
|
}
|
|
|
|
+#define DAY (86400 * HZ)
|
|
+
|
|
+/* Set an arbitrary timeout large enough not to ever expire, this saves
|
|
+ * us a check for the IPS_OFFLOAD_BIT from the packet path via
|
|
+ * nf_ct_is_expired().
|
|
+ */
|
|
+static void nf_ct_offload_timeout(struct nf_conn *ct)
|
|
+{
|
|
+ if (nf_ct_expires(ct) < DAY / 2)
|
|
+ ct->timeout = nfct_time_stamp + DAY;
|
|
+}
|
|
+
|
|
static void gc_worker(struct work_struct *work)
|
|
{
|
|
unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
|
|
@@ -1011,6 +1026,11 @@ static void gc_worker(struct work_struct
|
|
tmp = nf_ct_tuplehash_to_ctrack(h);
|
|
|
|
scanned++;
|
|
+ if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
|
|
+ nf_ct_offload_timeout(tmp);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
if (nf_ct_is_expired(tmp)) {
|
|
nf_ct_gc_expired(tmp);
|
|
expired_count++;
|
|
--- a/net/netfilter/nf_conntrack_netlink.c
|
|
+++ b/net/netfilter/nf_conntrack_netlink.c
|
|
@@ -1120,6 +1120,14 @@ static const struct nla_policy ct_nla_po
|
|
.len = NF_CT_LABELS_MAX_SIZE },
|
|
};
|
|
|
|
+static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
|
|
+{
|
|
+ if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
|
|
+ return 0;
|
|
+
|
|
+ return ctnetlink_filter_match(ct, data);
|
|
+}
|
|
+
|
|
static int ctnetlink_flush_conntrack(struct net *net,
|
|
const struct nlattr * const cda[],
|
|
u32 portid, int report)
|
|
@@ -1132,7 +1140,7 @@ static int ctnetlink_flush_conntrack(str
|
|
return PTR_ERR(filter);
|
|
}
|
|
|
|
- nf_ct_iterate_cleanup_net(net, ctnetlink_filter_match, filter,
|
|
+ nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter,
|
|
portid, report);
|
|
kfree(filter);
|
|
|
|
@@ -1178,6 +1186,11 @@ static int ctnetlink_del_conntrack(struc
|
|
|
|
ct = nf_ct_tuplehash_to_ctrack(h);
|
|
|
|
+ if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
|
|
+ nf_ct_put(ct);
|
|
+ return -EBUSY;
|
|
+ }
|
|
+
|
|
if (cda[CTA_ID]) {
|
|
u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID]));
|
|
if (id != (u32)(unsigned long)ct) {
|
|
--- a/net/netfilter/nf_conntrack_proto_tcp.c
|
|
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
|
|
@@ -305,6 +305,9 @@ static bool tcp_invert_tuple(struct nf_c
|
|
/* Print out the private part of the conntrack. */
|
|
static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
|
|
{
|
|
+ if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
|
|
+ return;
|
|
+
|
|
seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]);
|
|
}
|
|
#endif
|
|
--- a/net/netfilter/nf_conntrack_standalone.c
|
|
+++ b/net/netfilter/nf_conntrack_standalone.c
|
|
@@ -309,10 +309,12 @@ static int ct_seq_show(struct seq_file *
|
|
WARN_ON(!l4proto);
|
|
|
|
ret = -ENOSPC;
|
|
- seq_printf(s, "%-8s %u %-8s %u %ld ",
|
|
+ seq_printf(s, "%-8s %u %-8s %u ",
|
|
l3proto_name(l3proto->l3proto), nf_ct_l3num(ct),
|
|
- l4proto_name(l4proto->l4proto), nf_ct_protonum(ct),
|
|
- nf_ct_expires(ct) / HZ);
|
|
+ l4proto_name(l4proto->l4proto), nf_ct_protonum(ct));
|
|
+
|
|
+ if (!test_bit(IPS_OFFLOAD_BIT, &ct->status))
|
|
+ seq_printf(s, "%ld ", nf_ct_expires(ct) / HZ);
|
|
|
|
if (l4proto->print_conntrack)
|
|
l4proto->print_conntrack(s, ct);
|
|
@@ -339,7 +341,9 @@ static int ct_seq_show(struct seq_file *
|
|
if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
|
|
goto release;
|
|
|
|
- if (test_bit(IPS_ASSURED_BIT, &ct->status))
|
|
+ if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
|
|
+ seq_puts(s, "[OFFLOAD] ");
|
|
+ else if (test_bit(IPS_ASSURED_BIT, &ct->status))
|
|
seq_puts(s, "[ASSURED] ");
|
|
|
|
if (seq_has_overflowed(s))
|
|
|