Skip to content

Commit

Permalink
tcp/dccp: do not touch listener sk_refcnt under synflood
Browse files Browse the repository at this point in the history
When a SYNFLOOD targets a non-SO_REUSEPORT listener, multiple
CPUs contend on updates to sk->sk_refcnt and sk->sk_wmem_alloc.

By letting listeners use the SOCK_RCU_FREE infrastructure,
we can relax TCP_LISTEN lookup rules and avoid touching sk_refcnt.

Note that we still use SLAB_DESTROY_BY_RCU rules for other sockets;
only listeners are impacted by this change.

Peak performance under SYNFLOOD is increased by ~33%:

On my test machine, I could process 3.2 Mpps instead of 2.4 Mpps.

The most expensive functions are now skb_set_owner_w() and sock_wfree(),
which contend on sk->sk_wmem_alloc when cooking SYNACKs and freeing them.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
Eric Dumazet authored and davem330 committed Apr 5, 2016
1 parent 3a5d1c0 commit 3b24d85
Show file tree
Hide file tree
Showing 10 changed files with 134 additions and 163 deletions.
12 changes: 8 additions & 4 deletions include/net/inet6_hashtables.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,15 @@ static inline struct sock *__inet6_lookup(struct net *net,
const __be16 sport,
const struct in6_addr *daddr,
const u16 hnum,
const int dif)
const int dif,
bool *refcounted)
{
struct sock *sk = __inet6_lookup_established(net, hashinfo, saddr,
sport, daddr, hnum, dif);
*refcounted = true;
if (sk)
return sk;

*refcounted = false;
return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport,
daddr, hnum, dif);
}
Expand All @@ -81,17 +83,19 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const __be16 sport,
const __be16 dport,
int iif)
int iif,
bool *refcounted)
{
struct sock *sk = skb_steal_sock(skb);

*refcounted = true;
if (sk)
return sk;

return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb,
doff, &ipv6_hdr(skb)->saddr, sport,
&ipv6_hdr(skb)->daddr, ntohs(dport),
iif);
iif, refcounted);
}

struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
Expand Down
40 changes: 24 additions & 16 deletions include/net/inet_hashtables.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,14 +100,10 @@ struct inet_bind_hashbucket {

/*
* Sockets can be hashed in established or listening table
* We must use different 'nulls' end-of-chain value for listening
* hash table, or we might find a socket that was closed and
* reallocated/inserted into established hash table
*/
#define LISTENING_NULLS_BASE (1U << 29)
struct inet_listen_hashbucket {
spinlock_t lock;
struct hlist_nulls_head head;
struct hlist_head head;
};

/* This is for listening sockets, thus all sockets which possess wildcards. */
Expand Down Expand Up @@ -304,14 +300,20 @@ static inline struct sock *__inet_lookup(struct net *net,
struct sk_buff *skb, int doff,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const __be16 dport,
const int dif)
const int dif,
bool *refcounted)
{
u16 hnum = ntohs(dport);
struct sock *sk = __inet_lookup_established(net, hashinfo,
saddr, sport, daddr, hnum, dif);
struct sock *sk;

return sk ? : __inet_lookup_listener(net, hashinfo, skb, doff, saddr,
sport, daddr, hnum, dif);
sk = __inet_lookup_established(net, hashinfo, saddr, sport,
daddr, hnum, dif);
*refcounted = true;
if (sk)
return sk;
*refcounted = false;
return __inet_lookup_listener(net, hashinfo, skb, doff, saddr,
sport, daddr, hnum, dif);
}

static inline struct sock *inet_lookup(struct net *net,
Expand All @@ -322,28 +324,34 @@ static inline struct sock *inet_lookup(struct net *net,
const int dif)
{
struct sock *sk;
bool refcounted;

sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr,
dport, dif);
dport, dif, &refcounted);

if (sk && !refcounted && !atomic_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
return sk;
}

static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
struct sk_buff *skb,
int doff,
const __be16 sport,
const __be16 dport)
const __be16 dport,
bool *refcounted)
{
struct sock *sk = skb_steal_sock(skb);
const struct iphdr *iph = ip_hdr(skb);

*refcounted = true;
if (sk)
return sk;
else
return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb,
doff, iph->saddr, sport,
iph->daddr, dport, inet_iif(skb));

return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb,
doff, iph->saddr, sport,
iph->daddr, dport, inet_iif(skb),
refcounted);
}

u32 sk_ehashfn(const struct sock *sk);
Expand Down
7 changes: 5 additions & 2 deletions net/dccp/ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -764,6 +764,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
{
const struct dccp_hdr *dh;
const struct iphdr *iph;
bool refcounted;
struct sock *sk;
int min_cov;

Expand Down Expand Up @@ -801,7 +802,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)

lookup:
sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh),
dh->dccph_sport, dh->dccph_dport);
dh->dccph_sport, dh->dccph_dport, &refcounted);
if (!sk) {
dccp_pr_debug("failed to look up flow ID in table and "
"get corresponding socket\n");
Expand Down Expand Up @@ -830,6 +831,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
goto lookup;
}
sock_hold(sk);
refcounted = true;
nsk = dccp_check_req(sk, skb, req);
if (!nsk) {
reqsk_put(req);
Expand Down Expand Up @@ -886,7 +888,8 @@ static int dccp_v4_rcv(struct sk_buff *skb)
return 0;

discard_and_relse:
sock_put(sk);
if (refcounted)
sock_put(sk);
goto discard_it;
}

Expand Down
7 changes: 5 additions & 2 deletions net/dccp/ipv6.c
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
static int dccp_v6_rcv(struct sk_buff *skb)
{
const struct dccp_hdr *dh;
bool refcounted;
struct sock *sk;
int min_cov;

Expand Down Expand Up @@ -670,7 +671,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
lookup:
sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh),
dh->dccph_sport, dh->dccph_dport,
inet6_iif(skb));
inet6_iif(skb), &refcounted);
if (!sk) {
dccp_pr_debug("failed to look up flow ID in table and "
"get corresponding socket\n");
Expand Down Expand Up @@ -699,6 +700,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
goto lookup;
}
sock_hold(sk);
refcounted = true;
nsk = dccp_check_req(sk, skb, req);
if (!nsk) {
reqsk_put(req);
Expand Down Expand Up @@ -752,7 +754,8 @@ static int dccp_v6_rcv(struct sk_buff *skb)
return 0;

discard_and_relse:
sock_put(sk);
if (refcounted)
sock_put(sk);
goto discard_it;
}

Expand Down
3 changes: 1 addition & 2 deletions net/ipv4/inet_diag.c
Original file line number Diff line number Diff line change
Expand Up @@ -775,13 +775,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,

for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
struct inet_listen_hashbucket *ilb;
struct hlist_nulls_node *node;
struct sock *sk;

num = 0;
ilb = &hashinfo->listening_hash[i];
spin_lock_bh(&ilb->lock);
sk_nulls_for_each(sk, node, &ilb->head) {
sk_for_each(sk, &ilb->head) {
struct inet_sock *inet = inet_sk(sk);

if (!net_eq(sock_net(sk), net))
Expand Down
73 changes: 25 additions & 48 deletions net/ipv4/inet_hashtables.c
Original file line number Diff line number Diff line change
Expand Up @@ -198,76 +198,48 @@ static inline int compute_score(struct sock *sk, struct net *net,
}

/*
* Don't inline this cruft. Here are some nice properties to exploit here. The
* BSD API does not allow a listening sock to specify the remote port nor the
* Here are some nice properties to exploit here. The BSD API
* does not allow a listening sock to specify the remote port nor the
* remote address for the connection. So always assume those are both
* wildcarded during the search since they can never be otherwise.
*/


/* called with rcu_read_lock() : No refcount taken on the socket */
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const __be32 saddr, __be16 sport,
const __be32 daddr, const unsigned short hnum,
const int dif)
{
struct sock *sk, *result;
struct hlist_nulls_node *node;
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
int score, hiscore, matches = 0, reuseport = 0;
bool select_ok = true;
int score, hiscore = 0, matches = 0, reuseport = 0;
struct sock *sk, *result = NULL;
u32 phash = 0;

begin:
result = NULL;
hiscore = 0;
sk_nulls_for_each_rcu(sk, node, &ilb->head) {
sk_for_each_rcu(sk, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif);
if (score > hiscore) {
result = sk;
hiscore = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
phash = inet_ehashfn(net, daddr, hnum,
saddr, sport);
if (select_ok) {
struct sock *sk2;
sk2 = reuseport_select_sock(sk, phash,
skb, doff);
if (sk2) {
result = sk2;
goto found;
}
}
result = reuseport_select_sock(sk, phash,
skb, doff);
if (result)
return result;
matches = 1;
}
result = sk;
hiscore = score;
} else if (score == hiscore && reuseport) {
matches++;
if (reciprocal_scale(phash, matches) == 0)
result = sk;
phash = next_pseudo_random32(phash);
}
}
/*
* if the nulls value we got at the end of this lookup is
* not the expected one, we must restart lookup.
* We probably met an item that was moved to another chain.
*/
if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
goto begin;
if (result) {
found:
if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
result = NULL;
else if (unlikely(compute_score(result, net, hnum, daddr,
dif) < hiscore)) {
sock_put(result);
select_ok = false;
goto begin;
}
}
return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
Expand Down Expand Up @@ -508,7 +480,8 @@ int __inet_hash(struct sock *sk, struct sock *osk,
if (err)
goto unlock;
}
__sk_nulls_add_node_rcu(sk, &ilb->head);
hlist_add_head_rcu(&sk->sk_node, &ilb->head);
sock_set_flag(sk, SOCK_RCU_FREE);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
spin_unlock(&ilb->lock);
Expand All @@ -535,20 +508,25 @@ void inet_unhash(struct sock *sk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
spinlock_t *lock;
bool listener = false;
int done;

if (sk_unhashed(sk))
return;

if (sk->sk_state == TCP_LISTEN)
if (sk->sk_state == TCP_LISTEN) {
lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
else
listener = true;
} else {
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

}
spin_lock_bh(lock);
if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_detach_sock(sk);
done = __sk_nulls_del_node_init_rcu(sk);
if (listener)
done = __sk_del_node_init(sk);
else
done = __sk_nulls_del_node_init_rcu(sk);
if (done)
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
spin_unlock_bh(lock);
Expand Down Expand Up @@ -684,9 +662,8 @@ void inet_hashinfo_init(struct inet_hashinfo *h)

for (i = 0; i < INET_LHTABLE_SIZE; i++) {
spin_lock_init(&h->listening_hash[i].lock);
INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
i + LISTENING_NULLS_BASE);
}
INIT_HLIST_HEAD(&h->listening_hash[i].head);
}
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);

Expand Down
Loading

0 comments on commit 3b24d85

Please sign in to comment.