Commit 0584e965 authored by Alexei Starovoitov
Browse files

Merge branch 'bpf: tcp: Allow bpf-tcp-cc to call bpf_(get|set)sockopt'



Martin KaFai says:

====================

This set allows the bpf-tcp-cc to call bpf_setsockopt().  One use
case is to allow a bpf-tcp-cc to switch to another cc during init().
For example, when the tcp flow is not ECN ready, bpf_dctcp
can switch to another cc by calling setsockopt(TCP_CONGESTION).

bpf_getsockopt() is also added to keep the API symmetrical and
avoid usage surprises.

v2:
- Do not allow switching to the kernel's tcp_cdg because it is the only
  kernel tcp-cc that stores a pointer in icsk_ca_priv.
  Please see the commit log in patch 1 for details.
  A test is added in patch 4 to check switching to tcp_cdg.
- Refactor the logic finding the offset of a func ptr
  in the "struct tcp_congestion_ops" to prog_ops_moff()
  in patch 1.
- bpf_setsockopt() has been disabled in release() since v1 (please
  see the commit log in patch 1 for the reason).  bpf_getsockopt() is
  also disabled in release() in v2 to avoid usage surprises,
  because the two are usually expected to be available together.
  bpf-tcp-cc can already use PTR_TO_BTF_ID to read from tcp_sock.
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 7d789bd0 574ee209
Loading
Loading
Loading
Loading
+21 −1
Original line number Diff line number Diff line
@@ -28,6 +28,7 @@ struct bpf_struct_ops_value {

struct bpf_struct_ops_map {
	struct bpf_map map;
	struct rcu_head rcu;
	const struct bpf_struct_ops *st_ops;
	/* protect map_update */
	struct mutex lock;
@@ -622,6 +623,14 @@ bool bpf_struct_ops_get(const void *kdata)
	return refcount_inc_not_zero(&kvalue->refcnt);
}

/* RCU callback: recover the struct_ops map that embeds @head as its
 * "rcu" member and pass that map to bpf_map_put().
 */
static void bpf_struct_ops_put_rcu(struct rcu_head *head)
{
	struct bpf_struct_ops_map *owner =
		container_of(head, struct bpf_struct_ops_map, rcu);

	bpf_map_put(&owner->map);
}

void bpf_struct_ops_put(const void *kdata)
{
	struct bpf_struct_ops_value *kvalue;
@@ -632,6 +641,17 @@ void bpf_struct_ops_put(const void *kdata)

		st_map = container_of(kvalue, struct bpf_struct_ops_map,
				      kvalue);
		bpf_map_put(&st_map->map);
		/* The struct_ops's function may switch to another struct_ops.
		 *
		 * For example, bpf_tcp_cc_x->init() may switch to
		 * another tcp_cc_y by calling
		 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
		 * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
		 * and its map->refcnt may reach 0, which then frees its
		 * trampoline image while tcp_cc_x is still running.
		 *
		 * Thus, an RCU grace period is needed here.
		 */
		call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
	}
}
+6 −0
Original line number Diff line number Diff line
@@ -5051,6 +5051,12 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
	   int, optname, char *, optval, int, optlen)
{
	/* Refuse a switch to the kernel's "cdg" congestion control with
	 * -ENOTSUPP; every other request is forwarded unchanged to
	 * _bpf_setsockopt().  The name matches whether or not optval
	 * carries a trailing NUL, since strncmp() stops at the NUL of
	 * the "cdg" literal.
	 */
	if (level == SOL_TCP && optname == TCP_CONGESTION &&
	    optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, optlen))
		return -ENOTSUPP;

	return _bpf_setsockopt(sk, level, optname, optval, optlen);
}

+38 −3
Original line number Diff line number Diff line
@@ -10,6 +10,9 @@
#include <net/tcp.h>
#include <net/bpf_sk_storage.h>

/* "extern" is to avoid sparse warning.  It is only used in bpf_struct_ops.c. */
extern struct bpf_struct_ops bpf_tcp_congestion_ops;

static u32 optional_ops[] = {
	offsetof(struct tcp_congestion_ops, init),
	offsetof(struct tcp_congestion_ops, release),
@@ -163,6 +166,19 @@ static const struct bpf_func_proto bpf_tcp_send_ack_proto = {
	.arg2_type	= ARG_ANYTHING,
};

/* Return the byte offset, within the BTF type of bpf_tcp_congestion_ops,
 * of the member selected by @prog's expected_attach_type (which is used
 * here as the member index).
 */
static u32 prog_ops_moff(const struct bpf_prog *prog)
{
	const struct btf_type *ops_type = bpf_tcp_congestion_ops.type;
	u32 idx = prog->expected_attach_type;
	const struct btf_member *member = btf_type_member(ops_type) + idx;

	/* BTF records the member offset in bits; convert to bytes. */
	return btf_member_bit_offset(ops_type, member) / 8;
}

static const struct bpf_func_proto *
bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
			  const struct bpf_prog *prog)
@@ -174,6 +190,28 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
		return &bpf_sk_storage_get_proto;
	case BPF_FUNC_sk_storage_delete:
		return &bpf_sk_storage_delete_proto;
	case BPF_FUNC_setsockopt:
		/* Does not allow release() to call setsockopt.
		 * release() is called when the current bpf-tcp-cc
		 * is retiring.  It is not allowed to call
		 * setsockopt() to make further changes which
		 * may potentially allocate new resources.
		 */
		if (prog_ops_moff(prog) !=
		    offsetof(struct tcp_congestion_ops, release))
			return &bpf_sk_setsockopt_proto;
		return NULL;
	case BPF_FUNC_getsockopt:
		/* Since get/setsockopt is usually expected to
		 * be available together, disable getsockopt for
		 * release also to avoid usage surprise.
		 * The bpf-tcp-cc already has a more powerful way
		 * to read tcp_sock from the PTR_TO_BTF_ID.
		 */
		if (prog_ops_moff(prog) !=
		    offsetof(struct tcp_congestion_ops, release))
			return &bpf_sk_getsockopt_proto;
		return NULL;
	default:
		return bpf_base_func_proto(func_id);
	}
@@ -286,9 +324,6 @@ static void bpf_tcp_ca_unreg(void *kdata)
	tcp_unregister_congestion_control(kdata);
}

/* Avoid sparse warning.  It is only used in bpf_struct_ops.c. */
extern struct bpf_struct_ops bpf_tcp_congestion_ops;

struct bpf_struct_ops bpf_tcp_congestion_ops = {
	.verifier_ops = &bpf_tcp_ca_verifier_ops,
	.reg = bpf_tcp_ca_reg,
+1 −0
Original line number Diff line number Diff line
@@ -31,6 +31,7 @@ enum sk_pacing {

/* NOTE(review): appears to be a trimmed-down mirror of the kernel's
 * struct sock that declares only the members listed below; the
 * preserve_access_index attribute presumably lets member accesses be
 * relocated against the real layout - confirm against the clang
 * attribute documentation.
 */
struct sock {
	struct sock_common	__sk_common;
/* Shorthand: sk->sk_state expands to sk->__sk_common.skc_state. */
#define sk_state		__sk_common.skc_state
	unsigned long		sk_pacing_rate;
	__u32			sk_pacing_status; /* see enum sk_pacing */
} __attribute__((preserve_access_index));
+21 −2
Original line number Diff line number Diff line
@@ -218,13 +218,18 @@ static int connect_fd_to_addr(int fd,
	return 0;
}

int connect_to_fd(int server_fd, int timeout_ms)
static const struct network_helper_opts default_opts;

int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts)
{
	struct sockaddr_storage addr;
	struct sockaddr_in *addr_in;
	socklen_t addrlen, optlen;
	int fd, type;

	if (!opts)
		opts = &default_opts;

	optlen = sizeof(type);
	if (getsockopt(server_fd, SOL_SOCKET, SO_TYPE, &type, &optlen)) {
		log_err("getsockopt(SOL_TYPE)");
@@ -244,7 +249,12 @@ int connect_to_fd(int server_fd, int timeout_ms)
		return -1;
	}

	if (settimeo(fd, timeout_ms))
	if (settimeo(fd, opts->timeout_ms))
		goto error_close;

	if (opts->cc && opts->cc[0] &&
	    setsockopt(fd, SOL_TCP, TCP_CONGESTION, opts->cc,
		       strlen(opts->cc) + 1))
		goto error_close;

	if (connect_fd_to_addr(fd, &addr, addrlen))
@@ -257,6 +267,15 @@ int connect_to_fd(int server_fd, int timeout_ms)
	return -1;
}

int connect_to_fd(int server_fd, int timeout_ms)
{
	struct network_helper_opts opts = {
		.timeout_ms = timeout_ms,
	};

	return connect_to_fd_opts(server_fd, &opts);
}

int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms)
{
	struct sockaddr_storage addr;
Loading