Forum | Documentation | Website | Blog

Skip to content
Snippets Groups Projects
af_smc.c 72.9 KiB
Newer Older
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
 *  applies to SOCK_STREAM sockets only
 *  offers an alternative communication option for TCP-protocol sockets
 *  applicable with RoCE-cards only
 *
 *  Initial restrictions:
 *    - support for alternate links postponed
 *
 *  Copyright IBM Corp. 2016, 2018
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 *              based on prototype from Frank Blaschka
 */

#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/if_vlan.h>
#include <linux/rcupdate_wait.h>
#include <linux/ctype.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include "smc_netns.h"

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_ism.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"
#include "smc_stats.h"
#include "smc_tracepoint.h"
static DEFINE_MUTEX(smc_server_lgr_pending);	/* serialize link group
						 * creation on server
						 */
static DEFINE_MUTEX(smc_client_lgr_pending);	/* serialize link group
						 * creation on client
struct workqueue_struct	*smc_hs_wq;	/* wq for handshake work */
struct workqueue_struct	*smc_close_wq;	/* wq for close work */

static void smc_tcp_listen_work(struct work_struct *);
static void smc_connect_work(struct work_struct *);
static void smc_set_keepalive(struct sock *sk, int val)
{
	struct smc_sock *smc = smc_sk(sk);

	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}

static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

static struct smc_hashinfo smc_v6_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
};

int smc_hash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
	struct hlist_head *head;

	head = &h->ht;

	write_lock_bh(&h->lock);
	sk_add_node(sk, head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock_bh(&h->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);

void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
EXPORT_SYMBOL_GPL(smc_proto);
struct proto smc_proto6 = {
	.name		= "SMC6",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v6_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto6);

static void smc_restore_fallback_changes(struct smc_sock *smc)
{
	if (smc->clcsock->file) { /* non-accepted sockets have no file yet */
		smc->clcsock->file->private_data = smc->sk.sk_socket;
		smc->clcsock->file = NULL;
	}
static int __smc_release(struct smc_sock *smc)
	struct sock *sk = &smc->sk;
	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
		if (sk->sk_state != SMC_CLOSED) {
			if (sk->sk_state != SMC_LISTEN &&
			    sk->sk_state != SMC_INIT)
				sock_put(sk); /* passive closing */
			if (sk->sk_state == SMC_LISTEN) {
				/* wake up clcsock accept */
				rc = kernel_sock_shutdown(smc->clcsock,
							  SHUT_RDWR);
			}
			sk->sk_state = SMC_CLOSED;
			sk->sk_state_change(sk);
		smc_restore_fallback_changes(smc);
	sk->sk_prot->unhash(sk);

	if (sk->sk_state == SMC_CLOSED) {
		if (smc->clcsock) {
			release_sock(sk);
			smc_clcsock_release(smc);
			lock_sock(sk);
		}
		if (!smc->use_fallback)
			smc_conn_free(&smc->conn);
	}

	return rc;
}

static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	sock_hold(sk); /* sock_put below */
	smc = smc_sk(sk);

	/* cleanup for a dangling non-blocking connect */
	if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
		tcp_abort(smc->clcsock->sk, ECONNABORTED);
	flush_work(&smc->connect_work);

	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	rc = __smc_release(smc);

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	release_sock(sk);

	sock_put(sk); /* sock_hold above */
	sock_put(sk); /* final sock_put */
}

static void smc_destruct(struct sock *sk)
{
	if (sk->sk_state != SMC_CLOSED)
		return;
	if (!sock_flag(sk, SOCK_DEAD))
		return;

	sk_refcnt_debug_dec(sk);
}

static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
				   int protocol)
{
	struct smc_sock *smc;
	struct proto *prot;
	struct sock *sk;

	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = protocol;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_WORK(&smc->connect_work, smc_connect_work);
	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	spin_lock_init(&smc->conn.send_lock);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);
	mutex_init(&smc->clcsock_release_lock);

	return sk;
}

static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	if (addr->sin_family != AF_INET &&
	    addr->sin_family != AF_INET6 &&
	    addr->sin_family != AF_UNSPEC)
		goto out;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if (addr->sin_family == AF_UNSPEC &&
	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT || smc->connect_nonblock)
		goto out_rel;

	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}

static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we don't get control via setsockopt for */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}

#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED) | \
			     (1UL << SOCK_TSTAMP_NEW))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}

#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}

/* register the new rmb on all links */
static int smcr_lgr_reg_rmbs(struct smc_link *link,
	struct smc_link_group *lgr = link->lgr;
	int i, rc = 0;
	rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
	if (rc)
		return rc;
	/* protect against parallel smc_llc_cli_rkey_exchange() and
	 * parallel smcr_link_reg_rmb()
	 */
	mutex_lock(&lgr->llc_conf_mutex);
	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
		if (!smc_link_active(&lgr->lnk[i]))
		rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc);

	/* exchange confirm_rkey msg with peer */
	rc = smc_llc_do_confirm_rkey(link, rmb_desc);
	if (rc) {
		rc = -EFAULT;
		goto out;
	}
	rmb_desc->is_conf_rkey = true;
out:
	mutex_unlock(&lgr->llc_conf_mutex);
	smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
static int smcr_clnt_conf_first_link(struct smc_sock *smc)
	struct smc_link *link = smc->conn.lnk;
	struct smc_llc_qentry *qentry;
	int rc;

	/* receive CONFIRM LINK request from server over RoCE fabric */
	qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
			      SMC_LLC_CONFIRM_LINK);
	if (!qentry) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
	smc_llc_save_peer_uid(qentry);
	rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ);
	smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
	if (rc)
		return SMC_CLC_DECL_RMBE_EC;

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_ERR_RDYLNK;

	smc_wr_remember_qp_attr(link);
	if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
		return SMC_CLC_DECL_ERR_REGRMB;
	/* confirm_rkey is implicit on 1st contact */
	smc->conn.rmb_desc->is_conf_rkey = true;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TIMEOUT_CL;
	smc_llc_link_active(link);
	smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);

	/* optional 2nd link, receive ADD LINK request from server */
	qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
			      SMC_LLC_ADD_LINK);
	if (!qentry) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
		if (rc == -EAGAIN)
			rc = 0; /* no DECLINE received, go with one link */
		return rc;
	smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl);
	smc_llc_cli_add_link(link, qentry);
static bool smc_isascii(char *hostname)
{
	int i;

	for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++)
		if (!isascii(hostname[i]))
			return false;
	return true;
}

static void smc_conn_save_peer_info_fce(struct smc_sock *smc,
					struct smc_clc_msg_accept_confirm *clc)
{
	struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
		(struct smc_clc_msg_accept_confirm_v2 *)clc;
	struct smc_clc_first_contact_ext *fce;
	int clc_v2_len;

	if (clc->hdr.version == SMC_V1 ||
	    !(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK))
		return;

	if (smc->conn.lgr->is_smcd) {
		memcpy(smc->conn.lgr->negotiated_eid, clc_v2->d1.eid,
		       SMC_MAX_EID_LEN);
		clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2,
					 d1);
	} else {
		memcpy(smc->conn.lgr->negotiated_eid, clc_v2->r1.eid,
		       SMC_MAX_EID_LEN);
		clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2,
					 r1);
	}
	fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc_v2) + clc_v2_len);
	smc->conn.lgr->peer_os = fce->os_type;
	smc->conn.lgr->peer_smc_release = fce->release;
	if (smc_isascii(fce->hostname))
		memcpy(smc->conn.lgr->peer_hostname, fce->hostname,
		       SMC_MAX_HOSTNAME_LEN);
}

static void smcr_conn_save_peer_info(struct smc_sock *smc,
				     struct smc_clc_msg_accept_confirm *clc)
	int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size);
	smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token);
	smc->conn.peer_rmbe_size = bufsize;
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
	smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
static void smcd_conn_save_peer_info(struct smc_sock *smc,
				     struct smc_clc_msg_accept_confirm *clc)
{
	int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size);
	smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx;
	smc->conn.peer_token = clc->d0.token;
	/* msg header takes up space in the buffer */
	smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
	smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
}

static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	if (smc->conn.lgr->is_smcd)
		smcd_conn_save_peer_info(smc, clc);
	else
		smcr_conn_save_peer_info(smc, clc);
	smc_conn_save_peer_info_fce(smc, clc);
static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc,
				    struct smc_init_info *ini)
	link->peer_qpn = ntoh24(clc->r0.qpn);
	memcpy(link->peer_gid, ini->peer_gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->r0.psn);
	link->peer_mtu = clc->r0.qp_mtu;
static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc,
				       struct smc_stats_fback *fback_arr)
{
	int cnt;

	for (cnt = 0; cnt < SMC_MAX_FBACK_RSN_CNT; cnt++) {
		if (fback_arr[cnt].fback_code == smc->fallback_rsn) {
			fback_arr[cnt].count++;
			break;
		}
		if (!fback_arr[cnt].fback_code) {
			fback_arr[cnt].fback_code = smc->fallback_rsn;
			fback_arr[cnt].count++;
			break;
		}
	}
}

static void smc_stat_fallback(struct smc_sock *smc)
{
	struct net *net = sock_net(&smc->sk);

	mutex_lock(&net->smc.mutex_fback_rsn);
	if (smc->listen_smc) {
		smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->srv);
		net->smc.fback_rsn->srv_fback_cnt++;
		smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->clnt);
		net->smc.fback_rsn->clnt_fback_cnt++;
	mutex_unlock(&net->smc.mutex_fback_rsn);
}

static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code)
{
	smc->use_fallback = true;
	smc->fallback_rsn = reason_code;
	smc_stat_fallback(smc);
	trace_smc_switch_to_fallback(smc, reason_code);
	if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
		smc->clcsock->file = smc->sk.sk_socket->file;
		smc->clcsock->file->private_data = smc->clcsock;
		smc->clcsock->wq.fasync_list =
			smc->sk.sk_socket->wq.fasync_list;
/* fall back during connect */
static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
	smc_switch_to_fallback(smc, reason_code);
	smc_copy_sock_settings_to_clc(smc);
	smc->connect_nonblock = 0;
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;
	return 0;
}
/* decline and fall back during connect */
static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code,
					u8 version)
	struct net *net = sock_net(&smc->sk);
	if (reason_code < 0) { /* error, fallback is not possible */
		this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
		if (smc->sk.sk_state == SMC_INIT)
			sock_put(&smc->sk); /* passive closing */
	if (reason_code != SMC_CLC_DECL_PEERDECL) {
		rc = smc_clc_send_decline(smc, reason_code, version);
			this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
			if (smc->sk.sk_state == SMC_INIT)
				sock_put(&smc->sk); /* passive closing */
	return smc_connect_fallback(smc, reason_code);
static void smc_conn_abort(struct smc_sock *smc, int local_first)
	if (local_first)
		smc_lgr_cleanup_early(&smc->conn);
	else
		smc_conn_free(&smc->conn);
}

/* check if there is a rdma device available for this connection. */
/* called for connect and listen */
static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
	if (!ini->check_smcrv2 && !ini->ib_dev)
		return SMC_CLC_DECL_NOSMCRDEV;
	if (ini->check_smcrv2 && !ini->smcrv2.ib_dev_v2)
		return SMC_CLC_DECL_NOSMCRDEV;
/* check if there is an ISM device available for this connection. */
/* called for connect and listen */
static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
{
	/* Find ISM device with same PNETID as connecting interface  */
	smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
	if (!ini->ism_dev[0])
		return SMC_CLC_DECL_NOSMCDDEV;
	else
		ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]);
/* is chid unique for the ism devices that are already determined? */
static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini,
					   int cnt)
{
	int i = (!ini->ism_dev[0]) ? 1 : 0;

	for (; i < cnt; i++)
		if (ini->ism_chid[i] == chid)
			return false;
	return true;
}

/* determine possible V2 ISM devices (either without PNETID or with PNETID plus
 * PNETID matching net_device)
 */
static int smc_find_ism_v2_device_clnt(struct smc_sock *smc,
				       struct smc_init_info *ini)
{
	int rc = SMC_CLC_DECL_NOSMCDDEV;
	struct smcd_dev *smcd;
	int i = 1;

	if (smcd_indicated(ini->smc_type_v1))
		rc = 0;		/* already initialized for V1 */
	mutex_lock(&smcd_dev_list.mutex);
	list_for_each_entry(smcd, &smcd_dev_list.list, list) {
		if (smcd->going_away || smcd == ini->ism_dev[0])
			continue;
		chid = smc_ism_get_chid(smcd);
		if (!smc_find_ism_v2_is_unique_chid(chid, ini, i))
			continue;
		if (!smc_pnet_is_pnetid_set(smcd->pnetid) ||
		    smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) {
			ini->ism_dev[i] = smcd;
			ini->is_smcd = true;
			rc = 0;
			i++;
			if (i > SMC_MAX_ISM_DEVS)
				break;
		}
	}
	mutex_unlock(&smcd_dev_list.mutex);
	ini->ism_offered_cnt = i - 1;
	if (!ini->ism_dev[0] && !ini->ism_dev[1])
		ini->smcd_version = 0;

	return rc;
}

/* Check for VLAN ID and register it on ISM device just for CLC handshake */
static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
				      struct smc_init_info *ini)
	if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id))
		return SMC_CLC_DECL_ISMVLANERR;
static int smc_find_proposal_devices(struct smc_sock *smc,
				     struct smc_init_info *ini)
{
	int rc = 0;

	/* check if there is an ism device available */
	if (!(ini->smcd_version & SMC_V1) ||
	    smc_find_ism_device(smc, ini) ||
	    smc_connect_ism_vlan_setup(smc, ini))
		ini->smcd_version &= ~SMC_V1;
	/* else ISM V1 is supported for this connection */

	/* check if there is an rdma device available */
	if (!(ini->smcr_version & SMC_V1) ||
	    smc_find_rdma_device(smc, ini))
		ini->smcr_version &= ~SMC_V1;
	/* else RDMA is supported for this connection */

	ini->smc_type_v1 = smc_indicated_type(ini->smcd_version & SMC_V1,
					      ini->smcr_version & SMC_V1);

	/* check if there is an ism v2 device available */
	if (!(ini->smcd_version & SMC_V2) ||
	    !smc_ism_is_v2_capable() ||
	    smc_find_ism_v2_device_clnt(smc, ini))
		ini->smcd_version &= ~SMC_V2;

	/* check if there is an rdma v2 device available */
	ini->check_smcrv2 = true;
	ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr;
	if (!(ini->smcr_version & SMC_V2) ||
	    smc->clcsock->sk->sk_family != AF_INET ||
	    !smc_clc_ueid_count() ||
	    smc_find_rdma_device(smc, ini))
		ini->smcr_version &= ~SMC_V2;
	ini->check_smcrv2 = false;

	ini->smc_type_v2 = smc_indicated_type(ini->smcd_version & SMC_V2,
					      ini->smcr_version & SMC_V2);

	/* if neither ISM nor RDMA are supported, fallback */
	if (ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N)
		rc = SMC_CLC_DECL_NOSMCDEV;

	return rc;
}

/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
 * used, the VLAN ID will be registered again during the connection setup.
 */
static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc,
					struct smc_init_info *ini)
	if (!smcd_indicated(ini->smc_type_v1))
	if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id))
		return SMC_CLC_DECL_CNFERR;
	return 0;
}

#define SMC_CLC_MAX_ACCEPT_LEN \
	(sizeof(struct smc_clc_msg_accept_confirm_v2) + \
	 sizeof(struct smc_clc_first_contact_ext) + \
	 sizeof(struct smc_clc_msg_trail))

/* CLC handshake during connect */
static int smc_connect_clc(struct smc_sock *smc,
			   struct smc_clc_msg_accept_confirm_v2 *aclc2,
			   struct smc_init_info *ini)

	/* do inband token exchange */
	rc = smc_clc_send_proposal(smc, ini);
	/* receive SMC Accept CLC message */
	return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN,
				SMC_CLC_ACCEPT, CLC_WAIT_TIME);
void smc_fill_gid_list(struct smc_link_group *lgr,
		       struct smc_gidlist *gidlist,
		       struct smc_ib_device *known_dev, u8 *known_gid)
{
	struct smc_init_info *alt_ini = NULL;

	memset(gidlist, 0, sizeof(*gidlist));
	memcpy(gidlist->list[gidlist->len++], known_gid, SMC_GID_SIZE);

	alt_ini = kzalloc(sizeof(*alt_ini), GFP_KERNEL);
	if (!alt_ini)
		goto out;

	alt_ini->vlan_id = lgr->vlan_id;
	alt_ini->check_smcrv2 = true;
	alt_ini->smcrv2.saddr = lgr->saddr;
	smc_pnet_find_alt_roce(lgr, alt_ini, known_dev);

	if (!alt_ini->smcrv2.ib_dev_v2)
		goto out;

	memcpy(gidlist->list[gidlist->len++], alt_ini->smcrv2.ib_gid_v2,
	       SMC_GID_SIZE);

out:
	kfree(alt_ini);
}

static int smc_connect_rdma_v2_prepare(struct smc_sock *smc,
				       struct smc_clc_msg_accept_confirm *aclc,
				       struct smc_init_info *ini)
{
	struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
		(struct smc_clc_msg_accept_confirm_v2 *)aclc;
	struct smc_clc_first_contact_ext *fce =
		(struct smc_clc_first_contact_ext *)
			(((u8 *)clc_v2) + sizeof(*clc_v2));

	if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1)
		return 0;

	if (fce->v2_direct) {
		memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN);
		ini->smcrv2.uses_gateway = false;
	} else {
		if (smc_ib_find_route(smc->clcsock->sk->sk_rcv_saddr,
				      smc_ib_gid_to_ipv4(aclc->r0.lcl.gid),
				      ini->smcrv2.nexthop_mac,
				      &ini->smcrv2.uses_gateway))
			return SMC_CLC_DECL_NOROUTE;
		if (!ini->smcrv2.uses_gateway) {
			/* mismatch: peer claims indirect, but its direct */
			return SMC_CLC_DECL_NOINDIRECT;
		}
	}
	return 0;
}

/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc,
			    struct smc_clc_msg_accept_confirm *aclc,
			    struct smc_init_info *ini)
	int i, reason_code = 0;
	struct smc_link *link;
	ini->is_smcd = false;
	ini->ib_clcqpn = ntoh24(aclc->r0.qpn);
	ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
	memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN);
	memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE);
	memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN);

	reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini);
	if (reason_code)
		return reason_code;
	mutex_lock(&smc_client_lgr_pending);
	reason_code = smc_conn_create(smc, ini);
	if (reason_code) {
		mutex_unlock(&smc_client_lgr_pending);
		return reason_code;
	smc_conn_save_peer_info(smc, aclc);
	if (ini->first_contact_local) {
		link = smc->conn.lnk;
	} else {
		/* set link that was assigned by server */
		link = NULL;
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			struct smc_link *l = &smc->conn.lgr->lnk[i];

			if (l->peer_qpn == ntoh24(aclc->r0.qpn) &&
			    !memcmp(l->peer_gid, &aclc->r0.lcl.gid,
				    SMC_GID_SIZE) &&
			    (aclc->hdr.version > SMC_V1 ||
			     !memcmp(l->peer_mac, &aclc->r0.lcl.mac,
				     sizeof(l->peer_mac)))) {
		if (!link) {
			reason_code = SMC_CLC_DECL_NOSRVLINK;
			goto connect_abort;
		}
		smc_switch_link_and_count(&smc->conn, link);
	/* create send buffer and rmb */
	if (smc_buf_create(smc, false)) {
		reason_code = SMC_CLC_DECL_MEM;
		goto connect_abort;
	}
	if (ini->first_contact_local)
		smc_link_save_peer_info(link, aclc, ini);
	if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) {
		reason_code = SMC_CLC_DECL_ERR_RTOK;
		goto connect_abort;
	}
	smc_close_init(smc);
	smc_rx_init(smc);

	if (ini->first_contact_local) {
		if (smc_ib_ready_link(link)) {
			reason_code = SMC_CLC_DECL_ERR_RDYLNK;
			goto connect_abort;
		}
		if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) {
			reason_code = SMC_CLC_DECL_ERR_REGRMB;
			goto connect_abort;
		}
	smc_rmb_sync_sg_for_device(&smc->conn);
	if (aclc->hdr.version > SMC_V1) {
		struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
			(struct smc_clc_msg_accept_confirm_v2 *)aclc;

		eid = clc_v2->r1.eid;
		if (ini->first_contact_local)
			smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist,
					  link->smcibdev, link->gid);
	}

	reason_code = smc_clc_send_confirm(smc, ini->first_contact_local,
					   aclc->hdr.version, eid, ini);
		goto connect_abort;
	if (ini->first_contact_local) {
		/* QP confirmation over RoCE fabric */
		smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
		reason_code = smcr_clnt_conf_first_link(smc);
		smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
			goto connect_abort;
	mutex_unlock(&smc_client_lgr_pending);
	smc_copy_sock_settings_to_clc(smc);
	smc->connect_nonblock = 0;
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;
	smc_conn_abort(smc, ini->first_contact_local);
	mutex_unlock(&smc_client_lgr_pending);
	smc->connect_nonblock = 0;

	return reason_code;
/* The server has chosen one of the proposed ISM devices for the communication.
 * Determine from the CHID of the received CLC ACCEPT the ISM device chosen.
 */
static int
smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc,
			       struct smc_init_info *ini)
{
	int i;

	for (i = 0; i < ini->ism_offered_cnt + 1; i++) {
		if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) {
			ini->ism_selected = i;
			return 0;
		}
	}

	return -EPROTO;
}

/* setup for ISM connection of client */
static int smc_connect_ism(struct smc_sock *smc,
			   struct smc_clc_msg_accept_confirm *aclc,
			   struct smc_init_info *ini)