Newer
Older
// SPDX-License-Identifier: GPL-2.0-only
/*
* Shared Memory Communications over RDMA (SMC-R) and RoCE
*
* AF_SMC protocol family socket handler keeping the AF_INET sock address type
* applies to SOCK_STREAM sockets only
* offers an alternative communication option for TCP-protocol sockets
* applicable with RoCE-cards only
*
* Initial restrictions:
* - support for alternate links postponed
*
*
* Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
* based on prototype from Frank Blaschka
*/
#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/sched/signal.h>
#include <asm/ioctls.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include "smc_netns.h"
static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group
* creation on server
*/
static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group
* creation on client
static void smc_tcp_listen_work(struct work_struct *);
static void smc_connect_work(struct work_struct *);
static void smc_set_keepalive(struct sock *sk, int val)
{
struct smc_sock *smc = smc_sk(sk);
smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}
static struct smc_hashinfo smc_v4_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};
static struct smc_hashinfo smc_v6_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
};
int smc_hash_sk(struct sock *sk)
{
struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
struct hlist_head *head;
head = &h->ht;
write_lock_bh(&h->lock);
sk_add_node(sk, head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
write_unlock_bh(&h->lock);
return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);
void smc_unhash_sk(struct sock *sk)
{
struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
write_lock_bh(&h->lock);
if (sk_del_node_init(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);
struct proto smc_proto = {
.name = "SMC",
.owner = THIS_MODULE,
.keepalive = smc_set_keepalive,
.hash = smc_hash_sk,
.unhash = smc_unhash_sk,
.slab_flags = SLAB_TYPESAFE_BY_RCU,
struct proto smc_proto6 = {
.name = "SMC6",
.owner = THIS_MODULE,
.keepalive = smc_set_keepalive,
.hash = smc_hash_sk,
.unhash = smc_unhash_sk,
.obj_size = sizeof(struct smc_sock),
.h.smc_hash = &smc_v6_hashinfo,
.slab_flags = SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto6);
static int __smc_release(struct smc_sock *smc)
struct sock *sk = &smc->sk;
if (!smc->use_fallback) {
rc = smc_close_active(smc);
sock_set_flag(sk, SOCK_DEAD);
sk->sk_shutdown |= SHUTDOWN_MASK;
} else {
if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
sock_put(sk); /* passive closing */
if (sk->sk_state == SMC_LISTEN) {
/* wake up clcsock accept */
rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
}
sk->sk_state = SMC_CLOSED;
sk->sk_state_change(sk);
}
sk->sk_prot->unhash(sk);
if (sk->sk_state == SMC_CLOSED) {
if (smc->clcsock) {
release_sock(sk);
smc_clcsock_release(smc);
lock_sock(sk);
}
if (!smc->use_fallback)
smc_conn_free(&smc->conn);
}
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
return rc;
}
static int smc_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
int rc = 0;
if (!sk)
goto out;
smc = smc_sk(sk);
/* cleanup for a dangling non-blocking connect */
if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
tcp_abort(smc->clcsock->sk, ECONNABORTED);
flush_work(&smc->connect_work);
if (sk->sk_state == SMC_LISTEN)
/* smc_close_non_accepted() is called and acquires
* sock lock for child sockets again
*/
lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
else
lock_sock(sk);
rc = __smc_release(smc);
/* detach socket */
sock_orphan(sk);
sock->sk = NULL;
release_sock(sk);
sock_put(sk); /* final sock_put */
}
static void smc_destruct(struct sock *sk)
{
if (sk->sk_state != SMC_CLOSED)
return;
if (!sock_flag(sk, SOCK_DEAD))
return;
sk_refcnt_debug_dec(sk);
}
static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
int protocol)
prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
if (!sk)
return NULL;
sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
sk->sk_state = SMC_INIT;
sk->sk_destruct = smc_destruct;
INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
INIT_WORK(&smc->connect_work, smc_connect_work);
INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
INIT_LIST_HEAD(&smc->accept_q);
spin_lock_init(&smc->accept_q_lock);
spin_lock_init(&smc->conn.send_lock);
mutex_init(&smc->clcsock_release_lock);
return sk;
}
static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
int addr_len)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct sock *sk = sock->sk;
struct smc_sock *smc;
int rc;
smc = smc_sk(sk);
/* replicate tests from inet_bind(), to be safe wrt. future changes */
rc = -EINVAL;
if (addr_len < sizeof(struct sockaddr_in))
goto out;
rc = -EAFNOSUPPORT;
if (addr->sin_family != AF_INET &&
addr->sin_family != AF_INET6 &&
addr->sin_family != AF_UNSPEC)
goto out;
/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
if (addr->sin_family == AF_UNSPEC &&
addr->sin_addr.s_addr != htonl(INADDR_ANY))
goto out;
lock_sock(sk);
/* Check if socket is already active */
rc = -EINVAL;
if (sk->sk_state != SMC_INIT || smc->connect_nonblock)
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
goto out_rel;
smc->clcsock->sk->sk_reuse = sk->sk_reuse;
rc = kernel_bind(smc->clcsock, uaddr, addr_len);
out_rel:
release_sock(sk);
out:
return rc;
}
static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
unsigned long mask)
{
/* options we don't get control via setsockopt for */
nsk->sk_type = osk->sk_type;
nsk->sk_sndbuf = osk->sk_sndbuf;
nsk->sk_rcvbuf = osk->sk_rcvbuf;
nsk->sk_sndtimeo = osk->sk_sndtimeo;
nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
nsk->sk_mark = osk->sk_mark;
nsk->sk_priority = osk->sk_priority;
nsk->sk_rcvlowat = osk->sk_rcvlowat;
nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
nsk->sk_err = osk->sk_err;
nsk->sk_flags &= ~mask;
nsk->sk_flags |= osk->sk_flags & mask;
}
#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
(1UL << SOCK_KEEPOPEN) | \
(1UL << SOCK_LINGER) | \
(1UL << SOCK_BROADCAST) | \
(1UL << SOCK_TIMESTAMP) | \
(1UL << SOCK_DBG) | \
(1UL << SOCK_RCVTSTAMP) | \
(1UL << SOCK_RCVTSTAMPNS) | \
(1UL << SOCK_LOCALROUTE) | \
(1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
(1UL << SOCK_RXQ_OVFL) | \
(1UL << SOCK_WIFI_STATUS) | \
(1UL << SOCK_NOFCS) | \
(1UL << SOCK_FILTER_LOCKED) | \
(1UL << SOCK_TSTAMP_NEW))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
* clc socket (since smc is not called for these options from net/core)
*/
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}
#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
(1UL << SOCK_KEEPOPEN) | \
(1UL << SOCK_LINGER) | \
(1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}
/* register a new rmb, send confirm_rkey msg to register with peer */
static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
bool conf_rkey)
if (!rmb_desc->wr_reg) {
/* register memory region for new rmb */
if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
rmb_desc->regerr = 1;
return -EFAULT;
}
rmb_desc->wr_reg = 1;
if (!conf_rkey)
return 0;
/* exchange confirm_rkey msg with peer */
if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
rmb_desc->regerr = 1;
return -EFAULT;
}
static int smc_clnt_conf_first_link(struct smc_sock *smc)
struct net *net = sock_net(smc->clcsock->sk);
struct smc_link_group *lgr = smc->conn.lgr;
struct smc_link *link;
int rest;
int rc;
link = &lgr->lnk[SMC_SINGLE_LINK];
/* receive CONFIRM LINK request from server over RoCE fabric */
rest = wait_for_completion_interruptible_timeout(
&link->llc_confirm,
SMC_LLC_WAIT_FIRST_TIME);
if (rest <= 0) {
struct smc_clc_msg_decline dclc;
rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
if (link->llc_confirm_rc)
return SMC_CLC_DECL_RMBE_EC;
rc = smc_ib_modify_qp_rts(link);
if (rc)
if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
/* send CONFIRM LINK response over RoCE fabric */
rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
/* receive ADD LINK request from server over RoCE fabric */
rest = wait_for_completion_interruptible_timeout(&link->llc_add,
SMC_LLC_WAIT_TIME);
if (rest <= 0) {
struct smc_clc_msg_decline dclc;
rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
}
/* send add link reject message, only one link supported for now */
rc = smc_llc_send_add_link(link,
link->smcibdev->mac[link->ibport - 1],
smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
static void smcr_conn_save_peer_info(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *clc)
int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
smc->conn.peer_rmbe_idx = clc->rmbe_idx;
smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
smc->conn.peer_rmbe_size = bufsize;
atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
static void smcd_conn_save_peer_info(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *clc)
{
int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
smc->conn.peer_rmbe_idx = clc->dmbe_idx;
smc->conn.peer_token = clc->token;
/* msg header takes up space in the buffer */
smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
}
static void smc_conn_save_peer_info(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *clc)
{
if (smc->conn.lgr->is_smcd)
smcd_conn_save_peer_info(smc, clc);
else
smcr_conn_save_peer_info(smc, clc);
}
static void smc_link_save_peer_info(struct smc_link *link,
struct smc_clc_msg_accept_confirm *clc)
{
link->peer_qpn = ntoh24(clc->qpn);
memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
link->peer_psn = ntoh24(clc->psn);
link->peer_mtu = clc->qp_mtu;
}
static void smc_switch_to_fallback(struct smc_sock *smc)
{
smc->use_fallback = true;
if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
smc->clcsock->file = smc->sk.sk_socket->file;
smc->clcsock->file->private_data = smc->clcsock;
}
}
/* fall back during connect */
static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
smc_switch_to_fallback(smc);
smc_copy_sock_settings_to_clc(smc);
if (smc->sk.sk_state == SMC_INIT)
smc->sk.sk_state = SMC_ACTIVE;
return 0;
}
/* decline and fall back during connect */
static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
{
int rc;
if (reason_code < 0) { /* error, fallback is not possible */
if (smc->sk.sk_state == SMC_INIT)
sock_put(&smc->sk); /* passive closing */
if (reason_code != SMC_CLC_DECL_PEERDECL) {
rc = smc_clc_send_decline(smc, reason_code);
if (rc < 0) {
if (smc->sk.sk_state == SMC_INIT)
sock_put(&smc->sk); /* passive closing */
return smc_connect_fallback(smc, reason_code);
/* abort connecting */
static int smc_connect_abort(struct smc_sock *smc, int reason_code,
int local_contact)
{
if (local_contact == SMC_FIRST_CONTACT)
smc_lgr_forget(smc->conn.lgr);
if (smc->conn.lgr->is_smcd)
/* there is only one lgr role for SMC-D; use server lock */
mutex_unlock(&smc_server_lgr_pending);
else
mutex_unlock(&smc_client_lgr_pending);
smc_conn_free(&smc->conn);
return reason_code;
}
/* check if there is a rdma device available for this connection. */
/* called for connect and listen */
static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
/* PNET table look up: search active ib_device and port
* within same PNETID that also contains the ethernet device
* used for the internal TCP socket
*/
smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
if (!ini->ib_dev)
return SMC_CLC_DECL_NOSMCRDEV;
/* check if there is an ISM device available for this connection. */
/* called for connect and listen */
static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
{
/* Find ISM device with same PNETID as connecting interface */
smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
if (!ini->ism_dev)
return SMC_CLC_DECL_NOSMCDDEV;
return 0;
}
/* Check for VLAN ID and register it on ISM device just for CLC handshake */
static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev, ini->vlan_id))
return SMC_CLC_DECL_ISMVLANERR;
return 0;
}
/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
* used, the VLAN ID will be registered again during the connection setup.
*/
static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev, ini->vlan_id))
return SMC_CLC_DECL_CNFERR;
return 0;
}
/* CLC handshake during connect */
static int smc_connect_clc(struct smc_sock *smc, int smc_type,
struct smc_clc_msg_accept_confirm *aclc,
/* do inband token exchange */
rc = smc_clc_send_proposal(smc, smc_type, ini);
if (rc)
return rc;
/* receive SMC Accept CLC message */
return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT,
CLC_WAIT_TIME);
}
/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *aclc,
{
struct smc_link *link;
int reason_code = 0;
ini->is_smcd = false;
ini->ib_lcl = &aclc->lcl;
ini->ib_clcqpn = ntoh24(aclc->qpn);
ini->srv_first_contact = aclc->hdr.flag;
mutex_lock(&smc_client_lgr_pending);
reason_code = smc_conn_create(smc, ini);
if (reason_code) {
mutex_unlock(&smc_client_lgr_pending);
return reason_code;
}
link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
smc_conn_save_peer_info(smc, aclc);
/* create send buffer and rmb */
if (smc_buf_create(smc, false))
return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
ini->cln_first_contact);
if (ini->cln_first_contact == SMC_FIRST_CONTACT)
smc_link_save_peer_info(link, aclc);
if (smc_rmb_rtoken_handling(&smc->conn, aclc))
return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
smc_close_init(smc);
smc_rx_init(smc);
if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
if (smc_ib_ready_link(link))
return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
if (smc_reg_rmb(link, smc->conn.rmb_desc, true))
return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
smc_rmb_sync_sg_for_device(&smc->conn);
reason_code = smc_clc_send_confirm(smc);
if (reason_code)
return smc_connect_abort(smc, reason_code,
ini->cln_first_contact);
smc_tx_init(smc);
if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
reason_code = smc_clnt_conf_first_link(smc);
if (reason_code)
return smc_connect_abort(smc, reason_code,
mutex_unlock(&smc_client_lgr_pending);
smc_copy_sock_settings_to_clc(smc);
if (smc->sk.sk_state == SMC_INIT)
smc->sk.sk_state = SMC_ACTIVE;
/* setup for ISM connection of client */
static int smc_connect_ism(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *aclc,
ini->is_smcd = true;
ini->ism_gid = aclc->gid;
ini->srv_first_contact = aclc->hdr.flag;
/* there is only one lgr role for SMC-D; use server lock */
mutex_lock(&smc_server_lgr_pending);
rc = smc_conn_create(smc, ini);
if (rc) {
mutex_unlock(&smc_server_lgr_pending);
/* Create send and receive buffers */
if (smc_buf_create(smc, true))
return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
ini->cln_first_contact);
smc_conn_save_peer_info(smc, aclc);
smc_close_init(smc);
smc_rx_init(smc);
smc_tx_init(smc);
rc = smc_clc_send_confirm(smc);
if (rc)
return smc_connect_abort(smc, rc, ini->cln_first_contact);
mutex_unlock(&smc_server_lgr_pending);
smc_copy_sock_settings_to_clc(smc);
if (smc->sk.sk_state == SMC_INIT)
smc->sk.sk_state = SMC_ACTIVE;
return 0;
}
/* perform steps before actually connecting */
static int __smc_connect(struct smc_sock *smc)
{
bool ism_supported = false, rdma_supported = false;
struct smc_clc_msg_accept_confirm aclc;
struct smc_init_info ini = {0};
sock_hold(&smc->sk); /* sock put in passive closing */
if (smc->use_fallback)
return smc_connect_fallback(smc, smc->fallback_rsn);
/* if peer has not signalled SMC-capability, fall back */
if (!tcp_sk(smc->clcsock->sk)->syn_smc)
return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
/* IPSec connections opt out of SMC-R optimizations */
if (using_ipsec(smc))
return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
if (smc_vlan_by_tcpsk(smc->clcsock, &ini))
return smc_connect_decline_fallback(smc,
SMC_CLC_DECL_GETVLANERR);
/* check if there is an ism device available */
if (!smc_find_ism_device(smc, &ini) &&
!smc_connect_ism_vlan_setup(smc, &ini)) {
/* ISM is supported for this connection */
ism_supported = true;
smc_type = SMC_TYPE_D;
}
/* check if there is a rdma device available */
if (!smc_find_rdma_device(smc, &ini)) {
/* RDMA is supported for this connection */
rdma_supported = true;
if (ism_supported)
smc_type = SMC_TYPE_B; /* both */
else
smc_type = SMC_TYPE_R; /* only RDMA */
}
/* if neither ISM nor RDMA are supported, fallback */
if (!rdma_supported && !ism_supported)
return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
/* perform CLC handshake */
rc = smc_connect_clc(smc, smc_type, &aclc, &ini);
smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
return smc_connect_decline_fallback(smc, rc);
/* depending on previous steps, connect using rdma or ism */
if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
rc = smc_connect_rdma(smc, &aclc, &ini);
else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
rc = smc_connect_ism(smc, &aclc, &ini);
smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
return smc_connect_decline_fallback(smc, rc);
smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
static void smc_connect_work(struct work_struct *work)
{
struct smc_sock *smc = container_of(work, struct smc_sock,
connect_work);
long timeo = smc->sk.sk_sndtimeo;
int rc = 0;
if (!timeo)
timeo = MAX_SCHEDULE_TIMEOUT;
lock_sock(smc->clcsock->sk);
if (smc->clcsock->sk->sk_err) {
smc->sk.sk_err = smc->clcsock->sk->sk_err;
} else if ((1 << smc->clcsock->sk->sk_state) &
(TCPF_SYN_SENT | TCP_SYN_RECV)) {
rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
if ((rc == -EPIPE) &&
((1 << smc->clcsock->sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
rc = 0;
release_sock(smc->clcsock->sk);
lock_sock(&smc->sk);
if (rc != 0 || smc->sk.sk_err) {
smc->sk.sk_state = SMC_CLOSED;
if (rc == -EPIPE || rc == -EAGAIN)
smc->sk.sk_err = EPIPE;
else if (signal_pending(current))
smc->sk.sk_err = -sock_intr_errno(timeo);
goto out;
}
rc = __smc_connect(smc);
if (rc < 0)
smc->sk.sk_err = -rc;
out:
if (!sock_flag(&smc->sk, SOCK_DEAD)) {
if (smc->sk.sk_err) {
smc->sk.sk_state_change(&smc->sk);
} else { /* allow polling before and after fallback decision */
smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
smc->sk.sk_write_space(&smc->sk);
}
}
static int smc_connect(struct socket *sock, struct sockaddr *addr,
int alen, int flags)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
int rc = -EINVAL;
smc = smc_sk(sk);
/* separate smc parameter checking to be safe */
if (alen < sizeof(addr->sa_family))
goto out_err;
if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
goto out_err;
lock_sock(sk);
switch (sk->sk_state) {
default:
goto out;
case SMC_ACTIVE:
rc = -EISCONN;
goto out;
case SMC_INIT:
rc = 0;
break;
}
smc_copy_sock_settings_to_clc(smc);
if (smc->connect_nonblock) {
rc = -EALREADY;
goto out;
}
rc = kernel_connect(smc->clcsock, addr, alen, flags);
if (rc && rc != -EINPROGRESS)
goto out;
if (schedule_work(&smc->connect_work))
smc->connect_nonblock = 1;
rc = -EINPROGRESS;
} else {
rc = __smc_connect(smc);
if (rc < 0)
goto out;
else
rc = 0; /* success cases including fallback */
}
out:
release_sock(sk);
out_err:
return rc;
}
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
struct socket *new_clcsock = NULL;
struct sock *lsk = &lsmc->sk;
new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
lsk->sk_err = ENOMEM;
goto out;
}
*new_smc = smc_sk(new_sk);
mutex_lock(&lsmc->clcsock_release_lock);
if (lsmc->clcsock)
rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
mutex_unlock(&lsmc->clcsock_release_lock);
if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
new_sk->sk_prot->unhash(new_sk);
if (new_clcsock)
sock_release(new_clcsock);
new_sk->sk_state = SMC_CLOSED;
sock_set_flag(new_sk, SOCK_DEAD);
sock_put(new_sk); /* final */
*new_smc = NULL;
goto out;
}
(*new_smc)->clcsock = new_clcsock;
out:
return rc;
}
/* add a just created sock to the accept queue of the listen sock as
* candidate for a following socket accept call from user space
*/
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
struct smc_sock *par = smc_sk(parent);
sock_hold(sk); /* sock_put in smc_accept_unlink () */
spin_lock(&par->accept_q_lock);
list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
spin_unlock(&par->accept_q_lock);
sk_acceptq_added(parent);
}
/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
struct smc_sock *par = smc_sk(sk)->listen_smc;
spin_lock(&par->accept_q_lock);
list_del_init(&smc_sk(sk)->accept_q);
spin_unlock(&par->accept_q_lock);
sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
sock_put(sk); /* sock_hold in smc_accept_enqueue */
}
/* remove a sock from the accept queue to bind it to a new socket created
* for a socket accept call from user space
*/
struct sock *smc_accept_dequeue(struct sock *parent,
struct socket *new_sock)
{
struct smc_sock *isk, *n;
struct sock *new_sk;
list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
new_sk = (struct sock *)isk;
smc_accept_unlink(new_sk);
if (new_sk->sk_state == SMC_CLOSED) {
new_sk->sk_prot->unhash(new_sk);
if (isk->clcsock) {
sock_release(isk->clcsock);
isk->clcsock = NULL;
}
sock_put(new_sk); /* final */
sock_graft(new_sk, new_sock);
if (isk->use_fallback) {
smc_sk(new_sk)->clcsock->file = new_sock->file;
isk->clcsock->file->private_data = isk->clcsock;
}
}
return new_sk;
}
return NULL;
}
/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
struct smc_sock *smc = smc_sk(sk);
lock_sock(sk);
if (!sk->sk_lingertime)
/* wait for peer closing */
sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
__smc_release(smc);
sock_put(sk); /* final sock_put */
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
struct net *net = sock_net(smc->clcsock->sk);
struct smc_link_group *lgr = smc->conn.lgr;
struct smc_link *link;
int rest;
int rc;
link = &lgr->lnk[SMC_SINGLE_LINK];
if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
/* send CONFIRM LINK request to client over the RoCE fabric */
rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
/* receive CONFIRM LINK response from client over the RoCE fabric */