/*
* Software iWARP device driver for Linux
*
* Authors: Bernard Metzler <bmt@zurich.ibm.com>
* Fredy Neeser <nfd@zurich.ibm.com>
*
* Copyright (c) 2008-2010, IBM Corporation
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* - Neither the name of IBM nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
#include <linux/scatterlist.h>
#include <linux/highmem.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/tcp.h>
#include <rdma/iw_cm.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_smi.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/ib_umem.h>
#include "siw.h"
#include "siw_obj.h"
#include "siw_cm.h"
/*
* ----------------------------
* DDP reassembly for Softiwarp
* ----------------------------
* For the ordering of transmitted DDP segments, the relevant iWARP ordering
* rules are as follows:
*
* - RDMAP (RFC 5040): Section 7.5, Rule 17:
* "RDMA Read Response Message processing at the Remote Peer (reading
* the specified Tagged Buffer) MUST be started only after the RDMA
* Read Request Message has been Delivered by the DDP layer (thus,
* all previous RDMA Messages have been properly submitted for
* ordered Placement)."
*
* - DDP (RFC 5041): Section 5.3:
* "At the Data Source, DDP:
* o MUST transmit DDP Messages in the order they were submitted to
* the DDP layer,
* o SHOULD transmit DDP Segments within a DDP Message in increasing
* MO order for Untagged DDP Messages, and in increasing TO order
* for Tagged DDP Messages."
*
* Combining these rules implies that, although RDMAP does not provide
* ordering between operations that are generated from the two ends of an
* RDMAP stream, DDP *must not* transmit an RDMA Read Response Message before
* it has finished transmitting SQ operations that were already submitted
* to the DDP layer. It follows that an iWARP transmitter must fully
* serialize RDMAP messages belonging to the same QP.
*
* Given that a TCP socket receives DDP segments in peer transmit order,
* we obtain the following ordering of received DDP segments:
*
* (i) the received DDP segments of RDMAP messages for the same QP
* cannot be interleaved
* (ii) the received DDP segments of a single RDMAP message *should*
* arrive in order.
*
* The Softiwarp transmitter obeys rule #2 in DDP Section 5.3.
* With this property, the "should" becomes a "must" in (ii) above,
* which simplifies DDP reassembly considerably.
* The Softiwarp receiver currently relies on this property
* and reports an error if DDP segments of the same RDMAP message
* do not arrive in sequence.
*/
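/*
* siw_crc_rxhdr()
*
* Re-initialize the MPA CRC state and feed the iWARP header bytes
* received so far into it.
*/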
static inline int siw_crc_rxhdr(struct siw_iwarp_rx *ctx)
{
crypto_hash_init(&ctx->mpa_crc_hd);
return siw_crc_array(&ctx->mpa_crc_hd, (u8 *)&ctx->hdr,
ctx->fpdu_part_rcvd);
}
/*
* siw_rx_umem_init()
*
* Given memory region @mr and tagged offset @t_off within @mr,
* resolve the corresponding ib_umem_chunk pointer and update the
* receive context variables to point at the receive position.
* Returns 0 on success, a negative error code otherwise.
*
* NOTE: This function expects virtual addresses.
* TODO: Function needs generalization to support relative addressing
* aka "ZBVA".
*
* @rctx: Receive Context to be updated
* @mr: Memory Region
* @t_off: Offset within Memory Region
*
*/
static int siw_rx_umem_init(struct siw_iwarp_rx *rctx, struct siw_mr *mr,
u64 t_off)
{
struct ib_umem_chunk *chunk;
u64 off_mr; /* offset into MR */
int psge_idx; /* Index of PSGE */
off_mr = t_off - (mr->mem.va & PAGE_MASK);
/*
* Equivalent to
* off_mr = t_off - mr->mem.va;
* off_mr += mr->umem->offset;
*/
/* Skip pages not referenced by t_off */
psge_idx = off_mr >> PAGE_SHIFT;
list_for_each_entry(chunk, &mr->umem->chunk_list, list) {
if (psge_idx < chunk->nents)
break;
psge_idx -= chunk->nents;
}
if (psge_idx >= chunk->nents) {
dprint(DBG_MM|DBG_ON, "(QP%d): Short chunk list\n",
RX_QPID(rctx));
return -EINVAL;
}
rctx->pg_idx = psge_idx;
rctx->pg_off = off_mr & ~PAGE_MASK;
rctx->umem_chunk = chunk;
dprint(DBG_MM, "(QP%d): New chunk, idx %d\n", RX_QPID(rctx), psge_idx);
return 0;
}
/*
* siw_rx_umem()
*
* Receive data of @len into target referenced by @rctx.
* This function does not check whether the umem referenced by
* @rctx can hold @len bytes; the caller must have validated the
* target memory. If @umem_ends is set, the chunk position pointers
* are not advanced past the current receive position.
*
* @rctx: Receive Context
* @len: Number of bytes to place
* @umem_ends: 1 if the rctx chunk pointer must not be advanced past @len bytes.
*/
static int siw_rx_umem(struct siw_iwarp_rx *rctx, int len, int umem_ends)
{
struct scatterlist *p_list;
void *dest;
struct ib_umem_chunk *chunk = rctx->umem_chunk;
int pg_off = rctx->pg_off,
copied = 0,
bytes,
rv;
while (len) {
bytes = min(len, (int)PAGE_SIZE - pg_off);
p_list = &chunk->page_list[rctx->pg_idx];
dest = kmap_atomic(sg_page(p_list), KM_SOFTIRQ0);
rv = skb_copy_bits(rctx->skb, rctx->skb_offset, dest + pg_off,
bytes);
dprint(DBG_RX, "(QP%d): Page #%d, "
"bytes=%u, rv=%d returned by skb_copy_bits()\n",
RX_QPID(rctx), rctx->pg_idx, bytes, rv);
if (likely(!rv)) {
if (rctx->crc_enabled)
rv = siw_crc_sg(&rctx->mpa_crc_hd, p_list,
pg_off, bytes);
rctx->skb_offset += bytes;
copied += bytes;
len -= bytes;
pg_off += bytes;
}
kunmap_atomic(dest, KM_SOFTIRQ0);
if (unlikely(rv)) {
rctx->skb_copied += copied;
rctx->skb_new -= copied;
copied = -EFAULT;
dprint(DBG_RX|DBG_ON, "(QP%d): failed with %d\n",
RX_QPID(rctx), rv);
goto out;
}
if (pg_off == PAGE_SIZE) {
/*
* end of page
*/
pg_off = 0;
/*
* reference next page chunk if
* - all pages in chunk used AND
* - current loop fills more into this umem
* OR the next receive will go into this umem
* starting at the position where we are leaving
* the routine.
*/
if (++rctx->pg_idx == chunk->nents &&
(len > 0 || !umem_ends)) {
rctx->pg_idx = 0;
chunk = mem_chunk_next(chunk);
}
}
}
/*
* store chunk position for resume
*/
rctx->umem_chunk = chunk;
rctx->pg_off = pg_off;
rctx->skb_copied += copied;
rctx->skb_new -= copied;
out:
return copied;
}
/*
* siw_rresp_check_ntoh()
*
* Check incoming RRESP fragment header against expected
* header values and update expected values for potential next
* fragment.
*
* NOTE: This function must be called only if a RRESP DDP segment
* starts but not for fragmented consecutive pieces of an
* already started DDP segment.
*/
static inline int siw_rresp_check_ntoh(struct siw_iwarp_rx *rctx)
{
struct iwarp_rdma_rresp *rresp = &rctx->hdr.rresp;
struct siw_wqe *wqe = rctx->dest.wqe;
rresp->sink_stag = be32_to_cpu(rresp->sink_stag);
rresp->sink_to = be64_to_cpu(rresp->sink_to);
if (rctx->first_ddp_seg) {
rctx->ddp_stag = wqe->wr.rread.sge[0].lkey;
rctx->ddp_to = wqe->wr.rread.sge[0].addr;
}
if (rctx->ddp_stag != rresp->sink_stag) {
dprint(DBG_RX|DBG_ON,
" received STAG=%08x, expected STAG=%08x\n",
rresp->sink_stag, rctx->ddp_stag);
/*
* Verbs: RI_EVENT_QP_LLP_INTEGRITY_ERROR_BAD_FPDU
*/
return -EINVAL;
}
if (rctx->ddp_to != rresp->sink_to) {
dprint(DBG_RX|DBG_ON,
" received TO=%016llx, expected TO=%016llx\n",
(unsigned long long)rresp->sink_to,
(unsigned long long)rctx->ddp_to);
/*
* Verbs: RI_EVENT_QP_LLP_INTEGRITY_ERROR_BAD_FPDU
*/
return -EINVAL;
}
if (rctx->more_ddp_segs)
rctx->ddp_to += rctx->fpdu_part_rem;
else if (wqe->processed + rctx->fpdu_part_rem != wqe->bytes) {
dprint(DBG_RX|DBG_ON,
" RRESP length does not match RREQ, "
"peer sent=%d, expected %d\n",
wqe->processed + rctx->fpdu_part_rem, wqe->bytes);
return -EINVAL;
}
return 0;
}
/*
* siw_write_check_ntoh()
*
* Check incoming WRITE fragment header against expected
* header values and update expected values for potential next
* fragment
*
* NOTE: This function must be called only if a WRITE DDP segment
* starts but not for fragmented consecutive pieces of an
* already started DDP segment.
*/
static inline int siw_write_check_ntoh(struct siw_iwarp_rx *rctx)
{
struct iwarp_rdma_write *write = &rctx->hdr.rwrite;
write->sink_stag = be32_to_cpu(write->sink_stag);
write->sink_to = be64_to_cpu(write->sink_to);
if (rctx->first_ddp_seg) {
rctx->ddp_stag = write->sink_stag;
rctx->ddp_to = write->sink_to;
} else {
if (rctx->ddp_stag != write->sink_stag) {
dprint(DBG_RX|DBG_ON,
" received STAG=%08x, expected STAG=%08x\n",
write->sink_stag, rctx->ddp_stag);
/*
* Verbs: RI_EVENT_QP_LLP_INTEGRITY_ERROR_BAD_FPDU
*/
return -EINVAL;
}
if (rctx->ddp_to != write->sink_to) {
dprint(DBG_RX|DBG_ON,
" received TO=%016llx, expected TO=%016llx\n",
(unsigned long long)write->sink_to,
(unsigned long long)rctx->ddp_to);
/*
* Verbs: RI_EVENT_QP_LLP_INTEGRITY_ERROR_BAD_FPDU
*/
return -EINVAL;
}
}
/*
* Update expected target offset for next incoming DDP segment
*/
if (rctx->more_ddp_segs != 0)
rctx->ddp_to += rctx->fpdu_part_rem;
return 0;
}
/*
* siw_send_check_ntoh()
*
* Check incoming SEND fragment header against expected
* header values and update expected MSN if no next
* fragment expected
*
* NOTE: This function must be called only if a SEND DDP segment
* starts but not for fragmented consecutive pieces of an
* already started DDP segment.
*/
static inline int siw_send_check_ntoh(struct siw_iwarp_rx *rctx)
{
struct iwarp_send *send = &rctx->hdr.send;
struct siw_wqe *wqe = rctx->dest.wqe;
send->ddp_msn = be32_to_cpu(send->ddp_msn);
send->ddp_mo = be32_to_cpu(send->ddp_mo);
send->ddp_qn = be32_to_cpu(send->ddp_qn);
if (send->ddp_qn != RDMAP_UNTAGGED_QN_SEND) {
dprint(DBG_RX|DBG_ON, " Invalid DDP QN %d for SEND\n",
send->ddp_qn);
return -EINVAL;
}
if (send->ddp_msn != rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]) {
dprint(DBG_RX|DBG_ON, " received MSN=%d, expected MSN=%d\n",
send->ddp_msn, rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
/*
* TODO: Error handling
* async_event= RI_EVENT_QP_RQ_PROTECTION_ERROR_MSN_GAP;
* cmpl_status= RI_WC_STATUS_LOCAL_QP_CATASTROPHIC;
*/
return -EINVAL;
}
if (send->ddp_mo != wqe->processed) {
dprint(DBG_RX|DBG_ON, " Received MO=%u, expected MO=%u\n",
send->ddp_mo, wqe->processed);
/*
* Verbs: RI_EVENT_QP_LLP_INTEGRITY_ERROR_BAD_FPDU
*/
return -EINVAL;
}
if (rctx->first_ddp_seg) {
/* initialize user memory write position */
rctx->sge_idx = 0;
rctx->sge_off = 0;
}
if (wqe->bytes < wqe->processed + rctx->fpdu_part_rem) {
dprint(DBG_RX|DBG_ON, " Receive space short: %d < %d\n",
wqe->bytes - wqe->processed, rctx->fpdu_part_rem);
wqe->wc_status = IB_WC_LOC_LEN_ERR;
return -EINVAL;
}
return 0;
}
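/*
* siw_get_rqe()
*
* Fetch the next receive WQE, either from the QP's receive queue or,
* if a SRQ is attached, from the SRQ. Returns NULL if no receive
* buffer is currently posted.
*/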
static inline struct siw_wqe *siw_get_rqe(struct siw_qp *qp)
{
struct siw_wqe *wqe = NULL;
if (!qp->srq) {
lock_rq(qp);
if (!list_empty(&qp->rq)) {
wqe = list_first_wqe(&qp->rq);
list_del_init(&wqe->list);
unlock_rq(qp);
} else {
unlock_rq(qp);
dprint(DBG_RX, " QP(%d): RQ empty!\n", QP_ID(qp));
}
} else {
wqe = siw_srq_fetch_wqe(qp);
if (!wqe)
dprint(DBG_RX, " QP(%d): SRQ empty!\n", QP_ID(qp));
}
return wqe;
}
/*
* siw_proc_send:
*
* Process one incoming SEND and place data into memory referenced by
* receive wqe.
*
* Function supports partially received sends (suspending/resuming
* current receive wqe processing)
*
* return value:
* 0: reached the end of a DDP segment
* -EAGAIN: to be called again to finish the DDP segment
*/
int siw_proc_send(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
{
struct siw_wqe *wqe;
struct siw_sge *sge;
struct siw_mr *mr;
u32 data_bytes, /* all data bytes available */
rcvd_bytes; /* sum of data bytes rcvd */
int rv = 0;
if (rctx->first_ddp_seg) {
WARN_ON(rx_wqe(qp) != NULL);
wqe = siw_get_rqe(qp);
if (!wqe)
return -ENOENT;
rx_wqe(qp) = wqe;
wqe->wr_status = SR_WR_INPROGRESS;
} else {
wqe = rx_wqe(qp);
if (!wqe) {
/*
* this is a siw bug!
*/
dprint(DBG_ON, "QP(%d): RQ failure\n", QP_ID(qp));
return -EPROTO;
}
}
if (rctx->state == SIW_GET_DATA_START) {
rv = siw_send_check_ntoh(rctx);
if (rv) {
siw_async_ev(qp, NULL, IB_EVENT_QP_FATAL);
return rv;
}
if (!rctx->fpdu_part_rem) /* zero length SEND */
return 0;
}
data_bytes = min(rctx->fpdu_part_rem, rctx->skb_new);
rcvd_bytes = 0;
while (data_bytes) {
struct siw_pd *pd;
u32 sge_bytes; /* data bytes avail for SGE */
int umem_ends; /* 1 if umem ends with current rcv */
sge = &wqe->wr.sgl.sge[rctx->sge_idx];
if (!sge->len) {
/* just skip empty sge's */
rctx->sge_idx++;
rctx->sge_off = 0;
continue;
}
sge_bytes = min(data_bytes, sge->len - rctx->sge_off);
/*
* check with QP's PD if no SRQ present, SRQ's PD otherwise
*/
pd = qp->srq == NULL ? qp->pd : qp->srq->pd;
rv = siw_check_sge(pd, sge, SR_MEM_LWRITE, rctx->sge_off,
sge_bytes);
if (rv) {
siw_async_ev(qp, NULL, IB_EVENT_QP_ACCESS_ERR);
break;
}
mr = siw_mem2mr(sge->mem.obj);
if (rctx->sge_off == 0) {
/*
* started a new sge: update receive pointers
*/
rv = siw_rx_umem_init(rctx, mr, sge->addr);
if (rv)
break;
}
/*
* Are we going to finish placing
* - the last fragment of the current SGE or
* - the last DDP segment (L=1) of the current RDMAP message?
*
* siw_rx_umem() must advance the umem page chunk position
* after a successful receive only if receiving into the current
* umem does not end here. The umem ends if:
* - current SGE gets completely filled, OR
* - current MPA FPDU is last AND gets consumed now
*/
umem_ends = ((sge_bytes + rctx->sge_off == sge->len) ||
(!rctx->more_ddp_segs &&
rctx->fpdu_part_rcvd + sge_bytes ==
rctx->fpdu_part_rem)) ? 1 : 0;
rv = siw_rx_umem(rctx, sge_bytes, umem_ends);
if (rv != sge_bytes) {
/*
* siw_rx_umem() must have updated
* skb_new and skb_copied
*/
wqe->processed += rcvd_bytes;
return -EINVAL;
}
rctx->sge_off += rv;
if (rctx->sge_off == sge->len) {
rctx->sge_idx++;
rctx->sge_off = 0;
}
data_bytes -= rv;
rcvd_bytes += rv;
rctx->fpdu_part_rem -= rv;
rctx->fpdu_part_rcvd += rv;
}
wqe->processed += rcvd_bytes;
if (!rctx->fpdu_part_rem)
return 0;
return (rv < 0) ? rv : -EAGAIN;
}
/*
* siw_proc_write:
*
* Place incoming WRITE after referencing and checking target buffer
* Function supports partially received WRITEs (suspending/resuming
* current receive processing)
*
* return value:
* 0: reached the end of a DDP segment
* -EAGAIN: to be called again to finish the DDP segment
*/
int siw_proc_write(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
{
struct siw_dev *dev = qp->hdr.dev;
struct iwarp_rdma_write *write = &rctx->hdr.rwrite;
struct siw_mem *mem;
int bytes,
last_write,
rv;
if (rctx->state == SIW_GET_DATA_START) {
if (!rctx->fpdu_part_rem) /* zero length WRITE */
return 0;
rv = siw_write_check_ntoh(rctx);
if (rv) {
siw_async_ev(qp, NULL, IB_EVENT_QP_FATAL);
return rv;
}
}
bytes = min(rctx->fpdu_part_rem, rctx->skb_new);
/*
* NOTE: bytes is always > 0, since this routine is only
* called when payload data is available.
*/
if (rctx->first_ddp_seg) {
/* DEBUG Code, to be removed */
if (rx_mem(qp) != 0) {
dprint(DBG_RX|DBG_ON, "(QP%d): Stale rctx state!\n",
QP_ID(qp));
return -EFAULT;
}
rx_mem(qp) = siw_mem_id2obj(dev, rctx->ddp_stag >> 8);
}
if (rx_mem(qp) == NULL) {
dprint(DBG_RX|DBG_ON, "(QP%d): "
"Sink STag not found or invalid, STag=0x%08x\n",
QP_ID(qp), rctx->ddp_stag);
return -EINVAL;
}
mem = rx_mem(qp);
/*
* Rtag not checked against mem's tag again because
* hdr check guarantees same tag as before if fragmented
*/
rv = siw_check_mem(qp->pd, mem, write->sink_to + rctx->fpdu_part_rcvd,
SR_MEM_RWRITE, bytes);
if (rv) {
siw_async_ev(qp, NULL, IB_EVENT_QP_ACCESS_ERR);
return rv;
}
if (rctx->first_ddp_seg) {
rv = siw_rx_umem_init(rctx, siw_mem2mr(mem), write->sink_to);
if (rv)
return -EINVAL;
} else if (!rctx->umem_chunk) {
/*
* This should never happen.
*
* TODO: Remove tentative debug aid.
*/
dprint(DBG_RX|DBG_ON, "(QP%d): "
"Umem chunk not resolved!\n", QP_ID(qp));
return -EINVAL;
}
/*
* Are we going to place the last piece of the last
* DDP segment of the current RDMAP message?
*
* It is last if:
* - the remaining payload of this DDP segment fits into the skb
*   (rctx->fpdu_part_rem <= rctx->skb_new) AND
* - no more DDP segments of this RDMAP message follow (L bit set)
*/
last_write = ((rctx->fpdu_part_rem <= rctx->skb_new) &&
!rctx->more_ddp_segs) ? 1 : 0;
rv = siw_rx_umem(rctx, bytes, last_write);
if (rv != bytes)
return -EINVAL;
rctx->fpdu_part_rem -= rv;
rctx->fpdu_part_rcvd += rv;
if (!rctx->fpdu_part_rem)
return 0;
return (rv < 0) ? rv : -EAGAIN;
}
/*
* inbound RREQ's cannot carry user data.
*/
int siw_proc_rreq(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
{
if (!rctx->fpdu_part_rem)
return 0;
dprint(DBG_ON|DBG_RX, "(QP%d): RREQ with MPA len %d\n", QP_ID(qp),
rctx->hdr.ctrl.mpa_len);
return -EPROTO;
}
/*
* siw_init_rresp:
*
* Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
* Put it at the tail of the IRQ, if there is another WQE currently in
* transmit processing. If not, make it the current WQE to be processed
* and schedule transmit processing.
*
* Can be called from softirq context and from process
* context (RREAD socket loopback case!)
*
* return value:
* 0: success,
* failure code otherwise
*/
int siw_init_rresp(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
{
struct siw_wqe *rsp;
rsp = siw_wqe_get(qp, SIW_WR_RDMA_READ_RESP);
if (rsp) {
rsp->wr.rresp.sge.len = be32_to_cpu(rctx->hdr.rreq.read_size);
rsp->bytes = rsp->wr.rresp.sge.len; /* redundant */
rsp->processed = 0;
rsp->wr.rresp.sge.addr = be64_to_cpu(rctx->hdr.rreq.source_to);
rsp->wr.rresp.num_sge = rsp->bytes ? 1 : 0;
rsp->wr.rresp.sge.mem.obj = NULL; /* defer lookup */
rsp->wr.rresp.sge.lkey =
be32_to_cpu(rctx->hdr.rreq.source_stag);
rsp->wr.rresp.raddr = be64_to_cpu(rctx->hdr.rreq.sink_to);
rsp->wr.rresp.rtag = rctx->hdr.rreq.sink_stag; /* NBO */
} else {
dprint(DBG_RX|DBG_ON, "(QP%d): IRD exceeded!\n", QP_ID(qp));
return -EPROTO;
}
rsp->wr_status = SR_WR_QUEUED;
/*
* Insert into IRQ
*
* TODO: Revisit ordering of genuine SQ WRs and Read Response
* pseudo-WRs. RDMAP specifies that there is no ordering among
* the two directions of transmission, so there is a degree of
* freedom.
*
* The current logic favours Read Responses over SQ work requests
* that are queued but not already in progress.
*/
lock_sq(qp);
if (!tx_wqe(qp)) {
tx_wqe(qp) = rsp;
unlock_sq(qp);
/*
* schedule TX work, even if the SQ was suspended due to the
* ORD limit: it is always OK (and may even prevent the peer
* application from blocking) to send RRESPONSEs
*/
siw_sq_queue_work(qp);
} else {
list_add_tail(&rsp->list, &qp->irq);
unlock_sq(qp);
}
return 0;
}
/*
* siw_proc_rresp:
*
* Place incoming RRESP data into memory referenced by RREQ WQE.
*
* Function supports partially received RRESP's (suspending/resuming
* current receive processing)
*/
int siw_proc_rresp(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
{
struct siw_wqe *wqe;
struct siw_mr *mr;
struct siw_sge *sge;
int bytes,
is_last,
rv;
if (rctx->first_ddp_seg) {
WARN_ON(rx_wqe(qp) != NULL);
/*
* fetch pending RREQ from orq
*/
lock_orq(qp);
if (!list_empty(&qp->orq)) {
wqe = list_first_entry(&qp->orq, struct siw_wqe, list);
list_del_init(&wqe->list);
} else {
unlock_orq(qp);
dprint(DBG_RX|DBG_ON, "(QP%d): ORQ empty\n",
QP_ID(qp));
/*
* TODO: Should generate an async error
*/
rv = -ENODATA; /* or -ENOENT ? */
goto done;
}
unlock_orq(qp);
rx_wqe(qp) = wqe;
if (wr_type(wqe) != SIW_WR_RDMA_READ_REQ || wqe->processed) {
WARN_ON(wqe->processed);
WARN_ON(wr_type(wqe) != SIW_WR_RDMA_READ_REQ);
rv = -EINVAL;
goto done;
}
wqe->wr_status = SR_WR_INPROGRESS;
rv = siw_rresp_check_ntoh(rctx);
if (rv) {
siw_async_ev(qp, NULL, IB_EVENT_QP_FATAL);
goto done;
}
} else {
wqe = rx_wqe(qp);
if (!wqe) {
WARN_ON(1);
rv = -ENODATA;
goto done;
}
}
if (!rctx->fpdu_part_rem) /* zero length RRESPONSE */
return 0;
bytes = min(rctx->fpdu_part_rem, rctx->skb_new);
sge = wqe->wr.rread.sge; /* there is only one */
/*
* check target memory which resolves memory on first fragment
*/
rv = siw_check_sge(qp->pd, sge, SR_MEM_LWRITE, wqe->processed, bytes);
if (rv) {
dprint(DBG_RX|DBG_ON, "(QP%d): siw_check_sge failed: %d\n",
QP_ID(qp), rv);
wqe->wc_status = IB_WC_LOC_PROT_ERR;
siw_async_ev(qp, NULL, IB_EVENT_QP_ACCESS_ERR);
goto done;
}
mr = siw_mem2mr(sge->mem.obj);
if (rctx->first_ddp_seg) {
rv = siw_rx_umem_init(rctx, mr, sge->addr);
if (rv) {
wqe->wc_status = IB_WC_LOC_PROT_ERR;
goto done;
}
} else if (!rctx->umem_chunk) {
/*
* This should never happen.
*
* TODO: Remove tentative debug aid.
*/
dprint(DBG_RX|DBG_ON, "(QP%d): No target mem!\n", QP_ID(qp));
wqe->wc_status = IB_WC_GENERAL_ERR;
rv = -EPROTO;
goto done;
}
/*
* Are we going to finish placing the last DDP segment (L=1)
* of the current RDMAP message?
*
* NOTE: siw_rresp_check_ntoh() guarantees that the
* last inbound RDMAP Read Response message exactly matches
* with the RREQ WR.
*/
is_last = (bytes + wqe->processed == wqe->bytes) ? 1 : 0;
rv = siw_rx_umem(rctx, bytes, is_last);
if (rv != bytes) {
wqe->wc_status = IB_WC_GENERAL_ERR;
rv = -EINVAL;
goto done;
}
rctx->fpdu_part_rem -= rv;
rctx->fpdu_part_rcvd += rv;
wqe->processed += rv;
if (!rctx->fpdu_part_rem)
return 0;
done:
return (rv < 0) ? rv : -EAGAIN;
}
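/*
* siw_drain_pkt()
*
* Discard the remaining payload of the current DDP segment from the
* socket buffer while keeping the receive context's skb accounting
* consistent. Used for TERMINATE and unsupported opcodes.
*/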
static void siw_drain_pkt(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
{
char buf[4096];
int len;
dprint(DBG_ON|DBG_RX, " (QP%d): drain %d bytes\n",
QP_ID(qp), rctx->fpdu_part_rem);
while (rctx->fpdu_part_rem) {
len = min(rctx->fpdu_part_rem, 4096);
skb_copy_bits(rctx->skb, rctx->skb_offset, buf, len);
rctx->skb_copied += len;
rctx->skb_offset += len;
rctx->skb_new -= len;
rctx->fpdu_part_rem -= len;
}
}
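/*
* Drain the payload of a DDP segment carrying an unexpected or
* unsupported opcode.
*/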
int siw_proc_unsupp(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
{
WARN_ON(1);
siw_drain_pkt(qp, rctx);
return 0;
}
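/*
* Report an inbound TERMINATE message and drain its payload.
* The terminate payload is currently not interpreted beyond the
* control fields.
*/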
int siw_proc_terminate(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
{
struct iwarp_terminate *term = &rctx->hdr.terminate;
printk(KERN_INFO "(QP%d): RX Terminate: etype=%d, layer=%d, ecode=%d\n",
QP_ID(qp), term->term_ctrl.etype, term->term_ctrl.layer,
term->term_ctrl.ecode);
siw_drain_pkt(qp, rctx);
return 0;
}
static int siw_get_trailer(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
{
struct sk_buff *skb = rctx->skb;
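/*
* Collect pad bytes (if any) directly in front of the CRC field so
* that the padding can be fed into the CRC check once the trailer
* is complete.
*/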
u8 *tbuf = (u8 *)&rctx->trailer.crc - rctx->pad;
int avail;
avail = min(rctx->skb_new, rctx->fpdu_part_rem);
skb_copy_bits(skb, rctx->skb_offset,
tbuf + rctx->fpdu_part_rcvd, avail);
rctx->fpdu_part_rcvd += avail;
rctx->fpdu_part_rem -= avail;
rctx->skb_new -= avail;
rctx->skb_offset += avail;
rctx->skb_copied += avail;
dprint(DBG_RX, " (QP%d): %d remaining (%d)\n", QP_ID(qp),
rctx->fpdu_part_rem, avail);
if (!rctx->fpdu_part_rem) {
u32 crc_in, crc_own = 0;
/*
* check crc if required
*/
if (!rctx->crc_enabled)
return 0;
if (rctx->pad && siw_crc_array(&rctx->mpa_crc_hd,
tbuf, rctx->pad) != 0)
return -EINVAL;
crypto_hash_final(&rctx->mpa_crc_hd, (u8 *)&crc_own);
/*
* CRC32 is computed, transmitted and received directly in NBO,
* so there's never a reason to convert byte order.
*/
crc_in = rctx->trailer.crc;
if (crc_in != crc_own) {
dprint(DBG_RX|DBG_ON,
" (QP%d): CRC ERROR in:=%08x, own=%08x\n",
QP_ID(qp), crc_in, crc_own);
return -EINVAL;
}
return 0;
}
return -EAGAIN;
}
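/*
* siw_get_hdr()
*
* Assemble the iWARP header of the current FPDU. The fixed part
* (struct iwarp_ctrl) is copied first to learn and validate opcode
* and protocol versions; the opcode then determines how many more
* header bytes must be collected.
*
* Returns 0 once the complete header is received, -EAGAIN if more
* data is needed, or a negative error code on a malformed header.
*/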
static int siw_get_hdr(struct siw_iwarp_rx *rctx)
{
struct sk_buff *skb = rctx->skb;
struct iwarp_ctrl *c_hdr = &rctx->hdr.ctrl;
int bytes;
if (rctx->fpdu_part_rcvd < sizeof(struct iwarp_ctrl)) {
/*
* copy first fix part of iwarp hdr
*/
bytes = min_t(int, rctx->skb_new,
sizeof(struct iwarp_ctrl) - rctx->fpdu_part_rcvd);
skb_copy_bits(skb, rctx->skb_offset,
(char *)c_hdr + rctx->fpdu_part_rcvd, bytes);
rctx->fpdu_part_rcvd += bytes;
rctx->skb_new -= bytes;
rctx->skb_offset += bytes;
rctx->skb_copied += bytes;
if (!rctx->skb_new ||
rctx->fpdu_part_rcvd < sizeof(struct iwarp_ctrl)) {
return -EAGAIN;
}
if (c_hdr->opcode > RDMAP_TERMINATE) {
dprint(DBG_RX|DBG_ON, " opcode %d\n", c_hdr->opcode);
return -EINVAL;
}
if (c_hdr->dv != DDP_VERSION) {
dprint(DBG_RX|DBG_ON, " dversion %d\n", c_hdr->dv);
return -EINVAL;
}
if (c_hdr->rv != RDMAP_VERSION) {
dprint(DBG_RX|DBG_ON, " rversion %d\n", c_hdr->rv);
return -EINVAL;
}
dprint(DBG_RX, "(QP%d): New Header, opcode:%d\n",
RX_QPID(rctx), c_hdr->opcode);
}
/*
* figure out len of current hdr: variable length of
* iwarp hdr forces us to copy hdr information
*/
bytes = min(rctx->skb_new,
iwarp_pktinfo[c_hdr->opcode].hdr_len - rctx->fpdu_part_rcvd);
skb_copy_bits(skb, rctx->skb_offset,
(char *)c_hdr + rctx->fpdu_part_rcvd, bytes);
rctx->fpdu_part_rcvd += bytes;
rctx->skb_new -= bytes;
rctx->skb_offset += bytes;
rctx->skb_copied += bytes;
if (rctx->fpdu_part_rcvd == iwarp_pktinfo[c_hdr->opcode].hdr_len) {
/*
* HDR receive completed. Check if the current DDP segment
* starts a new RDMAP message or continues a previously
* started RDMAP message.
*
* Note well from the comments on DDP reassembly:
* - Support for unordered reception of DDP segments
* (or FPDUs) from different RDMAP messages is not needed.
* - Unordered reception of DDP segments of the same
* RDMAP message is not supported. It is probably not
* needed with most peers.
*/
siw_dprint_hdr(&rctx->hdr, RX_QPID(rctx), "HDR received");
if (rctx->more_ddp_segs != 0) {
rctx->first_ddp_seg = 0;
if (rctx->prev_ddp_opcode != c_hdr->opcode) {
dprint(DBG_ON,
"packet intersection: %d <> %d\n",
rctx->prev_ddp_opcode, c_hdr->opcode);
return -EPROTO;
}
} else {
rctx->prev_ddp_opcode = c_hdr->opcode;
rctx->first_ddp_seg = 1;
}
rctx->more_ddp_segs = (c_hdr->l == 0) ? 1 : 0;
return 0;
}
return -EAGAIN;
}
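/*
* Remaining DDP payload of the current FPDU: the MPA length field
* counts the ULPDU (DDP segment) but not the 2-byte MPA header itself,
* while fpdu_part_rcvd holds the header bytes already consumed from
* the start of the FPDU.
*/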
static inline int siw_fpdu_payload_len(struct siw_iwarp_rx *rctx)
{
return ((int)(rctx->hdr.ctrl.mpa_len) - rctx->fpdu_part_rcvd)
+ MPA_HDR_SIZE;
}
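/*
* Trailer length of the current FPDU: 0-3 pad bytes rounding the FPDU
* up to a 4-byte multiple, followed by the 4-byte MPA CRC.
*/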
static inline int siw_fpdu_trailer_len(struct siw_iwarp_rx *rctx)
{
int mpa_len = (int)rctx->hdr.ctrl.mpa_len + MPA_HDR_SIZE;
return MPA_CRC_SIZE + (-mpa_len & 0x3);
}
/*
* siw_rreq_complete()
*
* Complete the current READ REQUEST after READ RESPONSE processing.
* It may complete consecutive WQE's which were already SQ
* processed before but are awaiting completion due to completion
* ordering (see verbs 8.2.2.2).
* The READ RESPONSE may also resume SQ processing if it was stalled
* due to ORD exhaustion (see verbs 8.2.2.18).
* Completion stops at the next READ REQUEST or when the ORQ is empty.
*/
static void siw_rreq_complete(struct siw_wqe *wqe, int error)
{
struct siw_qp *qp = wqe->qp;
int num_wc = 1;
enum ib_send_flags flags;
LIST_HEAD(c_list);
flags = wr_flags(wqe);
if (flags & IB_SEND_SIGNALED)
list_add(&wqe->list, &c_list);
else {
atomic_inc(&qp->sq_space);
siw_wqe_put(wqe);
num_wc = 0;
}
lock_orq(qp);
/* More WQE's to complete following this RREQ? */
if (!list_empty(&qp->orq)) {
struct list_head *pos, *n;
list_for_each_safe(pos, n, &qp->orq) {
wqe = list_entry_wqe(pos);
if (wr_type(wqe) == SIW_WR_RDMA_READ_REQ)
break;
flags |= wr_flags(wqe);
num_wc++;
dprint(DBG_WR|DBG_ON,
"(QP%d): Resume completion, wr_type %d\n",
QP_ID(qp), wr_type(wqe));
list_move_tail(pos, &c_list);
}
}
unlock_orq(qp);
if (num_wc)
siw_sq_complete(&c_list, qp, num_wc, flags);
/*
* Check if SQ processing was stalled due to ORD limit
*/
if (ORD_SUSPEND_SQ(qp)) {
lock_sq(qp);
wqe = siw_next_tx_wqe(qp);
if (wqe && !tx_wqe(qp)) {
WARN_ON(wr_type(wqe) != SIW_WR_RDMA_READ_REQ);
list_del_init(&wqe->list);
tx_wqe(qp) = wqe;
list_add_tail(&wqe->list, &qp->orq);
unlock_sq(qp);
dprint(DBG_RX, "(QP%d): SQ resume (%d)\n",
QP_ID(qp), atomic_read(&qp->sq_space));
siw_sq_queue_work(qp);
} else {
/* only new ORQ space if not next RREQ queued */
atomic_inc(&qp->orq_space);
unlock_sq(qp);
}
} else
atomic_inc(&qp->orq_space);
}
/*
* siw_rdmap_complete()
*
* Complete processing of an RDMAP message after receiving all
* of its DDP segments:
*
* o SENDs + RRESPs need work completion,
* o RREQs need READ RESPONSE initialization,
* o WRITEs need memory dereferencing
*
* TODO: Could siw_[s,r]_complete() fail? (CQ full)
*/
static inline int siw_rdmap_complete(struct siw_qp *qp,
struct siw_iwarp_rx *rctx)
{
struct siw_wqe *wqe;
int rv = 0;
switch (rctx->hdr.ctrl.opcode) {
case RDMAP_SEND_SE:
wr_flags(rx_wqe(qp)) |= IB_SEND_SOLICITED;
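/* FALLTHROUGH: a solicited SEND completes like a plain SEND */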
case RDMAP_SEND:
rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
wqe = rx_wqe(qp);
wqe->wc_status = IB_WC_SUCCESS;
wqe->wr_status = SR_WR_DONE;
siw_rq_complete(wqe, qp);
break;
case RDMAP_RDMA_READ_RESP:
rctx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
wqe = rx_wqe(qp);
wqe->wc_status = IB_WC_SUCCESS;
wqe->wr_status = SR_WR_DONE;
siw_rreq_complete(wqe, 0);
break;
case RDMAP_RDMA_READ_REQ:
rv = siw_init_rresp(qp, rctx);
break;
case RDMAP_RDMA_WRITE:
/*
* Free References from memory object if
* attached to receive context (inbound WRITE)
* While a zero-length WRITE is allowed, the
* current implementation does not create
* a memory reference (it is unclear if memory
* rights should be checked in that case!).
*
* TODO: check zero length WRITE semantics
*/
if (rx_mem(qp))
siw_mem_put(rx_mem(qp));
break;
default:
break;
}
rctx->umem_chunk = NULL; /* DEBUG aid, tentatively */
rx_wqe(qp) = NULL; /* also clears MEM object for WRITE */
return rv;
}
/*
* siw_rdmap_error()
*
* Abort processing of RDMAP message after failure.
* SENDs + RRESPs need receive completion, if
* already started.
*
* TODO: WRITEs need a local error to be surfaced.
*
*/
static inline void
siw_rdmap_error(struct siw_qp *qp, struct siw_iwarp_rx *rctx, int status)
{
struct siw_wqe *wqe;
switch (rctx->hdr.ctrl.opcode) {
case RDMAP_SEND_SE:
case RDMAP_SEND:
rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
wqe = rx_wqe(qp);
if (!wqe)
return;
if (rctx->hdr.ctrl.opcode == RDMAP_SEND_SE)
wr_flags(wqe) |= IB_SEND_SOLICITED;
if (!wqe->wc_status)
wqe->wc_status = IB_WC_GENERAL_ERR;
wqe->wr_status = SR_WR_DONE;
siw_rq_complete(wqe, qp);
break;
case RDMAP_RDMA_READ_RESP:
/*
* A READ RESPONSE may flush consecutive WQE's
* which were SQ processed before
*/
rctx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
if (rctx->state == SIW_GET_HDR || status == -ENODATA)
/* eventual RREQ left untouched */
break;
wqe = rx_wqe(qp);
if (wqe) {
if (status)
wqe->wc_status = status;
else
wqe->wc_status = IB_WC_GENERAL_ERR;
wqe->wr_status = SR_WR_DONE;
/*
* All errors turn the wqe into signalled.
*/
wr_flags(wqe) |= IB_SEND_SIGNALED;
siw_rreq_complete(wqe, status);
}
break;
case RDMAP_RDMA_WRITE:
/*
* Free References from memory object if
* attached to receive context (inbound WRITE)
* While a zero-length WRITE is allowed, the
* current implementation does not create
* a memory reference (it is unclear if memory
* rights should be checked in that case!).
*
* TODO: check zero length WRITE semantics
*/
if (rx_mem(qp))
siw_mem_put(rx_mem(qp));
break;
default:
break;
}
rctx->umem_chunk = NULL; /* DEBUG aid, tentatively */
rx_wqe(qp) = NULL; /* also clears MEM object for WRITE */
}
/*
* siw_tcp_rx_data()
*
* Main routine to consume inbound TCP payload
*
* @rd_desc: read descriptor
* @skb: socket buffer
* @off: offset in skb
* @len: skb->len - offset : payload in skb
*/
int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
unsigned int off, size_t len)
{
struct siw_qp *qp = rd_desc->arg.data;
struct siw_iwarp_rx *rctx = &qp->rx_ctx;
int rv;
rctx->skb = skb;
rctx->skb_new = skb->len - off;
rctx->skb_offset = off;
rctx->skb_copied = 0;
dprint(DBG_RX, "(QP%d): new data %d, rx-state %d\n", QP_ID(qp),
rctx->skb_new, rctx->state);
if (unlikely(rctx->rx_suspend == 1 ||
qp->attrs.state != SIW_QP_STATE_RTS)) {
dprint(DBG_RX|DBG_ON, "(QP%d): failed. state rx:%d, qp:%d\n",
QP_ID(qp), qp->rx_ctx.state, qp->attrs.state);
return 0;
}
while (rctx->skb_new) {
switch (rctx->state) {
case SIW_GET_HDR:
rv = siw_get_hdr(rctx);
if (!rv) {
if (rctx->crc_enabled &&
siw_crc_rxhdr(rctx) != 0) {
rv = -EINVAL;
break;
}
rctx->hdr.ctrl.mpa_len =
ntohs(rctx->hdr.ctrl.mpa_len);
rctx->fpdu_part_rem =
siw_fpdu_payload_len(rctx);
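/*
* MPA pads each FPDU to a 4-byte boundary; remember the
* pad length so the trailer (pad + CRC) can be consumed
* and checked later.
*/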
if (rctx->fpdu_part_rem)
rctx->pad = -rctx->fpdu_part_rem & 0x3;
else
rctx->pad = 0;
rctx->state = SIW_GET_DATA_START;
rctx->fpdu_part_rcvd = 0;
}
break;
case SIW_GET_DATA_MORE:
/*
* Another data fragment of the same DDP segment.
* Headers will not be checked again by the
* opcode-specific data receive function below.
* Setting first_ddp_seg = 0 avoids repeating
* initializations that may occur only once per
* DDP segment.
*/
rctx->first_ddp_seg = 0;
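/* FALLTHROUGH: continue with data placement */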
case SIW_GET_DATA_START:
/*
* Headers will be checked by the opcode-specific
* data receive function below.
*/
rv = siw_rx_data(qp, rctx);
if (!rv) {
rctx->fpdu_part_rem =
siw_fpdu_trailer_len(rctx);
rctx->fpdu_part_rcvd = 0;
rctx->state = SIW_GET_TRAILER;
} else
rctx->state = SIW_GET_DATA_MORE;
break;
case SIW_GET_TRAILER:
/*
* read CRC + any padding
*/
rv = siw_get_trailer(qp, rctx);
if (!rv) {
/*
* FPDU completed.
* complete RDMAP message if last fragment
*/
rctx->state = SIW_GET_HDR;
rctx->fpdu_part_rcvd = 0;
if (!rctx->hdr.ctrl.l)
/* more frags */
break;
rv = siw_rdmap_complete(qp, rctx);
if (rv)
break;
}
break;
default:
WARN_ON(1);
rv = -EAGAIN;
}
if (unlikely(rv != 0 && rv != -EAGAIN)) {
/*
* TODO: implement graceful error handling including
* generation (and processing) of TERMINATE
* messages.
*
* For now we are left with a bogus RX status,
* unable to receive any further bytes.
* BUT: the code must distinguish between
*
* o protocol syntax errors (FATAL, framing lost)
* o CRC errors (FATAL, framing lost since we do not
*   trust the packet header (??))
* o local resource errors (maybe non fatal, framing
*   not lost)
*/
siw_rdmap_error(qp, rctx, rv);
dprint(DBG_RX|DBG_ON,
"(QP%d): RX ERROR %d at RX state %d\n",
QP_ID(qp), rv, rctx->state);
siw_dprint_rctx(rctx);
/*
* Calling siw_cm_queue_work() is safe without
* releasing qp->state_lock because the QP state
* will be transitioned to SIW_QP_STATE_ERROR
* by the siw_work_handler() workqueue handler
* after we return from siw_qp_llp_data_ready().
*/
siw_qp_cm_drop(qp, 1);
break;
}
if (rv) {
dprint(DBG_RX, "(QP%d): "
"Misaligned FPDU: State: %d, missing: %d\n",
QP_ID(qp), rctx->state, rctx->fpdu_part_rem);
break;
}
}
return rctx->skb_copied;
}