cublas&lt;t&gt;gemvBatched()
cublasStatus_t cublasSgemvBatched(cublasHandle_t handle, cublasOperation_t trans,
int m, int n,
const float *alpha,
const float *Aarray[], int lda,
const float *xarray[], int incx,
const float *beta,
float *yarray[], int incy,
int batchCount)
cublasStatus_t cublasDgemvBatched(cublasHandle_t handle, cublasOperation_t trans,
int m, int n,
const double *alpha,
const double *Aarray[], int lda,
const double *xarray[], int incx,
const double *beta,
double *yarray[], int incy,
int batchCount)
cublasStatus_t cublasCgemvBatched(cublasHandle_t handle, cublasOperation_t trans,
int m, int n,
const cuComplex *alpha,
const cuComplex *Aarray[], int lda,
const cuComplex *xarray[], int incx,
const cuComplex *beta,
cuComplex *yarray[], int incy,
int batchCount)
cublasStatus_t cublasZgemvBatched(cublasHandle_t handle, cublasOperation_t trans,
int m, int n,
const cuDoubleComplex *alpha,
const cuDoubleComplex *Aarray[], int lda,
const cuDoubleComplex *xarray[], int incx,
const cuDoubleComplex *beta,
cuDoubleComplex *yarray[], int incy,
int batchCount)
cublasStatus_t cublasHSHgemvBatched(cublasHandle_t handle, cublasOperation_t trans,
int m, int n,
const float *alpha,
const __half *Aarray[], int lda,
const __half *xarray[], int incx,
const float *beta,
__half *yarray[], int incy,
int batchCount)
cublasStatus_t cublasHSSgemvBatched(cublasHandle_t handle, cublasOperation_t trans,
int m, int n,
const float *alpha,
const __half *Aarray[], int lda,
const __half *xarray[], int incx,
const float *beta,
float *yarray[], int incy,
int batchCount)
cublasStatus_t cublasTSTgemvBatched(cublasHandle_t handle, cublasOperation_t trans,
int m, int n,
const float *alpha,
const __nv_bfloat16 *Aarray[], int lda,
const __nv_bfloat16 *xarray[], int incx,
const float *beta,
__nv_bfloat16 *yarray[], int incy,
int batchCount)
cublasStatus_t cublasTSSgemvBatched(cublasHandle_t handle, cublasOperation_t trans,
int m, int n,
const float *alpha,
const __nv_bfloat16 *Aarray[], int lda,
const __nv_bfloat16 *xarray[], int incx,
const float *beta,
float *yarray[], int incy,
int batchCount)
此函数执行一批矩阵和向量的矩阵向量乘法。该批处理被认为是"统一的",即所有实例对于它们各自的 A 矩阵、x 向量和 y 向量具有相同的维度 (m, n)、前导维度 (lda)、增量 (incx, incy) 和转置操作 (trans)。输入矩阵和向量的地址,以及批处理每个实例的输出向量的地址,都是从调用者传递给函数的指针数组中读取的。
$$y[i] = \alpha \, op(A[i]) x[i] + \beta y[i], \quad \text{for } i \in [0, \text{batchCount} - 1]$$
其中 $\alpha$ 和 $\beta$ 是标量,A 是指向矩阵 A[i] 的指针数组,矩阵以列优先格式存储,维度为 m x n;x 和 y 是指向向量的指针数组。此外,对于矩阵 A[i],
$$op(A[i]) = \begin{cases} A[i] & \text{如果 } transa == \text{CUBLAS\_OP\_N} \\ A[i]^T & \text{如果 } transa == \text{CUBLAS\_OP\_T} \\ A[i]^H & \text{如果 } transa == \text{CUBLAS\_OP\_C} \end{cases}$$
注意:y[i] 向量不能重叠,也就是说,各个 gemv 操作必须是可独立计算的; 否则,会出现未定义的行为。
对于某些规模的问题,在不同的 CUDA 流中多次调用 cublas&lt;t&gt;gemv() 可能比使用此 API 更有利。
Param. | Memory | In/out | Meaning |
---|---|---|---|
handle | | input | Handle to the cuBLAS library context. |
trans | | input | Operation op(A[i]) that is non- or (conj.) transpose. |
m | | input | Number of rows of matrix A[i]. |
n | | input | Number of columns of matrix A[i]. |
alpha | host or device | input | Scalar used for multiplication. |
Aarray | device | input | Array of pointers to arrays, with each array of dim. lda x n with lda >= max(1,m). All pointers must meet certain alignment criteria. Please see below for details. |
lda | | input | Leading dimension of the two-dimensional array used to store each matrix A[i]. |
xarray | device | input | Array of pointers to arrays, each of dimension n if trans == CUBLAS_OP_N and m otherwise. All pointers must meet certain alignment criteria. Please see below for details. |
incx | | input | Stride between consecutive elements of each x[i]. |
beta | host or device | input | Scalar used for multiplication. If beta == 0, y does not have to be a valid input. |
yarray | device | in/out | Array of pointers to arrays. Each has dimension m if trans == CUBLAS_OP_N and n otherwise. Vectors y[i] should not overlap; otherwise, undefined behavior is expected. All pointers must meet certain alignment criteria. Please see below for details. |
incy | | input | Stride between consecutive elements of each y[i]. |
batchCount | | input | Number of pointers contained in Aarray, xarray and yarray. |
如果在使用 cublasSgemvBatched() 时通过数学模式启用了快速数学(fast math),则放置在 GPU 内存中的指针(而非指针数组本身)必须正确对齐,以避免未对齐的内存访问错误。理想情况下,所有指针都应至少对齐到 16 字节;否则,建议它们满足以下规则:
该函数可能返回的错误值及其含义如下表所示:
ErrorValue | Meaning |
---|---|
CUBLAS_STATUS_SUCCESS | 操作成功完成 |
CUBLAS_STATUS_NOT_INITIALIZED | 库未初始化 |
CUBLAS_STATUS_INVALID_VALUE | 参数 m, n, batchCount < 0,或 incx, incy = 0 |
CUBLAS_STATUS_EXECUTION_FAILED | 该功能无法在 GPU 上启动 |