1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package android.renderscript;
18 
19 import android.annotation.IntDef;
20 import java.lang.annotation.Retention;
21 import java.lang.annotation.RetentionPolicy;
22 
23 /**
24  *
25  * ScriptIntrinsicBLAS class provides high performance RenderScript APIs to BLAS.
26  *
27  * The BLAS (Basic Linear Algebra Subprograms) are routines that provide standard
28  * building blocks for performing basic vector and matrix operations.
29  *
30  * For detailed description of BLAS, please refer to http://www.netlib.org/blas/
31  *
32  **/
33 public final class ScriptIntrinsicBLAS extends ScriptIntrinsic {
34     private Allocation mLUT;
35 
ScriptIntrinsicBLAS(long id, RenderScript rs)36     private ScriptIntrinsicBLAS(long id, RenderScript rs) {
37         super(id, rs);
38     }
39 
    // Routine selectors handed to the mRS.nScriptIntrinsicBLAS_* entry points as their second
    // argument (for example SGEMV passes RsBlas_sgemv). The names follow the BLAS routine names;
    // the leading s/d/c/z matches the F32/F64/F32_2/F64_2 element types validated by the
    // corresponding public methods.
    // NOTE(review): the numeric values presumably mirror an enum on the native side, and the
    // ranges appear to follow the usual BLAS Level 1 / Level 2 / Level 3 ordering -- confirm
    // before renumbering or inserting entries.
    private static final int RsBlas_sdsdot = 1;
    private static final int RsBlas_dsdot = 2;
    private static final int RsBlas_sdot = 3;
    private static final int RsBlas_ddot = 4;
    private static final int RsBlas_cdotu_sub = 5;
    private static final int RsBlas_cdotc_sub = 6;
    private static final int RsBlas_zdotu_sub = 7;
    private static final int RsBlas_zdotc_sub = 8;
    private static final int RsBlas_snrm2 = 9;
    private static final int RsBlas_sasum = 10;
    private static final int RsBlas_dnrm2 = 11;
    private static final int RsBlas_dasum = 12;
    private static final int RsBlas_scnrm2 = 13;
    private static final int RsBlas_scasum = 14;
    private static final int RsBlas_dznrm2 = 15;
    private static final int RsBlas_dzasum = 16;
    private static final int RsBlas_isamax = 17;
    private static final int RsBlas_idamax = 18;
    private static final int RsBlas_icamax = 19;
    private static final int RsBlas_izamax = 20;
    private static final int RsBlas_sswap = 21;
    private static final int RsBlas_scopy = 22;
    private static final int RsBlas_saxpy = 23;
    private static final int RsBlas_dswap = 24;
    private static final int RsBlas_dcopy = 25;
    private static final int RsBlas_daxpy = 26;
    private static final int RsBlas_cswap = 27;
    private static final int RsBlas_ccopy = 28;
    private static final int RsBlas_caxpy = 29;
    private static final int RsBlas_zswap = 30;
    private static final int RsBlas_zcopy = 31;
    private static final int RsBlas_zaxpy = 32;
    private static final int RsBlas_srotg = 33;
    private static final int RsBlas_srotmg = 34;
    private static final int RsBlas_srot = 35;
    private static final int RsBlas_srotm = 36;
    private static final int RsBlas_drotg = 37;
    private static final int RsBlas_drotmg = 38;
    private static final int RsBlas_drot = 39;
    private static final int RsBlas_drotm = 40;
    private static final int RsBlas_sscal = 41;
    private static final int RsBlas_dscal = 42;
    private static final int RsBlas_cscal = 43;
    private static final int RsBlas_zscal = 44;
    private static final int RsBlas_csscal = 45;
    private static final int RsBlas_zdscal = 46;
    private static final int RsBlas_sgemv = 47;
    private static final int RsBlas_sgbmv = 48;
    private static final int RsBlas_strmv = 49;
    private static final int RsBlas_stbmv = 50;
    private static final int RsBlas_stpmv = 51;
    private static final int RsBlas_strsv = 52;
    private static final int RsBlas_stbsv = 53;
    private static final int RsBlas_stpsv = 54;
    private static final int RsBlas_dgemv = 55;
    private static final int RsBlas_dgbmv = 56;
    private static final int RsBlas_dtrmv = 57;
    private static final int RsBlas_dtbmv = 58;
    private static final int RsBlas_dtpmv = 59;
    private static final int RsBlas_dtrsv = 60;
    private static final int RsBlas_dtbsv = 61;
    private static final int RsBlas_dtpsv = 62;
    private static final int RsBlas_cgemv = 63;
    private static final int RsBlas_cgbmv = 64;
    private static final int RsBlas_ctrmv = 65;
    private static final int RsBlas_ctbmv = 66;
    private static final int RsBlas_ctpmv = 67;
    private static final int RsBlas_ctrsv = 68;
    private static final int RsBlas_ctbsv = 69;
    private static final int RsBlas_ctpsv = 70;
    private static final int RsBlas_zgemv = 71;
    private static final int RsBlas_zgbmv = 72;
    private static final int RsBlas_ztrmv = 73;
    private static final int RsBlas_ztbmv = 74;
    private static final int RsBlas_ztpmv = 75;
    private static final int RsBlas_ztrsv = 76;
    private static final int RsBlas_ztbsv = 77;
    private static final int RsBlas_ztpsv = 78;
    private static final int RsBlas_ssymv = 79;
    private static final int RsBlas_ssbmv = 80;
    private static final int RsBlas_sspmv = 81;
    private static final int RsBlas_sger = 82;
    private static final int RsBlas_ssyr = 83;
    private static final int RsBlas_sspr = 84;
    private static final int RsBlas_ssyr2 = 85;
    private static final int RsBlas_sspr2 = 86;
    private static final int RsBlas_dsymv = 87;
    private static final int RsBlas_dsbmv = 88;
    private static final int RsBlas_dspmv = 89;
    private static final int RsBlas_dger = 90;
    private static final int RsBlas_dsyr = 91;
    private static final int RsBlas_dspr = 92;
    private static final int RsBlas_dsyr2 = 93;
    private static final int RsBlas_dspr2 = 94;
    private static final int RsBlas_chemv = 95;
    private static final int RsBlas_chbmv = 96;
    private static final int RsBlas_chpmv = 97;
    private static final int RsBlas_cgeru = 98;
    private static final int RsBlas_cgerc = 99;
    private static final int RsBlas_cher = 100;
    private static final int RsBlas_chpr = 101;
    private static final int RsBlas_cher2 = 102;
    private static final int RsBlas_chpr2 = 103;
    private static final int RsBlas_zhemv = 104;
    private static final int RsBlas_zhbmv = 105;
    private static final int RsBlas_zhpmv = 106;
    private static final int RsBlas_zgeru = 107;
    private static final int RsBlas_zgerc = 108;
    private static final int RsBlas_zher = 109;
    private static final int RsBlas_zhpr = 110;
    private static final int RsBlas_zher2 = 111;
    private static final int RsBlas_zhpr2 = 112;
    private static final int RsBlas_sgemm = 113;
    private static final int RsBlas_ssymm = 114;
    private static final int RsBlas_ssyrk = 115;
    private static final int RsBlas_ssyr2k = 116;
    private static final int RsBlas_strmm = 117;
    private static final int RsBlas_strsm = 118;
    private static final int RsBlas_dgemm = 119;
    private static final int RsBlas_dsymm = 120;
    private static final int RsBlas_dsyrk = 121;
    private static final int RsBlas_dsyr2k = 122;
    private static final int RsBlas_dtrmm = 123;
    private static final int RsBlas_dtrsm = 124;
    private static final int RsBlas_cgemm = 125;
    private static final int RsBlas_csymm = 126;
    private static final int RsBlas_csyrk = 127;
    private static final int RsBlas_csyr2k = 128;
    private static final int RsBlas_ctrmm = 129;
    private static final int RsBlas_ctrsm = 130;
    private static final int RsBlas_zgemm = 131;
    private static final int RsBlas_zsymm = 132;
    private static final int RsBlas_zsyrk = 133;
    private static final int RsBlas_zsyr2k = 134;
    private static final int RsBlas_ztrmm = 135;
    private static final int RsBlas_ztrsm = 136;
    private static final int RsBlas_chemm = 137;
    private static final int RsBlas_cherk = 138;
    private static final int RsBlas_cher2k = 139;
    private static final int RsBlas_zhemm = 140;
    private static final int RsBlas_zherk = 141;
    private static final int RsBlas_zher2k = 142;

    // BLAS extensions start here
    // (numbered from 1000, leaving a gap after the selectors above)
    private static final int RsBlas_bnnm = 1000;
185 
186     /**
187      * Create an intrinsic to access BLAS subroutines.
188      *
189      * @param rs The RenderScript context
190      * @return ScriptIntrinsicBLAS
191      */
create(RenderScript rs)192     public static ScriptIntrinsicBLAS create(RenderScript rs) {
193         long id = rs.nScriptIntrinsicCreate(13, Element.U32(rs).getID(rs));
194         return new ScriptIntrinsicBLAS(id, rs);
195     }
196 
    /**
     * Marks an int that selects a transpose mode: one of {@link #NO_TRANSPOSE},
     * {@link #TRANSPOSE} or {@link #CONJ_TRANSPOSE}. Source-retention only, so it is a
     * compile-time hint and not enforced at runtime.
     *
     * @hide
     */
    @IntDef({NO_TRANSPOSE, TRANSPOSE, CONJ_TRANSPOSE})
    @Retention(RetentionPolicy.SOURCE)
    public @interface Transpose {}

    /**
     * Marks an int that selects the stored triangle: {@link #UPPER} or {@link #LOWER}.
     * Source-retention only.
     *
     * @hide
     */
    @IntDef({UPPER, LOWER})
    @Retention(RetentionPolicy.SOURCE)
    public @interface Uplo {}

    /**
     * Marks an int that selects the diagonal kind: {@link #NON_UNIT} or {@link #UNIT}.
     * Source-retention only.
     *
     * @hide
     */
    @IntDef({NON_UNIT, UNIT})
    @Retention(RetentionPolicy.SOURCE)
    public @interface Diag {}

    /**
     * Marks an int that selects a side: {@link #LEFT} or {@link #RIGHT}.
     * Source-retention only.
     *
     * @hide
     */
    @IntDef({LEFT, RIGHT})
    @Retention(RetentionPolicy.SOURCE)
    public @interface Side {}

    // NOTE(review): the numeric values below look like the CBLAS enum values and are passed
    // straight through to the native calls -- confirm before changing any of them.

    /** Transpose selector: use the matrix as given (no transpose). */
    public static final int NO_TRANSPOSE = 111;
    /** Transpose selector: use the transpose of the matrix. */
    public static final int TRANSPOSE = 112;
    /** Transpose selector: use the conjugate transpose of the matrix. */
    public static final int CONJ_TRANSPOSE = 113;

    /** Uplo selector: the matrix is upper triangular. */
    public static final int UPPER = 121;
    /** Uplo selector: the matrix is lower triangular. */
    public static final int LOWER = 122;

    /** Diag selector: the matrix is not unit triangular. */
    public static final int NON_UNIT = 131;
    /** Diag selector: the matrix is unit triangular. */
    public static final int UNIT = 132;

    // NOTE(review): the routines that consume LEFT/RIGHT are outside the visible part of this
    // file; presumably they select on which side a matrix operand is applied -- confirm.
    /** Side selector: left. */
    public static final int LEFT = 141;
    /** Side selector: right. */
    public static final int RIGHT = 142;
237 
validateSide(@ide int Side)238     static void validateSide(@Side int Side) {
239         if (Side != LEFT && Side != RIGHT) {
240             throw new RSRuntimeException("Invalid side passed to BLAS");
241         }
242     }
243 
validateTranspose(@ranspose int Trans)244     static void validateTranspose(@Transpose int Trans) {
245         if (Trans != NO_TRANSPOSE && Trans != TRANSPOSE &&
246             Trans != CONJ_TRANSPOSE) {
247             throw new RSRuntimeException("Invalid transpose passed to BLAS");
248         }
249     }
250 
validateConjTranspose(@ranspose int Trans)251     static void validateConjTranspose(@Transpose int Trans) {
252         if (Trans != NO_TRANSPOSE &&
253             Trans != CONJ_TRANSPOSE) {
254             throw new RSRuntimeException("Invalid transpose passed to BLAS");
255         }
256     }
257 
validateDiag(@iag int Diag)258     static void validateDiag(@Diag int Diag) {
259         if (Diag != NON_UNIT && Diag != UNIT) {
260             throw new RSRuntimeException("Invalid diag passed to BLAS");
261         }
262     }
263 
validateUplo(@plo int Uplo)264     static void validateUplo(@Uplo int Uplo) {
265         if (Uplo != UPPER && Uplo != LOWER) {
266             throw new RSRuntimeException("Invalid uplo passed to BLAS");
267         }
268     }
269 
270 
271     /**
272      * Level 2 BLAS
273      */
274 
validateGEMV(Element e, int TransA, Allocation A, Allocation X, int incX, Allocation Y, int incY)275     static void validateGEMV(Element e, int TransA, Allocation A, Allocation X, int incX, Allocation Y, int incY) {
276         validateTranspose(TransA);
277         int M = A.getType().getY();
278         int N = A.getType().getX();
279         if (!A.getType().getElement().isCompatible(e) ||
280             !X.getType().getElement().isCompatible(e) ||
281             !Y.getType().getElement().isCompatible(e)) {
282             throw new RSRuntimeException("Called BLAS with wrong Element type");
283         }
284         if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
285             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
286         }
287 
288         if (incX <= 0 || incY <= 0) {
289             throw new RSRuntimeException("Vector increments must be greater than 0");
290         }
291         int expectedXDim = -1, expectedYDim = -1;
292         if (TransA == NO_TRANSPOSE) {
293             expectedXDim = 1 + (N - 1) * incX;
294             expectedYDim = 1 + (M - 1) * incY;
295         } else {
296             expectedXDim = 1 + (M - 1) * incX;
297             expectedYDim = 1 + (N - 1) * incY;
298         }
299         if (X.getType().getX() != expectedXDim ||
300             Y.getType().getX() != expectedYDim) {
301             throw new RSRuntimeException("Incorrect vector dimensions for GEMV");
302         }
303     }
304 
305     /**
306      * SGEMV performs one of the matrix-vector operations
307      * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y
308      *
309      * Details: http://www.netlib.org/lapack/explore-html/db/d58/sgemv_8f.html
310      *
311      * @param TransA The type of transpose applied to matrix A.
312      * @param alpha The scalar alpha.
313      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
314      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
315      * @param incX The increment for the elements of vector x, must be larger than zero.
316      * @param beta The scalar beta.
317      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
318      * @param incY The increment for the elements of vector y, must be larger than zero.
319      */
SGEMV(@ranspose int TransA, float alpha, Allocation A, Allocation X, int incX, float beta, Allocation Y, int incY)320     public void SGEMV(@Transpose int TransA, float alpha, Allocation A, Allocation X, int incX, float beta, Allocation Y, int incY) {
321         validateGEMV(Element.F32(mRS), TransA, A, X, incX, Y, incY);
322         int M = A.getType().getY();
323         int N = A.getType().getX();
324         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sgemv, TransA, 0, 0, 0, 0, M, N, 0, alpha, A.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, 0, 0);
325     }
326 
327     /**
328      * DGEMV performs one of the matrix-vector operations
329      * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y
330      *
331      * Details: http://www.netlib.org/lapack/explore-html/dc/da8/dgemv_8f.html
332      *
333      * @param TransA The type of transpose applied to matrix A.
334      * @param alpha The scalar alpha.
335      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
336      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
337      * @param incX The increment for the elements of vector x, must be larger than zero.
338      * @param beta The scalar beta.
339      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
340      * @param incY The increment for the elements of vector y, must be larger than zero.
341      */
DGEMV(@ranspose int TransA, double alpha, Allocation A, Allocation X, int incX, double beta, Allocation Y, int incY)342     public void DGEMV(@Transpose int TransA, double alpha, Allocation A, Allocation X, int incX, double beta, Allocation Y, int incY) {
343         validateGEMV(Element.F64(mRS), TransA, A, X, incX, Y, incY);
344         int M = A.getType().getY();
345         int N = A.getType().getX();
346         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dgemv, TransA, 0, 0, 0, 0, M, N, 0, alpha, A.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, 0, 0);
347     }
348 
349     /**
350      * CGEMV performs one of the matrix-vector operations
351      * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y   or   y := alpha*A**H*x + beta*y
352      *
353      * Details: http://www.netlib.org/lapack/explore-html/d4/d8a/cgemv_8f.html
354      *
355      * @param TransA The type of transpose applied to matrix A.
356      * @param alpha The scalar alpha.
357      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
358      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
359      * @param incX The increment for the elements of vector x, must be larger than zero.
360      * @param beta The scalar beta.
361      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
362      * @param incY The increment for the elements of vector y, must be larger than zero.
363      */
CGEMV(@ranspose int TransA, Float2 alpha, Allocation A, Allocation X, int incX, Float2 beta, Allocation Y, int incY)364     public void CGEMV(@Transpose int TransA, Float2 alpha, Allocation A, Allocation X, int incX, Float2 beta, Allocation Y, int incY) {
365         validateGEMV(Element.F32_2(mRS), TransA, A, X, incX, Y, incY);
366         int M = A.getType().getY();
367         int N = A.getType().getX();
368         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cgemv, TransA, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, A.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, 0, 0);
369     }
370 
371     /**
372      * ZGEMV performs one of the matrix-vector operations
373      * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y   or   y := alpha*A**H*x + beta*y
374      *
375      * Details: http://www.netlib.org/lapack/explore-html/db/d40/zgemv_8f.html
376      *
377      * @param TransA The type of transpose applied to matrix A.
378      * @param alpha The scalar alpha.
379      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
380      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
381      * @param incX The increment for the elements of vector x, must be larger than zero.
382      * @param beta The scalar beta.
383      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
384      * @param incY The increment for the elements of vector y, must be larger than zero.
385      */
ZGEMV(@ranspose int TransA, Double2 alpha, Allocation A, Allocation X, int incX, Double2 beta, Allocation Y, int incY)386     public void ZGEMV(@Transpose int TransA, Double2 alpha, Allocation A, Allocation X, int incX, Double2 beta, Allocation Y, int incY) {
387         validateGEMV(Element.F64_2(mRS), TransA, A, X, incX, Y, incY);
388         int M = A.getType().getY();
389         int N = A.getType().getX();
390         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zgemv, TransA, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, A.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, 0, 0);
391     }
392 
393     /**
394      * SGBMV performs one of the matrix-vector operations
395      * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y
396      *
397      * Details: http://www.netlib.org/lapack/explore-html/d6/d46/sgbmv_8f.html
398      *
399      * Note: For a M*N matrix, the input Allocation should also be of size M*N (dimY = M, dimX = N),
     *       but only the region M*(KL+KU+1) will be referenced. The following subroutine is an
     *       example showing how to convert the original matrix 'a' to row-based band matrix 'b'.
402      *           for i in range(0, m):
403      *              for j in range(max(0, i-kl), min(i+ku+1, n)):
404      *                  b[i, j-i+kl] = a[i, j]
405      *
406      * @param TransA The type of transpose applied to matrix A.
407      * @param KL The number of sub-diagonals of the matrix A.
408      * @param KU The number of super-diagonals of the matrix A.
409      * @param alpha The scalar alpha.
410      * @param A The input allocation contains the band matrix A, supported elements type {@link Element#F32}.
411      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
412      * @param incX The increment for the elements of vector x, must be larger than zero.
413      * @param beta The scalar beta.
414      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
415      * @param incY The increment for the elements of vector y, must be larger than zero.
416      */
SGBMV(@ranspose int TransA, int KL, int KU, float alpha, Allocation A, Allocation X, int incX, float beta, Allocation Y, int incY)417     public void SGBMV(@Transpose int TransA, int KL, int KU, float alpha, Allocation A, Allocation X, int incX, float beta, Allocation Y, int incY) {
418         // GBMV has the same validation requirements as GEMV + KL and KU >= 0
419         validateGEMV(Element.F32(mRS), TransA, A, X, incX, Y, incY);
420         if (KL < 0 || KU < 0) {
421             throw new RSRuntimeException("KL and KU must be greater than or equal to 0");
422         }
423         int M = A.getType().getY();
424         int N = A.getType().getX();
425         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sgbmv, TransA, 0, 0, 0, 0, M, N, 0, alpha, A.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, KL, KU);
426     }
427 
428     /**
429      * DGBMV performs one of the matrix-vector operations
430      * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y
431      *
432      * Details: http://www.netlib.org/lapack/explore-html/d2/d3f/dgbmv_8f.html
433      *
434      * Note: For a M*N matrix, the input Allocation should also be of size M*N (dimY = M, dimX = N),
     *       but only the region M*(KL+KU+1) will be referenced. The following subroutine is an
     *       example showing how to convert the original matrix 'a' to row-based band matrix 'b'.
437      *           for i in range(0, m):
438      *              for j in range(max(0, i-kl), min(i+ku+1, n)):
439      *                  b[i, j-i+kl] = a[i, j]
440      *
441      * @param TransA The type of transpose applied to matrix A.
442      * @param KL The number of sub-diagonals of the matrix A.
443      * @param KU The number of super-diagonals of the matrix A.
444      * @param alpha The scalar alpha.
445      * @param A The input allocation contains the band matrix A, supported elements type {@link Element#F64}.
446      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
447      * @param incX The increment for the elements of vector x, must be larger than zero.
448      * @param beta The scalar beta.
449      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
450      * @param incY The increment for the elements of vector y, must be larger than zero.
451      */
DGBMV(@ranspose int TransA, int KL, int KU, double alpha, Allocation A, Allocation X, int incX, double beta, Allocation Y, int incY)452     public void DGBMV(@Transpose int TransA, int KL, int KU, double alpha, Allocation A, Allocation X, int incX, double beta, Allocation Y, int incY) {
453         // GBMV has the same validation requirements as GEMV + KL and KU >= 0
454         validateGEMV(Element.F64(mRS), TransA, A, X, incX, Y, incY);
455         if (KL < 0 || KU < 0) {
456             throw new RSRuntimeException("KL and KU must be greater than or equal to 0");
457         }
458         int M = A.getType().getY();
459         int N = A.getType().getX();
460         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dgbmv, TransA, 0, 0, 0, 0, M, N, 0, alpha, A.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, KL, KU);
461     }
462 
463     /**
464      * CGBMV performs one of the matrix-vector operations
465      * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y   or   y := alpha*A**H*x + beta*y
466      *
467      * Details: http://www.netlib.org/lapack/explore-html/d0/d75/cgbmv_8f.html
468      *
469      * Note: For a M*N matrix, the input Allocation should also be of size M*N (dimY = M, dimX = N),
     *       but only the region M*(KL+KU+1) will be referenced. The following subroutine is an
     *       example showing how to convert the original matrix 'a' to row-based band matrix 'b'.
472      *           for i in range(0, m):
473      *              for j in range(max(0, i-kl), min(i+ku+1, n)):
474      *                  b[i, j-i+kl] = a[i, j]
475      *
476      * @param TransA The type of transpose applied to matrix A.
477      * @param KL The number of sub-diagonals of the matrix A.
478      * @param KU The number of super-diagonals of the matrix A.
479      * @param alpha The scalar alpha.
480      * @param A The input allocation contains the band matrix A, supported elements type {@link Element#F32_2}.
481      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
482      * @param incX The increment for the elements of vector x, must be larger than zero.
483      * @param beta The scalar beta.
484      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
485      * @param incY The increment for the elements of vector y, must be larger than zero.
486      */
CGBMV(@ranspose int TransA, int KL, int KU, Float2 alpha, Allocation A, Allocation X, int incX, Float2 beta, Allocation Y, int incY)487     public void CGBMV(@Transpose int TransA, int KL, int KU, Float2 alpha, Allocation A, Allocation X, int incX, Float2 beta, Allocation Y, int incY) {
488         // GBMV has the same validation requirements as GEMV + KL and KU >= 0
489         validateGEMV(Element.F32_2(mRS), TransA, A, X, incX, Y, incY);
490         if (KL < 0 || KU < 0) {
491             throw new RSRuntimeException("KL and KU must be greater than or equal to 0");
492         }
493         int M = A.getType().getY();
494         int N = A.getType().getX();
495         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cgbmv, TransA, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, A.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, KL, KU);
496     }
497 
498     /**
499      * ZGBMV performs one of the matrix-vector operations
500      * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y   or   y := alpha*A**H*x + beta*y
501      *
502      * Details: http://www.netlib.org/lapack/explore-html/d9/d46/zgbmv_8f.html
503      *
504      * Note: For a M*N matrix, the input Allocation should also be of size M*N (dimY = M, dimX = N),
     *       but only the region M*(KL+KU+1) will be referenced. The following subroutine is an
     *       example showing how to convert the original matrix 'a' to row-based band matrix 'b'.
507      *           for i in range(0, m):
508      *              for j in range(max(0, i-kl), min(i+ku+1, n)):
509      *                  b[i, j-i+kl] = a[i, j]
510      *
511      * @param TransA The type of transpose applied to matrix A.
512      * @param KL The number of sub-diagonals of the matrix A.
513      * @param KU The number of super-diagonals of the matrix A.
514      * @param alpha The scalar alpha.
515      * @param A The input allocation contains the band matrix A, supported elements type {@link Element#F64_2}.
516      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
517      * @param incX The increment for the elements of vector x, must be larger than zero.
518      * @param beta The scalar beta.
519      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
520      * @param incY The increment for the elements of vector y, must be larger than zero.
521      */
ZGBMV(@ranspose int TransA, int KL, int KU, Double2 alpha, Allocation A, Allocation X, int incX, Double2 beta, Allocation Y, int incY)522     public void ZGBMV(@Transpose int TransA, int KL, int KU, Double2 alpha, Allocation A, Allocation X, int incX, Double2 beta, Allocation Y, int incY) {
523         // GBMV has the same validation requirements as GEMV + KL and KU >= 0
524         validateGEMV(Element.F64_2(mRS), TransA, A, X, incX, Y, incY);
525         if (KL < 0 || KU < 0) {
526             throw new RSRuntimeException("KL and KU must be greater than or equal to 0");
527         }
528         int M = A.getType().getY();
529         int N = A.getType().getX();
530         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zgbmv, TransA, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, A.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, KL, KU);
531     }
532 
validateTRMV(Element e, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX)533     static void validateTRMV(Element e, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX) {
534         validateTranspose(TransA);
535         validateUplo(Uplo);
536         validateDiag(Diag);
537         int N = A.getType().getY();
538         if (A.getType().getX() != N) {
539             throw new RSRuntimeException("A must be a square matrix for TRMV");
540         }
541         if (!A.getType().getElement().isCompatible(e) ||
542             !X.getType().getElement().isCompatible(e)) {
543             throw new RSRuntimeException("Called BLAS with wrong Element type");
544         }
545         if (X.getType().getY() > 1) {
546             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
547         }
548 
549         if (incX <= 0) {
550             throw new RSRuntimeException("Vector increments must be greater than 0");
551         }
552         int expectedXDim = 1 + (N - 1) * incX;
553         if (X.getType().getX() != expectedXDim) {
554             throw new RSRuntimeException("Incorrect vector dimensions for TRMV");
555         }
556     }
557 
validateTPMV(Element e, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation Ap, Allocation X, int incX)558     static int validateTPMV(Element e, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation Ap, Allocation X, int incX) {
559         validateTranspose(TransA);
560         validateUplo(Uplo);
561         validateDiag(Diag);
562         if (!Ap.getType().getElement().isCompatible(e) ||
563             !X.getType().getElement().isCompatible(e)) {
564             throw new RSRuntimeException("Called BLAS with wrong Element type");
565         }
566         if (X.getType().getY() > 1) {
567             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
568         }
569 
570         if (Ap.getType().getY() > 1) {
571             throw new RSRuntimeException("Ap must have a Y dimension of 0 or 1");
572         }
573 
574         int N = (int)Math.sqrt((double)Ap.getType().getX() * 2);
575         //is it really doing anything?
576         if (Ap.getType().getX() != ((N * (N+1)) / 2)) {
577             throw new RSRuntimeException("Invalid dimension for Ap");
578         }
579         if (incX <= 0) {
580             throw new RSRuntimeException("Vector increments must be greater than 0");
581         }
582         int expectedXDim = 1 + (N - 1) * incX;
583         if (X.getType().getX() != expectedXDim) {
584             throw new RSRuntimeException("Incorrect vector dimensions for TPMV");
585         }
586 
587         return N;
588     }
589 
590     /**
591      * STRMV performs one of the matrix-vector operations
592      * x := A*x   or   x := A**T*x
593      *
594      * Details: http://www.netlib.org/lapack/explore-html/de/d45/strmv_8f.html
595      *
596      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
597      * @param TransA The type of transpose applied to matrix A.
598      * @param Diag Specifies whether or not A is unit triangular.
599      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
600      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
601      * @param incX The increment for the elements of vector x, must be larger than zero.
602      */
STRMV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX)603     public void STRMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX) {
604         validateTRMV(Element.F32(mRS), Uplo, TransA, Diag, A, X, incX);
605         int N = A.getType().getY();
606         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_strmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
607     }
608 
609     /**
610      * DTRMV performs one of the matrix-vector operations
611      * x := A*x   or   x := A**T*x
612      *
613      * Details: http://www.netlib.org/lapack/explore-html/dc/d7e/dtrmv_8f.html
614      *
615      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
616      * @param TransA The type of transpose applied to matrix A.
617      * @param Diag Specifies whether or not A is unit triangular.
618      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
619      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
620      * @param incX The increment for the elements of vector x, must be larger than zero.
621      */
DTRMV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX)622     public void DTRMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX) {
623         validateTRMV(Element.F64(mRS), Uplo, TransA, Diag, A, X, incX);
624         int N = A.getType().getY();
625         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtrmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
626     }
627 
628     /**
629      * CTRMV performs one of the matrix-vector operations
630      * x := A*x   or   x := A**T*x   or   x := A**H*x
631      *
632      * Details: http://www.netlib.org/lapack/explore-html/df/d78/ctrmv_8f.html
633      *
634      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
635      * @param TransA The type of transpose applied to matrix A.
636      * @param Diag Specifies whether or not A is unit triangular.
637      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
638      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
639      * @param incX The increment for the elements of vector x, must be larger than zero.
640      */
CTRMV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX)641     public void CTRMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX) {
642         validateTRMV(Element.F32_2(mRS), Uplo, TransA, Diag, A, X, incX);
643         int N = A.getType().getY();
644         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctrmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
645     }
646 
647     /**
648      * ZTRMV performs one of the matrix-vector operations
649      * x := A*x   or   x := A**T*x   or   x := A**H*x
650      *
651      * Details: http://www.netlib.org/lapack/explore-html/d0/dd1/ztrmv_8f.html
652      *
653      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
654      * @param TransA The type of transpose applied to matrix A.
655      * @param Diag Specifies whether or not A is unit triangular.
656      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
657      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
658      * @param incX The increment for the elements of vector x, must be larger than zero.
659      */
ZTRMV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX)660     public void ZTRMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX) {
661         validateTRMV(Element.F64_2(mRS), Uplo, TransA, Diag, A, X, incX);
662         int N = A.getType().getY();
663         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztrmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
664     }
665 
666     /**
667      * STBMV performs one of the matrix-vector operations
668      * x := A*x   or   x := A**T*x
669      *
670      * Details: http://www.netlib.org/lapack/explore-html/d6/d7d/stbmv_8f.html
671      *
672      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
673      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
674      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
675      *           for i in range(0, n):
676      *              for j in range(i, min(i+k+1, n)):
677      *                  b[i, j-i] = a[i, j]
678      *
679      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
680      * @param TransA The type of transpose applied to matrix A.
681      * @param Diag Specifies whether or not A is unit triangular.
682      * @param K The number of off-diagonals of the matrix A
683      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
684      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
685      * @param incX The increment for the elements of vector x, must be larger than zero.
686      */
STBMV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, int K, Allocation A, Allocation X, int incX)687     public void STBMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
688         // TBMV has the same requirements as TRMV + K >= 0
689         if (K < 0) {
690             throw new RSRuntimeException("K must be greater than or equal to 0");
691         }
692         validateTRMV(Element.F32(mRS), Uplo, TransA, Diag, A, X, incX);
693         int N = A.getType().getY();
694         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_stbmv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, A.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
695     }
696 
697     /**
698      * DTBMV performs one of the matrix-vector operations
699      * x := A*x   or   x := A**T*x
700      *
701      * Details: http://www.netlib.org/lapack/explore-html/df/d29/dtbmv_8f.html
702      *
703      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
704      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
705      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
706      *           for i in range(0, n):
707      *              for j in range(i, min(i+k+1, n)):
708      *                  b[i, j-i] = a[i, j]
709      *
710      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
711      * @param TransA The type of transpose applied to matrix A.
712      * @param Diag Specifies whether or not A is unit triangular.
713      * @param K The number of off-diagonals of the matrix A
714      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
715      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
716      * @param incX The increment for the elements of vector x, must be larger than zero.
717      */
DTBMV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, int K, Allocation A, Allocation X, int incX)718     public void DTBMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
719         // TBMV has the same requirements as TRMV + K >= 0
720         if (K < 0) {
721             throw new RSRuntimeException("K must be greater than or equal to 0");
722         }
723         validateTRMV(Element.F64(mRS), Uplo, TransA, Diag, A, X, incX);
724         int N = A.getType().getY();
725         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtbmv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, A.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
726     }
727 
728     /**
729      * CTBMV performs one of the matrix-vector operations
730      * x := A*x   or   x := A**T*x   or   x := A**H*x
731      *
732      * Details: http://www.netlib.org/lapack/explore-html/d3/dcd/ctbmv_8f.html
733      *
734      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
735      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
736      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
737      *           for i in range(0, n):
738      *              for j in range(i, min(i+k+1, n)):
739      *                  b[i, j-i] = a[i, j]
740      *
741      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
742      * @param TransA The type of transpose applied to matrix A.
743      * @param Diag Specifies whether or not A is unit triangular.
744      * @param K The number of off-diagonals of the matrix A
745      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
746      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
747      * @param incX The increment for the elements of vector x, must be larger than zero.
748      */
CTBMV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, int K, Allocation A, Allocation X, int incX)749     public void CTBMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
750         // TBMV has the same requirements as TRMV + K >= 0
751         if (K < 0) {
752             throw new RSRuntimeException("K must be greater than or equal to 0");
753         }
754         validateTRMV(Element.F32_2(mRS), Uplo, TransA, Diag, A, X, incX);
755         int N = A.getType().getY();
756         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctbmv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
757     }
758 
759     /**
760      * ZTBMV performs one of the matrix-vector operations
761      * x := A*x   or   x := A**T*x   or   x := A**H*x
762      *
763      * Details: http://www.netlib.org/lapack/explore-html/d3/d39/ztbmv_8f.html
764      *
765      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
766      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
767      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
768      *           for i in range(0, n):
769      *              for j in range(i, min(i+k+1, n)):
770      *                  b[i, j-i] = a[i, j]
771      *
772      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
773      * @param TransA The type of transpose applied to matrix A.
774      * @param Diag Specifies whether or not A is unit triangular.
775      * @param K The number of off-diagonals of the matrix A
776      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
777      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
778      * @param incX The increment for the elements of vector x, must be larger than zero.
779      */
ZTBMV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, int K, Allocation A, Allocation X, int incX)780     public void ZTBMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
781         // TBMV has the same requirements as TRMV + K >= 0
782         if (K < 0) {
783             throw new RSRuntimeException("K must be greater than or equal to 0");
784         }
785         validateTRMV(Element.F64_2(mRS), Uplo, TransA, Diag, A, X, incX);
786         int N = A.getType().getY();
787         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztbmv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
788     }
789 
790     /**
791      * STPMV performs one of the matrix-vector operations
792      * x := A*x   or   x := A**T*x
793      *
794      * Details: http://www.netlib.org/lapack/explore-html/db/db1/stpmv_8f.html
795      *
796      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
797      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
798      *       'a' to packed matrix 'b'.
799      *           k = 0
800      *           for i in range(0, n):
801      *              for j in range(i, n):
802      *                  b[k++] = a[i, j]
803      *
804      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
805      * @param TransA The type of transpose applied to matrix A.
806      * @param Diag Specifies whether or not A is unit triangular.
807      * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F32}.
808      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
809      * @param incX The increment for the elements of vector x, must be larger than zero.
810      */
STPMV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation Ap, Allocation X, int incX)811     public void STPMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
812         int N = validateTPMV(Element.F32(mRS), Uplo, TransA, Diag, Ap, X, incX);
813         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_stpmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, Ap.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
814     }
815 
816     /**
817      * DTPMV performs one of the matrix-vector operations
818      * x := A*x   or   x := A**T*x
819      *
820      * Details: http://www.netlib.org/lapack/explore-html/dc/dcd/dtpmv_8f.html
821      *
822      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
823      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
824      *       'a' to packed matrix 'b'.
825      *           k = 0
826      *           for i in range(0, n):
827      *              for j in range(i, n):
828      *                  b[k++] = a[i, j]
829      *
830      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
831      * @param TransA The type of transpose applied to matrix A.
832      * @param Diag Specifies whether or not A is unit triangular.
833      * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F64}.
834      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
835      * @param incX The increment for the elements of vector x, must be larger than zero.
836      */
DTPMV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation Ap, Allocation X, int incX)837     public void DTPMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
838         int N = validateTPMV(Element.F64(mRS), Uplo, TransA, Diag, Ap, X, incX);
839         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtpmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, Ap.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
840     }
841 
842     /**
843      * CTPMV performs one of the matrix-vector operations
844      * x := A*x   or   x := A**T*x   or   x := A**H*x
845      *
846      * Details: http://www.netlib.org/lapack/explore-html/d4/dbb/ctpmv_8f.html
847      *
848      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
849      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
850      *       'a' to packed matrix 'b'.
851      *           k = 0
852      *           for i in range(0, n):
853      *              for j in range(i, n):
854      *                  b[k++] = a[i, j]
855      *
856      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
857      * @param TransA The type of transpose applied to matrix A.
858      * @param Diag Specifies whether or not A is unit triangular.
859      * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F32_2}.
860      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
861      * @param incX The increment for the elements of vector x, must be larger than zero.
862      */
CTPMV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation Ap, Allocation X, int incX)863     public void CTPMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
864         int N = validateTPMV(Element.F32_2(mRS), Uplo, TransA, Diag, Ap, X, incX);
865         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctpmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, Ap.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
866     }
867 
868     /**
869      * ZTPMV performs one of the matrix-vector operations
870      * x := A*x   or   x := A**T*x   or   x := A**H*x
871      *
872      * Details: http://www.netlib.org/lapack/explore-html/d2/d9e/ztpmv_8f.html
873      *
874      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
875      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
876      *       'a' to packed matrix 'b'.
877      *           k = 0
878      *           for i in range(0, n):
879      *              for j in range(i, n):
880      *                  b[k++] = a[i, j]
881      *
882      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
883      * @param TransA The type of transpose applied to matrix A.
884      * @param Diag Specifies whether or not A is unit triangular.
885      * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F64_2}.
886      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
887      * @param incX The increment for the elements of vector x, must be larger than zero.
888      */
ZTPMV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation Ap, Allocation X, int incX)889     public void ZTPMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
890         int N = validateTPMV(Element.F64_2(mRS), Uplo, TransA, Diag, Ap, X, incX);
891         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztpmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, Ap.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
892     }
893 
894     /**
895      * STRSV solves one of the systems of equations
896      * A*x = b   or   A**T*x = b
897      *
898      * Details: http://www.netlib.org/lapack/explore-html/d0/d2a/strsv_8f.html
899      *
900      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
901      * @param TransA The type of transpose applied to matrix A.
902      * @param Diag Specifies whether or not A is unit triangular.
903      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
904      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
905      * @param incX The increment for the elements of vector x, must be larger than zero.
906      */
STRSV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX)907     public void STRSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation A,  Allocation X,  int incX) {
908         // TRSV is the same as TRMV
909         validateTRMV(Element.F32(mRS), Uplo, TransA, Diag, A, X, incX);
910         int N = A.getType().getY();
911         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_strsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
912 
913     }
914 
915     /**
916      * DTRSV solves one of the systems of equations
917      * A*x = b   or   A**T*x = b
918      *
919      * Details: http://www.netlib.org/lapack/explore-html/d6/d96/dtrsv_8f.html
920      *
921      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
922      * @param TransA The type of transpose applied to matrix A.
923      * @param Diag Specifies whether or not A is unit triangular.
924      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
925      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
926      * @param incX The increment for the elements of vector x, must be larger than zero.
927      */
DTRSV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX)928     public void DTRSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation A,  Allocation X,  int incX) {
929         // TRSV is the same as TRMV
930         validateTRMV(Element.F64(mRS), Uplo, TransA, Diag, A, X, incX);
931         int N = A.getType().getY();
932         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtrsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
933 
934     }
935 
936     /**
937      * CTRSV solves one of the systems of equations
938      * A*x = b   or   A**T*x = b   or   A**H*x = b
939      *
940      * Details: http://www.netlib.org/lapack/explore-html/d4/dc8/ctrsv_8f.html
941      *
942      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
943      * @param TransA The type of transpose applied to matrix A.
944      * @param Diag Specifies whether or not A is unit triangular.
945      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
946      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
947      * @param incX The increment for the elements of vector x, must be larger than zero.
948      */
CTRSV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX)949     public void CTRSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation A,  Allocation X,  int incX) {
950         // TRSV is the same as TRMV
951         validateTRMV(Element.F32_2(mRS), Uplo, TransA, Diag, A, X, incX);
952         int N = A.getType().getY();
953         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctrsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
954 
955     }
956 
957     /**
958      * ZTRSV solves one of the systems of equations
959      * A*x = b   or   A**T*x = b   or   A**H*x = b
960      *
961      * Details: http://www.netlib.org/lapack/explore-html/d1/d2f/ztrsv_8f.html
962      *
963      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
964      * @param TransA The type of transpose applied to matrix A.
965      * @param Diag Specifies whether or not A is unit triangular.
966      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
967      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
968      * @param incX The increment for the elements of vector x, must be larger than zero.
969      */
ZTRSV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX)970     public void ZTRSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation A,  Allocation X,  int incX) {
971         // TRSV is the same as TRMV
972         validateTRMV(Element.F64_2(mRS), Uplo, TransA, Diag, A, X, incX);
973         int N = A.getType().getY();
974         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztrsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
975 
976     }
977 
978     /**
979      * STBSV solves one of the systems of equations
980      * A*x = b   or   A**T*x = b
981      *
982      * Details: http://www.netlib.org/lapack/explore-html/d0/d1f/stbsv_8f.html
983      *
984      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
985      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
986      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
987      *           for i in range(0, n):
988      *              for j in range(i, min(i+k+1, n)):
989      *                  b[i, j-i] = a[i, j]
990      *
991      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
992      * @param TransA The type of transpose applied to matrix A.
993      * @param Diag Specifies whether or not A is unit triangular.
994      * @param K The number of off-diagonals of the matrix A
995      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
996      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
997      * @param incX The increment for the elements of vector x, must be larger than zero.
998      */
STBSV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, int K, Allocation A, Allocation X, int incX)999     public void STBSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
1000         // TBSV is the same as TRMV + K >= 0
1001         validateTRMV(Element.F32(mRS), Uplo, TransA, Diag, A, X, incX);
1002         int N = A.getType().getY();
1003         if (K < 0) {
1004             throw new RSRuntimeException("Number of diagonals must be positive");
1005         }
1006         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_stbsv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, A.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
1007     }
1008 
1009     /**
1010      * DTBSV solves one of the systems of equations
1011      * A*x = b   or   A**T*x = b
1012      *
1013      * Details: http://www.netlib.org/lapack/explore-html/d4/dcf/dtbsv_8f.html
1014      *
1015      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
1016      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
1017      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
1018      *           for i in range(0, n):
1019      *              for j in range(i, min(i+k+1, n)):
1020      *                  b[i, j-i] = a[i, j]
1021      *
1022      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
1023      * @param TransA The type of transpose applied to matrix A.
1024      * @param Diag Specifies whether or not A is unit triangular.
1025      * @param K The number of off-diagonals of the matrix A
1026      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
1027      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
1028      * @param incX The increment for the elements of vector x, must be larger than zero.
1029      */
DTBSV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, int K, Allocation A, Allocation X, int incX)1030     public void DTBSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
1031         // TBSV is the same as TRMV + K >= 0
1032         validateTRMV(Element.F64(mRS), Uplo, TransA, Diag, A, X, incX);
1033         int N = A.getType().getY();
1034         if (K < 0) {
1035             throw new RSRuntimeException("Number of diagonals must be positive");
1036         }
1037         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtbsv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, A.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
1038     }
1039 
1040     /**
1041      * CTBSV solves one of the systems of equations
1042      * A*x = b   or   A**T*x = b   or   A**H*x = b
1043      *
1044      * Details: http://www.netlib.org/lapack/explore-html/d9/d5f/ctbsv_8f.html
1045      *
1046      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
1047      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
1048      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
1049      *           for i in range(0, n):
1050      *              for j in range(i, min(i+k+1, n)):
1051      *                  b[i, j-i] = a[i, j]
1052      *
1053      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
1054      * @param TransA The type of transpose applied to matrix A.
1055      * @param Diag Specifies whether or not A is unit triangular.
1056      * @param K The number of off-diagonals of the matrix A
1057      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
1058      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
1059      * @param incX The increment for the elements of vector x, must be larger than zero.
1060      */
CTBSV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, int K, Allocation A, Allocation X, int incX)1061     public void CTBSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
1062         // TBSV is the same as TRMV + K >= 0
1063         validateTRMV(Element.F32_2(mRS), Uplo, TransA, Diag, A, X, incX);
1064         int N = A.getType().getY();
1065         if (K < 0) {
1066             throw new RSRuntimeException("Number of diagonals must be positive");
1067         }
1068         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctbsv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
1069     }
1070 
1071     /**
1072      * ZTBSV solves one of the systems of equations
1073      * A*x = b   or   A**T*x = b   or   A**H*x = b
1074      *
1075      * Details: http://www.netlib.org/lapack/explore-html/d4/d5a/ztbsv_8f.html
1076      *
1077      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
1078      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
1079      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
1080      *           for i in range(0, n):
1081      *              for j in range(i, min(i+k+1, n)):
1082      *                  b[i, j-i] = a[i, j]
1083      *
1084      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
1085      * @param TransA The type of transpose applied to matrix A.
1086      * @param Diag Specifies whether or not A is unit triangular.
1087      * @param K The number of off-diagonals of the matrix A
1088      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
1089      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
1090      * @param incX The increment for the elements of vector x, must be larger than zero.
1091      */
ZTBSV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, int K, Allocation A, Allocation X, int incX)1092     public void ZTBSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
1093         // TBSV is the same as TRMV + K >= 0
1094         validateTRMV(Element.F64_2(mRS), Uplo, TransA, Diag, A, X, incX);
1095         int N = A.getType().getY();
1096         if (K < 0) {
1097             throw new RSRuntimeException("Number of diagonals must be positive");
1098         }
1099         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztbsv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
1100     }
1101 
1102     /**
1103      * STPSV solves one of the systems of equations
1104      * A*x = b   or   A**T*x = b
1105      *
1106      * Details: http://www.netlib.org/lapack/explore-html/d0/d7c/stpsv_8f.html
1107      *
1108      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
1109      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
1110      *       'a' to packed matrix 'b'.
1111      *           k = 0
1112      *           for i in range(0, n):
1113      *              for j in range(i, n):
1114      *                  b[k++] = a[i, j]
1115      *
1116      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
1117      * @param TransA The type of transpose applied to matrix A.
1118      * @param Diag Specifies whether or not A is unit triangular.
1119      * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F32}.
1120      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
1121      * @param incX The increment for the elements of vector x, must be larger than zero.
1122      */
STPSV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation Ap, Allocation X, int incX)1123     public void STPSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
1124         // TPSV is same as TPMV
1125         int N = validateTPMV(Element.F32(mRS), Uplo, TransA, Diag, Ap, X, incX);
1126         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_stpsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, Ap.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
1127     }
1128 
1129     /**
1130      * DTPSV solves one of the systems of equations
1131      * A*x = b   or   A**T*x = b
1132      *
1133      * Details: http://www.netlib.org/lapack/explore-html/d9/d84/dtpsv_8f.html
1134      *
1135      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
1136      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
1137      *       'a' to packed matrix 'b'.
1138      *           k = 0
1139      *           for i in range(0, n):
1140      *              for j in range(i, n):
1141      *                  b[k++] = a[i, j]
1142      *
1143      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
1144      * @param TransA The type of transpose applied to matrix A.
1145      * @param Diag Specifies whether or not A is unit triangular.
1146      * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F64}.
1147      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
1148      * @param incX The increment for the elements of vector x, must be larger than zero.
1149      */
DTPSV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation Ap, Allocation X, int incX)1150     public void DTPSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
1151         // TPSV is same as TPMV
1152         int N = validateTPMV(Element.F64(mRS), Uplo, TransA, Diag, Ap, X, incX);
1153         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtpsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, Ap.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
1154     }
1155 
1156     /**
1157      * CTPSV solves one of the systems of equations
1158      * A*x = b   or   A**T*x = b   or   A**H*x = b
1159      *
1160      * Details: http://www.netlib.org/lapack/explore-html/d8/d56/ctpsv_8f.html
1161      *
1162      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
1163      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
1164      *       'a' to packed matrix 'b'.
1165      *           k = 0
1166      *           for i in range(0, n):
1167      *              for j in range(i, n):
1168      *                  b[k++] = a[i, j]
1169      *
1170      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
1171      * @param TransA The type of transpose applied to matrix A.
1172      * @param Diag Specifies whether or not A is unit triangular.
1173      * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F32_2}.
1174      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
1175      * @param incX The increment for the elements of vector x, must be larger than zero.
1176      */
CTPSV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation Ap, Allocation X, int incX)1177     public void CTPSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
1178         // TPSV is same as TPMV
1179         int N = validateTPMV(Element.F32_2(mRS), Uplo, TransA, Diag, Ap, X, incX);
1180         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctpsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, Ap.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
1181     }
1182 
1183     /**
1184      * ZTPSV solves one of the systems of equations
1185      * A*x = b   or   A**T*x = b   or   A**H*x = b
1186      *
1187      * Details: http://www.netlib.org/lapack/explore-html/da/d57/ztpsv_8f.html
1188      *
1189      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
1190      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
1191      *       'a' to packed matrix 'b'.
1192      *           k = 0
1193      *           for i in range(0, n):
1194      *              for j in range(i, n):
1195      *                  b[k++] = a[i, j]
1196      *
1197      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
1198      * @param TransA The type of transpose applied to matrix A.
1199      * @param Diag Specifies whether or not A is unit triangular.
1200      * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F64_2}.
1201      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
1202      * @param incX The increment for the elements of vector x, must be larger than zero.
1203      */
ZTPSV(@plo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation Ap, Allocation X, int incX)1204     public void ZTPSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
1205         // TPSV is same as TPMV
1206         int N = validateTPMV(Element.F64_2(mRS), Uplo, TransA, Diag, Ap, X, incX);
1207         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztpsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, Ap.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
1208     }
1209 
1210     /**
1211      * Level 2, S and D only
1212      */
validateSYMV(Element e, @Uplo int Uplo, Allocation A, Allocation X, Allocation Y, int incX, int incY)1213     static int validateSYMV(Element e, @Uplo int Uplo, Allocation A, Allocation X, Allocation Y, int incX, int incY) {
1214         validateUplo(Uplo);
1215         int N = A.getType().getY();
1216         if (A.getType().getX() != N) {
1217             throw new RSRuntimeException("A must be a square matrix for SYMV");
1218         }
1219         if (!A.getType().getElement().isCompatible(e) ||
1220             !X.getType().getElement().isCompatible(e) ||
1221             !Y.getType().getElement().isCompatible(e) ) {
1222             throw new RSRuntimeException("Called BLAS with wrong Element type");
1223         }
1224         if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
1225             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
1226         }
1227 
1228         if (incX <= 0 || incY <= 0) {
1229             throw new RSRuntimeException("Vector increments must be greater than 0");
1230         }
1231         int expectedXDim = 1 + (N - 1) * incX;
1232         if (X.getType().getX() != expectedXDim) {
1233             throw new RSRuntimeException("Incorrect vector dimensions for SYMV");
1234         }
1235         int expectedYDim = 1 + (N - 1) * incY;
1236         if (Y.getType().getX() != expectedYDim) {
1237             throw new RSRuntimeException("Incorrect vector dimensions for SYMV");
1238         }
1239         return N;
1240     }
validateSPMV(Element e, @Uplo int Uplo, Allocation Ap, Allocation X, int incX, Allocation Y, int incY)1241     static int validateSPMV(Element e, @Uplo int Uplo, Allocation Ap, Allocation X, int incX, Allocation Y, int incY) {
1242         validateUplo(Uplo);
1243         if (!Ap.getType().getElement().isCompatible(e) ||
1244             !X.getType().getElement().isCompatible(e) ||
1245             !Y.getType().getElement().isCompatible(e)) {
1246             throw new RSRuntimeException("Called BLAS with wrong Element type");
1247         }
1248         if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
1249             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
1250         }
1251 
1252         if (Ap.getType().getY() > 1) {
1253             throw new RSRuntimeException("Ap must have a Y dimension of 0 or 1");
1254         }
1255 
1256         int N = (int)Math.sqrt((double)Ap.getType().getX() * 2);
1257         if (Ap.getType().getX() != ((N * (N+1)) / 2)) {
1258             throw new RSRuntimeException("Invalid dimension for Ap");
1259         }
1260         if (incX <= 0 || incY <= 0) {
1261             throw new RSRuntimeException("Vector increments must be greater than 0");
1262         }
1263         int expectedXDim = 1 + (N - 1) * incX;
1264         if (X.getType().getX() != expectedXDim) {
1265             throw new RSRuntimeException("Incorrect vector dimensions for SPMV");
1266         }
1267         int expectedYDim = 1 + (N - 1) * incY;
1268         if (Y.getType().getX() != expectedYDim) {
1269             throw new RSRuntimeException("Incorrect vector dimensions for SPMV");
1270         }
1271 
1272         return N;
1273     }
validateGER(Element e, Allocation X, int incX, Allocation Y, int incY, Allocation A)1274     static void validateGER(Element e, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
1275         if (!A.getType().getElement().isCompatible(e) ||
1276             !X.getType().getElement().isCompatible(e) ||
1277             !Y.getType().getElement().isCompatible(e) ) {
1278             throw new RSRuntimeException("Called BLAS with wrong Element type");
1279         }
1280 
1281         if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
1282             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
1283         }
1284 
1285         int M = A.getType().getY();
1286         int N = A.getType().getX();
1287 
1288         if (N < 1 || M < 1) {
1289             throw new RSRuntimeException("M and N must be 1 or greater for GER");
1290         }
1291         if (incX <= 0 || incY <= 0) {
1292             throw new RSRuntimeException("Vector increments must be greater than 0");
1293         }
1294         int expectedXDim = 1 + (M - 1) * incX;
1295         if (X.getType().getX() != expectedXDim) {
1296             throw new RSRuntimeException("Incorrect vector dimensions for GER");
1297         }
1298         int expectedYDim = 1 + (N - 1) * incY;
1299         if (Y.getType().getX() != expectedYDim) {
1300             throw new RSRuntimeException("Incorrect vector dimensions for GER");
1301         }
1302 
1303 
1304     }
validateSYR(Element e, @Uplo int Uplo, Allocation X, int incX, Allocation A)1305     static int validateSYR(Element e, @Uplo int Uplo, Allocation X, int incX, Allocation A) {
1306         validateUplo(Uplo);
1307         if (!A.getType().getElement().isCompatible(e) ||
1308             !X.getType().getElement().isCompatible(e)) {
1309             throw new RSRuntimeException("Called BLAS with wrong Element type");
1310         }
1311 
1312         int N = A.getType().getX();
1313 
1314         if (X.getType().getY() > 1) {
1315             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
1316         }
1317         if (N != A.getType().getY()) {
1318             throw new RSRuntimeException("A must be a symmetric matrix");
1319         }
1320         if (incX <= 0) {
1321             throw new RSRuntimeException("Vector increments must be greater than 0");
1322         }
1323         int expectedXDim = 1 + (N - 1) * incX;
1324         if (X.getType().getX() != expectedXDim) {
1325             throw new RSRuntimeException("Incorrect vector dimensions for SYR");
1326         }
1327         return N;
1328     }
validateSPR(Element e, @Uplo int Uplo, Allocation X, int incX, Allocation Ap)1329     static int validateSPR(Element e, @Uplo int Uplo, Allocation X, int incX, Allocation Ap) {
1330         validateUplo(Uplo);
1331         if (!Ap.getType().getElement().isCompatible(e) ||
1332             !X.getType().getElement().isCompatible(e)) {
1333             throw new RSRuntimeException("Called BLAS with wrong Element type");
1334         }
1335         if (X.getType().getY() > 1) {
1336             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
1337         }
1338 
1339         if (Ap.getType().getY() > 1) {
1340             throw new RSRuntimeException("Ap must have a Y dimension of 0 or 1");
1341         }
1342 
1343         int N = (int)Math.sqrt((double)Ap.getType().getX() * 2);
1344         if (Ap.getType().getX() != ((N * (N+1)) / 2)) {
1345             throw new RSRuntimeException("Invalid dimension for Ap");
1346         }
1347         if (incX <= 0) {
1348             throw new RSRuntimeException("Vector increments must be greater than 0");
1349         }
1350         int expectedXDim = 1 + (N - 1) * incX;
1351         if (X.getType().getX() != expectedXDim) {
1352             throw new RSRuntimeException("Incorrect vector dimensions for SPR");
1353         }
1354 
1355         return N;
1356     }
1357 
validateSYR2(Element e, @Uplo int Uplo, Allocation X, int incX, Allocation Y, int incY, Allocation A)1358     static int validateSYR2(Element e, @Uplo int Uplo, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
1359         validateUplo(Uplo);
1360         if (!A.getType().getElement().isCompatible(e) ||
1361             !X.getType().getElement().isCompatible(e) ||
1362             !Y.getType().getElement().isCompatible(e)) {
1363             throw new RSRuntimeException("Called BLAS with wrong Element type");
1364         }
1365 
1366         if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
1367             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
1368         }
1369 
1370         int N = A.getType().getX();
1371 
1372         if (N != A.getType().getY()) {
1373             throw new RSRuntimeException("A must be a symmetric matrix");
1374         }
1375         if (incX <= 0 || incY <= 0) {
1376             throw new RSRuntimeException("Vector increments must be greater than 0");
1377         }
1378         int expectedXDim = 1 + (N - 1) * incX;
1379         int expectedYDim = 1 + (N - 1) * incY;
1380         if (X.getType().getX() != expectedXDim || Y.getType().getX() != expectedYDim) {
1381             throw new RSRuntimeException("Incorrect vector dimensions for SYR");
1382         }
1383         return N;
1384 
1385     }
validateSPR2(Element e, @Uplo int Uplo, Allocation X, int incX, Allocation Y, int incY, Allocation Ap)1386     static int validateSPR2(Element e, @Uplo int Uplo, Allocation X, int incX, Allocation Y, int incY, Allocation Ap) {
1387         validateUplo(Uplo);
1388         if (!Ap.getType().getElement().isCompatible(e) ||
1389             !X.getType().getElement().isCompatible(e) ||
1390             !Y.getType().getElement().isCompatible(e)) {
1391             throw new RSRuntimeException("Called BLAS with wrong Element type");
1392         }
1393         if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
1394             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
1395         }
1396 
1397         if (Ap.getType().getY() > 1) {
1398             throw new RSRuntimeException("Ap must have a Y dimension of 0 or 1");
1399         }
1400 
1401         int N = (int)Math.sqrt((double)Ap.getType().getX() * 2);
1402         if (Ap.getType().getX() != ((N * (N+1)) / 2)) {
1403             throw new RSRuntimeException("Invalid dimension for Ap");
1404         }
1405         if (incX <= 0 || incY <= 0) {
1406             throw new RSRuntimeException("Vector increments must be greater than 0");
1407         }
1408         int expectedXDim = 1 + (N - 1) * incX;
1409         int expectedYDim = 1 + (N - 1) * incY;
1410         if (X.getType().getX() != expectedXDim || Y.getType().getX() != expectedYDim) {
1411             throw new RSRuntimeException("Incorrect vector dimensions for SPR2");
1412         }
1413 
1414         return N;
1415     }
1416 
1417     /**
1418      * SSYMV performs the matrix-vector operation
1419      * y := alpha*A*x + beta*y
1420      *
1421      * Details: http://www.netlib.org/lapack/explore-html/d2/d94/ssymv_8f.html
1422      *
1423      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
1424      * @param alpha The scalar alpha.
1425      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
1426      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
1427      * @param incX The increment for the elements of vector x, must be larger than zero.
1428      * @param beta The scalar beta.
1429      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
1430      * @param incY The increment for the elements of vector y, must be larger than zero.
1431      */
SSYMV(@plo int Uplo, float alpha, Allocation A, Allocation X, int incX, float beta, Allocation Y, int incY)1432     public void SSYMV(@Uplo int Uplo, float alpha, Allocation A, Allocation X, int incX, float beta, Allocation Y, int incY) {
1433         int N = validateSYMV(Element.F32(mRS), Uplo, A, X, Y, incX, incY);
1434         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssymv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, A.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, 0, 0);
1435     }
1436 
1437     /**
1438      * SSBMV performs the matrix-vector operation
1439      * y := alpha*A*x + beta*y
1440      *
1441      * Details: http://www.netlib.org/lapack/explore-html/d3/da1/ssbmv_8f.html
1442      *
1443      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
1444      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
1445      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
1446      *           for i in range(0, n):
1447      *              for j in range(i, min(i+k+1, n)):
1448      *                  b[i, j-i] = a[i, j]
1449      *
1450      * @param Uplo Specifies whether the upper or lower triangular part of the band matrix A is being supplied.
1451      * @param K The number of off-diagonals of the matrix A
1452      * @param alpha The scalar alpha.
1453      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
1454      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
1455      * @param incX The increment for the elements of vector x, must be larger than zero.
1456      * @param beta The scalar beta.
1457      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
1458      * @param incY The increment for the elements of vector y, must be larger than zero.
1459      */
SSBMV(@plo int Uplo, int K, float alpha, Allocation A, Allocation X, int incX, float beta, Allocation Y, int incY)1460     public void SSBMV(@Uplo int Uplo, int K, float alpha, Allocation A, Allocation X, int incX, float beta, Allocation Y, int incY) {
1461         // SBMV is the same as SYMV + K >= 0
1462         if (K < 0) {
1463             throw new RSRuntimeException("K must be greater than or equal to 0");
1464         }
1465         int N = validateSYMV(Element.F32(mRS), Uplo, A, X, Y, incX, incY);
1466         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssbmv, 0, 0, 0, Uplo, 0, 0, N, K, alpha, A.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, 0, 0);
1467     }
1468 
1469     /**
1470      * SSPMV performs the matrix-vector operation
1471      * y := alpha*A*x + beta*y
1472      *
1473      * Details: http://www.netlib.org/lapack/explore-html/d8/d68/sspmv_8f.html
1474      *
1475      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
1476      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
1477      *       'a' to packed matrix 'b'.
1478      *           k = 0
1479      *           for i in range(0, n):
1480      *              for j in range(i, n):
1481      *                  b[k++] = a[i, j]
1482      *
1483      * @param Uplo Specifies whether the upper or lower triangular part of the matrix A is supplied in packed form.
1484      * @param alpha The scalar alpha.
1485      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F32}.
1486      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
1487      * @param incX The increment for the elements of vector x, must be larger than zero.
1488      * @param beta The scalar beta.
1489      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
1490      * @param incY The increment for the elements of vector y, must be larger than zero.
1491      */
SSPMV(@plo int Uplo, float alpha, Allocation Ap, Allocation X, int incX, float beta, Allocation Y, int incY)1492     public void SSPMV(@Uplo int Uplo, float alpha, Allocation Ap, Allocation X, int incX, float beta, Allocation Y, int incY) {
1493         int N = validateSPMV(Element.F32(mRS), Uplo, Ap, X, incX, Y, incY);
1494         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sspmv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, Ap.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, 0, 0);
1495     }
1496 
1497     /**
1498      * SGER performs the rank 1 operation
1499      * A := alpha*x*y**T + A
1500      *
1501      * Details: http://www.netlib.org/lapack/explore-html/db/d5c/sger_8f.html
1502      *
1503      * @param alpha The scalar alpha.
1504      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
1505      * @param incX The increment for the elements of vector x, must be larger than zero.
1506      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
1507      * @param incY The increment for the elements of vector y, must be larger than zero.
1508      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
1509      */
SGER(float alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A)1510     public void SGER(float alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
1511         int M = A.getType().getY();
1512         int N = A.getType().getX();
1513         validateGER(Element.F32(mRS), X, incX, Y, incY, A);
1514         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sger, 0, 0, 0, 0, 0, M, N, 0, alpha, X.getID(mRS), Y.getID(mRS), 0.f, A.getID(mRS), incX, incY, 0, 0);
1515     }
1516 
1517     /**
1518      * SSYR performs the rank 1 operation
1519      * A := alpha*x*x**T + A
1520      *
1521      * Details: http://www.netlib.org/lapack/explore-html/d6/dac/ssyr_8f.html
1522      *
1523      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
1524      * @param alpha The scalar alpha.
1525      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
1526      * @param incX The increment for the elements of vector x, must be larger than zero.
1527      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
1528      */
SSYR(@plo int Uplo, float alpha, Allocation X, int incX, Allocation A)1529     public void SSYR(@Uplo int Uplo, float alpha, Allocation X, int incX, Allocation A) {
1530         int N = validateSYR(Element.F32(mRS), Uplo, X, incX, A);
1531         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssyr, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, X.getID(mRS), A.getID(mRS), 0.f, 0, incX, 0, 0, 0);
1532     }
1533 
1534     /**
1535      * SSPR performs the rank 1 operation
1536      * A := alpha*x*x**T + A
1537      *
1538      * Details: http://www.netlib.org/lapack/explore-html/d2/d9b/sspr_8f.html
1539      *
1540      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
1541      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
1542      *       'a' to packed matrix 'b'.
1543      *           k = 0
1544      *           for i in range(0, n):
1545      *              for j in range(i, n):
1546      *                  b[k++] = a[i, j]
1547      *
1548      * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
1549      * @param alpha The scalar alpha.
1550      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
1551      * @param incX The increment for the elements of vector x, must be larger than zero.
1552      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F32}.
1553      */
SSPR(@plo int Uplo, float alpha, Allocation X, int incX, Allocation Ap)1554     public void SSPR(@Uplo int Uplo, float alpha, Allocation X, int incX, Allocation Ap) {
1555         int N = validateSPR(Element.F32(mRS), Uplo, X, incX, Ap);
1556         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sspr, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, X.getID(mRS), Ap.getID(mRS), 0.f, 0, incX, 0, 0, 0);
1557     }
1558 
1559     /**
1560      * SSYR2 performs the symmetric rank 2 operation
1561      * A := alpha*x*y**T + alpha*y*x**T + A
1562      *
1563      * Details: http://www.netlib.org/lapack/explore-html/db/d99/ssyr2_8f.html
1564      *
1565      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
1566      * @param alpha The scalar alpha.
1567      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
1568      * @param incX The increment for the elements of vector x, must be larger than zero.
1569      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
1570      * @param incY The increment for the elements of vector y, must be larger than zero.
1571      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
1572      */
SSYR2(@plo int Uplo, float alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A)1573     public void SSYR2(@Uplo int Uplo, float alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
1574         int N = validateSYR2(Element.F32(mRS), Uplo, X, incX, Y, incY, A);
1575         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssyr2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, X.getID(mRS), Y.getID(mRS), 0, A.getID(mRS), incX, incY, 0, 0);
1576     }
1577 
1578     /**
1579      * SSPR2 performs the symmetric rank 2 operation
1580      * A := alpha*x*y**T + alpha*y*x**T + A
1581      *
1582      * Details: http://www.netlib.org/lapack/explore-html/db/d3e/sspr2_8f.html
1583      *
1584      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
1585      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
1586      *       'a' to packed matrix 'b'.
1587      *           k = 0
1588      *           for i in range(0, n):
1589      *              for j in range(i, n):
1590      *                  b[k++] = a[i, j]
1591      *
1592      * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
1593      * @param alpha The scalar alpha.
1594      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
1595      * @param incX The increment for the elements of vector x, must be larger than zero.
1596      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
1597      * @param incY The increment for the elements of vector y, must be larger than zero.
1598      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F32}.
1599      */
SSPR2(@plo int Uplo, float alpha, Allocation X, int incX, Allocation Y, int incY, Allocation Ap)1600     public void SSPR2(@Uplo int Uplo, float alpha, Allocation X, int incX, Allocation Y, int incY, Allocation Ap) {
1601         int N = validateSPR2(Element.F32(mRS), Uplo, X, incX, Y, incY, Ap);
1602         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sspr2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, X.getID(mRS), Y.getID(mRS), 0, Ap.getID(mRS), incX, incY, 0, 0);
1603     }
1604 
1605     /**
1606      * DSYMV performs the matrix-vector operation
1607      * y := alpha*A*x + beta*y
1608      *
1609      * Details: http://www.netlib.org/lapack/explore-html/d8/dbe/dsymv_8f.html
1610      *
1611      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
1612      * @param alpha The scalar alpha.
1613      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
1614      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
1615      * @param incX The increment for the elements of vector x, must be larger than zero.
1616      * @param beta The scalar beta.
1617      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
1618      * @param incY The increment for the elements of vector y, must be larger than zero.
1619      */
DSYMV(@plo int Uplo, double alpha, Allocation A, Allocation X, int incX, double beta, Allocation Y, int incY)1620     public void DSYMV(@Uplo int Uplo, double alpha, Allocation A, Allocation X, int incX, double beta, Allocation Y, int incY) {
1621         int N = validateSYMV(Element.F64(mRS), Uplo, A, X, Y, incX, incY);
1622         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsymv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, A.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, 0, 0);
1623     }
1624 
1625     /**
1626      * DSBMV performs the matrix-vector operation
1627      * y := alpha*A*x + beta*y
1628      *
1629      * Details: http://www.netlib.org/lapack/explore-html/d8/d1e/dsbmv_8f.html
1630      *
1631      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
     *       but only the region N*(K+1) will be referenced. The following subroutine is an
     *       example showing how to convert an UPPER triangular matrix 'a' to row-based band matrix 'b'.
1634      *           for i in range(0, n):
1635      *              for j in range(i, min(i+k+1, n)):
1636      *                  b[i, j-i] = a[i, j]
1637      *
1638      * @param Uplo Specifies whether the upper or lower triangular part of the band matrix A is being supplied.
1639      * @param K The number of off-diagonals of the matrix A
1640      * @param alpha The scalar alpha.
1641      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
1642      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
1643      * @param incX The increment for the elements of vector x, must be larger than zero.
1644      * @param beta The scalar beta.
1645      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
1646      * @param incY The increment for the elements of vector y, must be larger than zero.
1647      */
DSBMV(@plo int Uplo, int K, double alpha, Allocation A, Allocation X, int incX, double beta, Allocation Y, int incY)1648     public void DSBMV(@Uplo int Uplo, int K, double alpha, Allocation A, Allocation X, int incX, double beta, Allocation Y, int incY) {
1649         // SBMV is the same as SYMV + K >= 0
1650         if (K < 0) {
1651             throw new RSRuntimeException("K must be greater than or equal to 0");
1652         }
1653         int N = validateSYMV(Element.F64(mRS), Uplo, A, X, Y, incX, incY);
1654         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsbmv, 0, 0, 0, Uplo, 0, 0, N, K, alpha, A.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, 0, 0);
1655     }
1656 
1657     /**
1658      * DSPMV performs the matrix-vector operation
1659      * y := alpha*A*x + beta*y
1660      *
1661      * Details: http://www.netlib.org/lapack/explore-html/d4/d85/dspmv_8f.html
1662      *
     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2.
     *       The following subroutine is an example showing how to convert an UPPER triangular matrix
1665      *       'a' to packed matrix 'b'.
1666      *           k = 0
1667      *           for i in range(0, n):
1668      *              for j in range(i, n):
1669      *                  b[k++] = a[i, j]
1670      *
1671      * @param Uplo Specifies whether the upper or lower triangular part of the matrix A is supplied in packed form.
1672      * @param alpha The scalar alpha.
1673      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F64}.
1674      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
1675      * @param incX The increment for the elements of vector x, must be larger than zero.
1676      * @param beta The scalar beta.
1677      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
1678      * @param incY The increment for the elements of vector y, must be larger than zero.
1679      */
DSPMV(@plo int Uplo, double alpha, Allocation Ap, Allocation X, int incX, double beta, Allocation Y, int incY)1680     public void DSPMV(@Uplo int Uplo, double alpha, Allocation Ap, Allocation X, int incX, double beta, Allocation Y, int incY) {
1681         int N = validateSPMV(Element.F64(mRS), Uplo, Ap, X, incX, Y, incY);
1682         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dspmv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, Ap.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, 0, 0);
1683     }
1684 
1685     /**
1686      * DGER performs the rank 1 operation
1687      * A := alpha*x*y**T + A
1688      *
1689      * Details: http://www.netlib.org/lapack/explore-html/dc/da8/dger_8f.html
1690      *
1691      * @param alpha The scalar alpha.
1692      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
1693      * @param incX The increment for the elements of vector x, must be larger than zero.
1694      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
1695      * @param incY The increment for the elements of vector y, must be larger than zero.
1696      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
1697      */
DGER(double alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A)1698     public void DGER(double alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
1699         int M = A.getType().getY();
1700         int N = A.getType().getX();
1701         validateGER(Element.F64(mRS), X, incX, Y, incY, A);
1702         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dger, 0, 0, 0, 0, 0, M, N, 0, alpha, X.getID(mRS), Y.getID(mRS), 0.f, A.getID(mRS), incX, incY, 0, 0);
1703     }
1704 
1705     /**
1706      * DSYR performs the rank 1 operation
1707      * A := alpha*x*x**T + A
1708      *
1709      * Details: http://www.netlib.org/lapack/explore-html/d3/d60/dsyr_8f.html
1710      *
1711      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
1712      * @param alpha The scalar alpha.
1713      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
1714      * @param incX The increment for the elements of vector x, must be larger than zero.
1715      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
1716      */
DSYR(@plo int Uplo, double alpha, Allocation X, int incX, Allocation A)1717     public void DSYR(@Uplo int Uplo, double alpha, Allocation X, int incX, Allocation A) {
1718         int N = validateSYR(Element.F64(mRS), Uplo, X, incX, A);
1719         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsyr, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, X.getID(mRS), A.getID(mRS), 0.f, 0, incX, 0, 0, 0);
1720     }
1721 
1722     /**
1723      * DSPR performs the rank 1 operation
1724      * A := alpha*x*x**T + A
1725      *
1726      * Details: http://www.netlib.org/lapack/explore-html/dd/dba/dspr_8f.html
1727      *
     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2.
     *       The following subroutine is an example showing how to convert an UPPER triangular matrix
1730      *       'a' to packed matrix 'b'.
1731      *           k = 0
1732      *           for i in range(0, n):
1733      *              for j in range(i, n):
1734      *                  b[k++] = a[i, j]
1735      *
1736      * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
1737      * @param alpha The scalar alpha.
1738      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
1739      * @param incX The increment for the elements of vector x, must be larger than zero.
1740      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F64}.
1741      */
DSPR(@plo int Uplo, double alpha, Allocation X, int incX, Allocation Ap)1742     public void DSPR(@Uplo int Uplo, double alpha, Allocation X, int incX, Allocation Ap) {
1743         int N = validateSPR(Element.F64(mRS), Uplo, X, incX, Ap);
1744         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dspr, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, X.getID(mRS), Ap.getID(mRS), 0.f, 0, incX, 0, 0, 0);
1745     }
1746 
1747     /**
1748      * DSYR2 performs the symmetric rank 2 operation
1749      * A := alpha*x*y**T + alpha*y*x**T + A
1750      *
1751      * Details: http://www.netlib.org/lapack/explore-html/de/d41/dsyr2_8f.html
1752      *
1753      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
1754      * @param alpha The scalar alpha.
1755      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
1756      * @param incX The increment for the elements of vector x, must be larger than zero.
1757      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
1758      * @param incY The increment for the elements of vector y, must be larger than zero.
1759      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
1760      */
DSYR2(@plo int Uplo, double alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A)1761     public void DSYR2(@Uplo int Uplo, double alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
1762         int N = validateSYR2(Element.F64(mRS), Uplo, X, incX, Y, incY, A);
1763         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsyr2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, X.getID(mRS), Y.getID(mRS), 0, A.getID(mRS), incX, incY, 0, 0);
1764     }
1765 
1766     /**
1767      * DSPR2 performs the symmetric rank 2 operation
1768      * A := alpha*x*y**T + alpha*y*x**T + A
1769      *
1770      * Details: http://www.netlib.org/lapack/explore-html/dd/d9e/dspr2_8f.html
1771      *
     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2.
     *       The following subroutine is an example showing how to convert an UPPER triangular matrix
1774      *       'a' to packed matrix 'b'.
1775      *           k = 0
1776      *           for i in range(0, n):
1777      *              for j in range(i, n):
1778      *                  b[k++] = a[i, j]
1779      *
1780      * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
1781      * @param alpha The scalar alpha.
1782      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
1783      * @param incX The increment for the elements of vector x, must be larger than zero.
1784      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
1785      * @param incY The increment for the elements of vector y, must be larger than zero.
1786      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F64}.
1787      */
DSPR2(@plo int Uplo, double alpha, Allocation X, int incX, Allocation Y, int incY, Allocation Ap)1788     public void DSPR2(@Uplo int Uplo, double alpha, Allocation X, int incX, Allocation Y, int incY, Allocation Ap) {
1789         int N = validateSPR2(Element.F64(mRS), Uplo, X, incX, Y, incY, Ap);
1790         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dspr2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, X.getID(mRS), Y.getID(mRS), 0, Ap.getID(mRS), incX, incY, 0, 0);
1791     }
1792 
1793 
1794     /**
1795      * Level 2, C and Z only
1796      */
1797 
validateGERU(Element e, Allocation X, int incX, Allocation Y, int incY, Allocation A)1798     static void validateGERU(Element e, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
1799         if (!A.getType().getElement().isCompatible(e) ||
1800             !X.getType().getElement().isCompatible(e) ||
1801             !Y.getType().getElement().isCompatible(e)) {
1802             throw new RSRuntimeException("Called BLAS with wrong Element type");
1803         }
1804         if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
1805             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
1806         }
1807 
1808         int M = A.getType().getY();
1809         int N = A.getType().getX();
1810         if (incX <= 0 || incY <= 0) {
1811             throw new RSRuntimeException("Vector increments must be greater than 0");
1812         }
1813         int expectedXDim = 1 + (M - 1) * incX;
1814         if (X.getType().getX() != expectedXDim) {
1815             throw new RSRuntimeException("Incorrect vector dimensions for GERU");
1816         }
1817         int expectedYDim = 1 + (N - 1) * incY;
1818         if (Y.getType().getX() != expectedYDim) {
1819             throw new RSRuntimeException("Incorrect vector dimensions for GERU");
1820         }
1821 
1822     }
1823 
1824     /**
1825      * CHEMV performs the matrix-vector operation
1826      * y := alpha*A*x + beta*y
1827      *
1828      * Details: http://www.netlib.org/lapack/explore-html/d7/d51/chemv_8f.html
1829      *
1830      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
1831      * @param alpha The scalar alpha.
1832      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
1833      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
1834      * @param incX The increment for the elements of vector x, must be larger than zero.
1835      * @param beta The scalar beta.
1836      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
1837      * @param incY The increment for the elements of vector y, must be larger than zero.
1838      */
CHEMV(@plo int Uplo, Float2 alpha, Allocation A, Allocation X, int incX, Float2 beta, Allocation Y, int incY)1839     public void CHEMV(@Uplo int Uplo, Float2 alpha, Allocation A, Allocation X, int incX, Float2 beta, Allocation Y, int incY) {
1840         // HEMV is the same as SYR2 validation-wise
1841         int N = validateSYR2(Element.F32_2(mRS), Uplo, X, incX, Y, incY, A);
1842         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_chemv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, A.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, 0, 0);
1843     }
1844 
1845     /**
1846      * CHBMV performs the matrix-vector operation
1847      * y := alpha*A*x + beta*y
1848      *
1849      * Details: http://www.netlib.org/lapack/explore-html/db/dc2/chbmv_8f.html
1850      *
1851      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
     *       but only the region N*(K+1) will be referenced. The following subroutine is an
     *       example showing how to convert an UPPER triangular matrix 'a' to row-based band matrix 'b'.
1854      *           for i in range(0, n):
1855      *              for j in range(i, min(i+k+1, n)):
1856      *                  b[i, j-i] = a[i, j]
1857      *
1858      * @param Uplo Specifies whether the upper or lower triangular part of the band matrix A is being supplied.
1859      * @param K The number of off-diagonals of the matrix A
1860      * @param alpha The scalar alpha.
1861      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
1862      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
1863      * @param incX The increment for the elements of vector x, must be larger than zero.
1864      * @param beta The scalar beta.
1865      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
1866      * @param incY The increment for the elements of vector y, must be larger than zero.
1867      */
CHBMV(@plo int Uplo, int K, Float2 alpha, Allocation A, Allocation X, int incX, Float2 beta, Allocation Y, int incY)1868     public void CHBMV(@Uplo int Uplo, int K, Float2 alpha, Allocation A, Allocation X, int incX, Float2 beta, Allocation Y, int incY) {
1869         // HBMV is the same as SYR2 validation-wise
1870         int N = validateSYR2(Element.F32_2(mRS), Uplo, X, incX, Y, incY, A);
1871         if (K < 0) {
1872             throw new RSRuntimeException("K must be 0 or greater for HBMV");
1873         }
1874         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_chbmv, 0, 0, 0, Uplo, 0, 0, N, K, alpha.x, alpha.y, A.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, 0, 0);
1875     }
1876 
1877     /**
1878      * CHPMV performs the matrix-vector operation
1879      * y := alpha*A*x + beta*y
1880      *
1881      * Details: http://www.netlib.org/lapack/explore-html/d2/d06/chpmv_8f.html
1882      *
     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2.
     *       The following subroutine is an example showing how to convert an UPPER triangular matrix
1885      *       'a' to packed matrix 'b'.
1886      *           k = 0
1887      *           for i in range(0, n):
1888      *              for j in range(i, n):
1889      *                  b[k++] = a[i, j]
1890      *
1891      * @param Uplo Specifies whether the upper or lower triangular part of the matrix A is supplied in packed form.
1892      * @param alpha The scalar alpha.
1893      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
1894      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
1895      * @param incX The increment for the elements of vector x, must be larger than zero.
1896      * @param beta The scalar beta.
1897      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
1898      * @param incY The increment for the elements of vector y, must be larger than zero.
1899      */
CHPMV(@plo int Uplo, Float2 alpha, Allocation Ap, Allocation X, int incX, Float2 beta, Allocation Y, int incY)1900     public void CHPMV(@Uplo int Uplo, Float2 alpha, Allocation Ap, Allocation X, int incX, Float2 beta, Allocation Y, int incY) {
1901         // HPMV is the same as SPR2
1902         int N = validateSPR2(Element.F32_2(mRS), Uplo, X, incX, Y, incY, Ap);
1903         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_chpmv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, Ap.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, 0, 0);
1904     }
1905 
1906     /**
1907      * CGERU performs the rank 1 operation
1908      * A := alpha*x*y**T + A
1909      *
1910      * Details: http://www.netlib.org/lapack/explore-html/db/d5f/cgeru_8f.html
1911      *
1912      * @param alpha The scalar alpha.
1913      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
1914      * @param incX The increment for the elements of vector x, must be larger than zero.
1915      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
1916      * @param incY The increment for the elements of vector y, must be larger than zero.
1917      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
1918      */
CGERU(Float2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A)1919     public void CGERU(Float2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
1920         validateGERU(Element.F32_2(mRS), X, incX, Y, incY, A);
1921         int M = A.getType().getY();
1922         int N = A.getType().getX();
1923         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cgeru, 0, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, X.getID(mRS), Y.getID(mRS), 0, 0, A.getID(mRS), incX, incY, 0, 0);
1924     }
1925 
1926     /**
1927      * CGERC performs the rank 1 operation
1928      * A := alpha*x*y**H + A
1929      *
1930      * Details: http://www.netlib.org/lapack/explore-html/dd/d84/cgerc_8f.html
1931      *
1932      * @param alpha The scalar alpha.
1933      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
1934      * @param incX The increment for the elements of vector x, must be larger than zero.
1935      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
1936      * @param incY The increment for the elements of vector y, must be larger than zero.
1937      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
1938      */
CGERC(Float2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A)1939     public void CGERC(Float2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
1940         // same as GERU
1941         validateGERU(Element.F32_2(mRS), X, incX, Y, incY, A);
1942         int M = A.getType().getY();
1943         int N = A.getType().getX();
1944         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cgerc, 0, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, X.getID(mRS), Y.getID(mRS), 0, 0, A.getID(mRS), incX, incY, 0, 0);
1945     }
1946 
1947     /**
1948      * CHER performs the rank 1 operation
1949      * A := alpha*x*x**H + A
1950      *
1951      * Details: http://www.netlib.org/lapack/explore-html/d3/d6d/cher_8f.html
1952      *
1953      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
1954      * @param alpha The scalar alpha.
1955      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
1956      * @param incX The increment for the elements of vector x, must be larger than zero.
1957      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
1958      */
CHER(@plo int Uplo, float alpha, Allocation X, int incX, Allocation A)1959     public void CHER(@Uplo int Uplo, float alpha, Allocation X, int incX, Allocation A) {
1960         // same as SYR
1961         int N = validateSYR(Element.F32_2(mRS), Uplo, X, incX, A);
1962         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cher, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, 0, X.getID(mRS), 0, 0, 0, A.getID(mRS), incX, 0, 0, 0);
1963     }
1964 
1965     /**
1966      * CHPR performs the rank 1 operation
1967      * A := alpha*x*x**H + A
1968      *
1969      * Details: http://www.netlib.org/lapack/explore-html/db/dcd/chpr_8f.html
1970      *
     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2.
     *       The following subroutine is an example showing how to convert an UPPER triangular matrix
1973      *       'a' to packed matrix 'b'.
1974      *           k = 0
1975      *           for i in range(0, n):
1976      *              for j in range(i, n):
1977      *                  b[k++] = a[i, j]
1978      *
1979      * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
1980      * @param alpha The scalar alpha.
1981      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
1982      * @param incX The increment for the elements of vector x, must be larger than zero.
1983      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
1984      */
CHPR(@plo int Uplo, float alpha, Allocation X, int incX, Allocation Ap)1985     public void CHPR(@Uplo int Uplo, float alpha, Allocation X, int incX, Allocation Ap) {
1986         // equivalent to SPR for validation
1987         int N = validateSPR(Element.F32_2(mRS), Uplo, X, incX, Ap);
1988         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_chpr, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, 0, X.getID(mRS), 0, 0, 0, Ap.getID(mRS), incX, 0, 0, 0);
1989     }
1990 
1991     /**
     * CHER2 performs the hermitian rank 2 operation
     * A := alpha*x*y**H + conjg(alpha)*y*x**H + A
1994      *
1995      * Details: http://www.netlib.org/lapack/explore-html/db/d87/cher2_8f.html
1996      *
1997      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
1998      * @param alpha The scalar alpha.
1999      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
2000      * @param incX The increment for the elements of vector x, must be larger than zero.
2001      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
2002      * @param incY The increment for the elements of vector y, must be larger than zero.
2003      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
2004      */
CHER2(@plo int Uplo, Float2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A)2005     public void CHER2(@Uplo int Uplo, Float2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
2006         // same as SYR2
2007         int N = validateSYR2(Element.F32_2(mRS), Uplo, X, incX, Y, incY, A);
2008         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cher2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, X.getID(mRS), Y.getID(mRS), 0, 0, A.getID(mRS), incX, incY, 0, 0);
2009     }
2010 
2011     /**
     * CHPR2 performs the hermitian rank 2 operation
     * A := alpha*x*y**H + conjg(alpha)*y*x**H + A
2014      *
2015      * Details: http://www.netlib.org/lapack/explore-html/d6/d44/chpr2_8f.html
2016      *
     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2.
     *       The following subroutine is an example showing how to convert an UPPER triangular matrix
2019      *       'a' to packed matrix 'b'.
2020      *           k = 0
2021      *           for i in range(0, n):
2022      *              for j in range(i, n):
2023      *                  b[k++] = a[i, j]
2024      *
2025      * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
2026      * @param alpha The scalar alpha.
2027      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
2028      * @param incX The increment for the elements of vector x, must be larger than zero.
2029      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
2030      * @param incY The increment for the elements of vector y, must be larger than zero.
2031      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
2032      */
CHPR2(@plo int Uplo, Float2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation Ap)2033     public void CHPR2(@Uplo int Uplo, Float2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation Ap) {
2034         // same as SPR2
2035         int N = validateSPR2(Element.F32_2(mRS), Uplo, X, incX, Y, incY, Ap);
2036         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_chpr2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, X.getID(mRS), Y.getID(mRS), 0, 0, Ap.getID(mRS), incX, incY, 0, 0);
2037     }
2038 
2039     /**
2040      * ZHEMV performs the matrix-vector operation
2041      * y := alpha*A*x + beta*y
2042      *
2043      * Details: http://www.netlib.org/lapack/explore-html/d0/ddd/zhemv_8f.html
2044      *
2045      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
2046      * @param alpha The scalar alpha.
2047      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
2048      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
2049      * @param incX The increment for the elements of vector x, must be larger than zero.
2050      * @param beta The scalar beta.
2051      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
2052      * @param incY The increment for the elements of vector y, must be larger than zero.
2053      */
ZHEMV(@plo int Uplo, Double2 alpha, Allocation A, Allocation X, int incX, Double2 beta, Allocation Y, int incY)2054     public void ZHEMV(@Uplo int Uplo, Double2 alpha, Allocation A, Allocation X, int incX, Double2 beta, Allocation Y, int incY) {
2055         // HEMV is the same as SYR2 validation-wise
2056         int N = validateSYR2(Element.F64_2(mRS), Uplo, X, incX, Y, incY, A);
2057         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zhemv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, A.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, 0, 0);
2058     }
2059 
2060     /**
2061      * ZHBMV performs the matrix-vector operation
2062      * y := alpha*A*x + beta*y
2063      *
2064      * Details: http://www.netlib.org/lapack/explore-html/d3/d1a/zhbmv_8f.html
2065      *
2066      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
     *       but only the region N*(K+1) will be referenced. The following subroutine is an
     *       example showing how to convert an UPPER triangular matrix 'a' to row-based band matrix 'b'.
2069      *           for i in range(0, n):
2070      *              for j in range(i, min(i+k+1, n)):
2071      *                  b[i, j-i] = a[i, j]
2072      *
2073      * @param Uplo Specifies whether the upper or lower triangular part of the band matrix A is being supplied.
2074      * @param K The number of off-diagonals of the matrix A
2075      * @param alpha The scalar alpha.
2076      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
2077      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
2078      * @param incX The increment for the elements of vector x, must be larger than zero.
2079      * @param beta The scalar beta.
2080      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
2081      * @param incY The increment for the elements of vector y, must be larger than zero.
2082      */
ZHBMV(@plo int Uplo, int K, Double2 alpha, Allocation A, Allocation X, int incX, Double2 beta, Allocation Y, int incY)2083     public void ZHBMV(@Uplo int Uplo, int K, Double2 alpha, Allocation A, Allocation X, int incX, Double2 beta, Allocation Y, int incY) {
2084         // HBMV is the same as SYR2 validation-wise
2085         int N = validateSYR2(Element.F64_2(mRS), Uplo, X, incX, Y, incY, A);
2086         if (K < 0) {
2087             throw new RSRuntimeException("K must be 0 or greater for HBMV");
2088         }
2089         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zhbmv, 0, 0, 0, Uplo, 0, 0, N, K, alpha.x, alpha.y, A.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, 0, 0);
2090     }
2091 
2092     /**
2093      * ZHPMV performs the matrix-vector operation
2094      * y := alpha*A*x + beta*y
2095      *
2096      * Details: http://www.netlib.org/lapack/explore-html/d0/d60/zhpmv_8f.html
2097      *
     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2.
     *       The following subroutine is an example showing how to convert an UPPER triangular matrix
2100      *       'a' to packed matrix 'b'.
2101      *           k = 0
2102      *           for i in range(0, n):
2103      *              for j in range(i, n):
2104      *                  b[k++] = a[i, j]
2105      *
2106      * @param Uplo Specifies whether the upper or lower triangular part of the matrix A is supplied in packed form.
2107      * @param alpha The scalar alpha.
2108      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
2109      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
2110      * @param incX The increment for the elements of vector x, must be larger than zero.
2111      * @param beta The scalar beta.
2112      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
2113      * @param incY The increment for the elements of vector y, must be larger than zero.
2114      */
ZHPMV(@plo int Uplo, Double2 alpha, Allocation Ap, Allocation X, int incX, Double2 beta, Allocation Y, int incY)2115     public void ZHPMV(@Uplo int Uplo, Double2 alpha, Allocation Ap, Allocation X, int incX, Double2 beta, Allocation Y, int incY) {
2116         // HPMV is the same as SPR2
2117         int N = validateSPR2(Element.F64_2(mRS), Uplo, X, incX, Y, incY, Ap);
2118         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zhpmv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, Ap.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, 0, 0);
2119     }
2120 
2121     /**
2122      * ZGERU performs the rank 1 operation
2123      * A := alpha*x*y**T + A
2124      *
2125      * Details: http://www.netlib.org/lapack/explore-html/d7/d12/zgeru_8f.html
2126      *
2127      * @param alpha The scalar alpha.
2128      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
2129      * @param incX The increment for the elements of vector x, must be larger than zero.
2130      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
2131      * @param incY The increment for the elements of vector y, must be larger than zero.
2132      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
2133      */
ZGERU(Double2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A)2134     public void ZGERU(Double2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
2135         validateGERU(Element.F64_2(mRS), X, incX, Y, incY, A);
2136         int M = A.getType().getY();
2137         int N = A.getType().getX();
2138         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zgeru, 0, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, X.getID(mRS), Y.getID(mRS), 0, 0, A.getID(mRS), incX, incY, 0, 0);
2139     }
2140 
2141     /**
2142      * ZGERC performs the rank 1 operation
2143      * A := alpha*x*y**H + A
2144      *
2145      * Details: http://www.netlib.org/lapack/explore-html/d3/dad/zgerc_8f.html
2146      *
2147      * @param alpha The scalar alpha.
2148      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
2149      * @param incX The increment for the elements of vector x, must be larger than zero.
2150      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
2151      * @param incY The increment for the elements of vector y, must be larger than zero.
2152      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
2153      */
ZGERC(Double2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A)2154     public void ZGERC(Double2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
2155         // same as GERU
2156         validateGERU(Element.F64_2(mRS), X, incX, Y, incY, A);
2157         int M = A.getType().getY();
2158         int N = A.getType().getX();
2159         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zgerc, 0, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, X.getID(mRS), Y.getID(mRS), 0, 0, A.getID(mRS), incX, incY, 0, 0);
2160     }
2161 
2162     /**
2163      * ZHER performs the rank 1 operation
2164      * A := alpha*x*x**H + A
2165      *
2166      * Details: http://www.netlib.org/lapack/explore-html/de/d0e/zher_8f.html
2167      *
2168      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
2169      * @param alpha The scalar alpha.
2170      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
2171      * @param incX The increment for the elements of vector x, must be larger than zero.
2172      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
2173      */
ZHER(@plo int Uplo, double alpha, Allocation X, int incX, Allocation A)2174     public void ZHER(@Uplo int Uplo, double alpha, Allocation X, int incX, Allocation A) {
2175         // same as SYR
2176         int N = validateSYR(Element.F64_2(mRS), Uplo, X, incX, A);
2177         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zher, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, 0, X.getID(mRS), 0, 0, 0, A.getID(mRS), incX, 0, 0, 0);
2178     }
2179 
2180     /**
2181      * ZHPR performs the rank 1 operation
2182      * A := alpha*x*x**H + A
2183      *
2184      * Details: http://www.netlib.org/lapack/explore-html/de/de1/zhpr_8f.html
2185      *
2186      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
2187      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
2188      *       'a' to packed matrix 'b'.
2189      *           k = 0
2190      *           for i in range(0, n):
2191      *              for j in range(i, n):
2192      *                  b[k++] = a[i, j]
2193      *
2194      * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
2195      * @param alpha The scalar alpha.
2196      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
2197      * @param incX The increment for the elements of vector x, must be larger than zero.
2198      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
2199      */
ZHPR(@plo int Uplo, double alpha, Allocation X, int incX, Allocation Ap)2200     public void ZHPR(@Uplo int Uplo, double alpha, Allocation X, int incX, Allocation Ap) {
2201         // equivalent to SPR for validation
2202         int N = validateSPR(Element.F64_2(mRS), Uplo, X, incX, Ap);
2203         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zhpr, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, 0, X.getID(mRS), 0, 0, 0, Ap.getID(mRS), incX, 0, 0, 0);
2204     }
2205 
2206     /**
2207      * ZHER2 performs the symmetric rank 2 operation
2208      * A := alpha*x*y**H + alpha*y*x**H + A
2209      *
2210      * Details: http://www.netlib.org/lapack/explore-html/da/d8a/zher2_8f.html
2211      *
2212      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
2213      * @param alpha The scalar alpha.
2214      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
2215      * @param incX The increment for the elements of vector x, must be larger than zero.
2216      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
2217      * @param incY The increment for the elements of vector y, must be larger than zero.
2218      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
2219      */
ZHER2(@plo int Uplo, Double2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A)2220     public void ZHER2(@Uplo int Uplo, Double2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
2221         // same as SYR2
2222         int N = validateSYR2(Element.F64_2(mRS), Uplo, X, incX, Y, incY, A);
2223         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zher2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, X.getID(mRS), Y.getID(mRS), 0, 0, A.getID(mRS), incX, incY, 0, 0);
2224     }
2225 
2226     /**
2227      * ZHPR2 performs the symmetric rank 2 operation
2228      * A := alpha*x*y**H + alpha*y*x**H + A
2229      *
2230      * Details: http://www.netlib.org/lapack/explore-html/d5/d52/zhpr2_8f.html
2231      *
2232      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
2233      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
2234      *       'a' to packed matrix 'b'.
2235      *           k = 0
2236      *           for i in range(0, n):
2237      *              for j in range(i, n):
2238      *                  b[k++] = a[i, j]
2239      *
2240      * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
2241      * @param alpha The scalar alpha.
2242      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
2243      * @param incX The increment for the elements of vector x, must be larger than zero.
2244      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
2245      * @param incY The increment for the elements of vector y, must be larger than zero.
2246      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
2247      */
ZHPR2(@plo int Uplo, Double2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation Ap)2248     public void ZHPR2(@Uplo int Uplo, Double2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation Ap) {
2249         // same as SPR2
2250         int N = validateSPR2(Element.F64_2(mRS), Uplo, X, incX, Y, incY, Ap);
2251         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zhpr2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, X.getID(mRS), Y.getID(mRS), 0, 0, Ap.getID(mRS), incX, incY, 0, 0);
2252     }
2253 
2254 
2255     /**
2256      * Level 3 BLAS
2257      */
2258 
validateL3(Element e, int TransA, int TransB, int Side, Allocation A, Allocation B, Allocation C)2259     static void validateL3(Element e, int TransA, int TransB, int Side, Allocation A, Allocation B, Allocation C) {
2260         int aM = -1, aN = -1, bM = -1, bN = -1, cM = -1, cN = -1;
2261         if ((A != null && !A.getType().getElement().isCompatible(e)) ||
2262             (B != null && !B.getType().getElement().isCompatible(e)) ||
2263             (C != null && !C.getType().getElement().isCompatible(e))) {
2264             throw new RSRuntimeException("Called BLAS with wrong Element type");
2265         }
2266         if (C == null) {
2267             //since matrix C is used to store the result, it cannot be null.
2268             throw new RSRuntimeException("Allocation C cannot be null");
2269         }
2270         cM = C.getType().getY();
2271         cN = C.getType().getX();
2272 
2273         if (Side == RIGHT) {
2274             if ((A == null && B != null) || (A != null && B == null)) {
2275                 throw new RSRuntimeException("Provided Matrix A without Matrix B, or vice versa");
2276             }
2277             if (B != null) {
2278                 bM = A.getType().getY();
2279                 bN = A.getType().getX();
2280             }
2281             if (A != null) {
2282                 aM = B.getType().getY();
2283                 aN = B.getType().getX();
2284             }
2285         } else {
2286             if (A != null) {
2287                 if (TransA == TRANSPOSE || TransA == CONJ_TRANSPOSE) {
2288                     aN = A.getType().getY();
2289                     aM = A.getType().getX();
2290                 } else {
2291                     aM = A.getType().getY();
2292                     aN = A.getType().getX();
2293                 }
2294             }
2295             if (B != null) {
2296                 if (TransB == TRANSPOSE || TransB == CONJ_TRANSPOSE) {
2297                     bN = B.getType().getY();
2298                     bM = B.getType().getX();
2299                 } else {
2300                     bM = B.getType().getY();
2301                     bN = B.getType().getX();
2302                 }
2303             }
2304         }
2305         if (A != null && B != null && C != null) {
2306             if (aN != bM || aM != cM || bN != cN) {
2307                 throw new RSRuntimeException("Called BLAS with invalid dimensions");
2308             }
2309         } else if (A != null && C != null) {
2310             // A and C only, for SYRK
2311             if (cM != cN) {
2312                 throw new RSRuntimeException("Matrix C is not symmetric");
2313             }
2314             if (aM != cM) {
2315                 throw new RSRuntimeException("Called BLAS with invalid dimensions");
2316             }
2317         } else if (A != null && B != null) {
2318             // A and B only
2319             if (aN != bM) {
2320                 throw new RSRuntimeException("Called BLAS with invalid dimensions");
2321             }
2322         }
2323 
2324     }
2325 
2326     /**
2327      * SGEMM performs one of the matrix-matrix operations
2328      * C := alpha*op(A)*op(B) + beta*C   where op(X) is one of op(X) = X  or  op(X) = X**T
2329      *
2330      * Details: http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html
2331      *
2332      * @param TransA The type of transpose applied to matrix A.
2333      * @param TransB The type of transpose applied to matrix B.
2334      * @param alpha The scalar alpha.
2335      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
2336      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32}.
2337      * @param beta The scalar beta.
2338      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32}.
2339      */
SGEMM(@ranspose int TransA, @Transpose int TransB, float alpha, Allocation A, Allocation B, float beta, Allocation C)2340     public void SGEMM(@Transpose int TransA, @Transpose int TransB, float alpha, Allocation A,
2341                       Allocation B, float beta, Allocation C) {
2342         validateTranspose(TransA);
2343         validateTranspose(TransB);
2344         validateL3(Element.F32(mRS), TransA, TransB, 0, A, B, C);
2345 
2346         int M = -1, N = -1, K = -1;
2347         if (TransA != NO_TRANSPOSE) {
2348             M = A.getType().getX();
2349             K = A.getType().getY();
2350         } else {
2351             M = A.getType().getY();
2352             K = A.getType().getX();
2353         }
2354         if (TransB != NO_TRANSPOSE) {
2355             N = B.getType().getY();
2356         } else {
2357             N = B.getType().getX();
2358         }
2359         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sgemm, TransA, TransB, 0, 0, 0, M, N, K,  alpha, A.getID(mRS), B.getID(mRS),
2360                                         beta, C.getID(mRS), 0, 0, 0, 0);
2361     }
2362 
2363     /**
2364      * DGEMM performs one of the matrix-matrix operations
2365      * C := alpha*op(A)*op(B) + beta*C   where op(X) is one of op(X) = X  or  op(X) = X**T
2366      *
2367      * Details: http://www.netlib.org/lapack/explore-html/d7/d2b/dgemm_8f.html
2368      *
2369      * @param TransA The type of transpose applied to matrix A.
2370      * @param TransB The type of transpose applied to matrix B.
2371      * @param alpha The scalar alpha.
2372      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
2373      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64}.
2374      * @param beta The scalar beta.
2375      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64}.
2376      */
DGEMM(@ranspose int TransA, @Transpose int TransB, double alpha, Allocation A, Allocation B, double beta, Allocation C)2377     public void DGEMM(@Transpose int TransA, @Transpose int TransB, double alpha, Allocation A,
2378                       Allocation B, double beta, Allocation C) {
2379         validateTranspose(TransA);
2380         validateTranspose(TransB);
2381         validateL3(Element.F64(mRS), TransA, TransB, 0, A, B, C);
2382         int M = -1, N = -1, K = -1;
2383         if (TransA != NO_TRANSPOSE) {
2384             M = A.getType().getX();
2385             K = A.getType().getY();
2386         } else {
2387             M = A.getType().getY();
2388             K = A.getType().getX();
2389         }
2390         if (TransB != NO_TRANSPOSE) {
2391             N = B.getType().getY();
2392         } else {
2393             N = B.getType().getX();
2394         }
2395         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dgemm, TransA, TransB, 0, 0, 0, M, N, K,  alpha, A.getID(mRS), B.getID(mRS),
2396                                         beta, C.getID(mRS), 0, 0, 0, 0);
2397     }
2398 
2399     /**
2400      * CGEMM performs one of the matrix-matrix operations
2401      * C := alpha*op(A)*op(B) + beta*C   where op(X) is one of op(X) = X  or  op(X) = X**T  or  op(X) = X**H
2402      *
2403      * Details: http://www.netlib.org/lapack/explore-html/d6/d5b/cgemm_8f.html
2404      *
2405      * @param TransA The type of transpose applied to matrix A.
2406      * @param TransB The type of transpose applied to matrix B.
2407      * @param alpha The scalar alpha.
2408      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
2409      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
2410      * @param beta The scalar beta.
2411      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
2412      */
CGEMM(@ranspose int TransA, @Transpose int TransB, Float2 alpha, Allocation A, Allocation B, Float2 beta, Allocation C)2413     public void CGEMM(@Transpose int TransA, @Transpose int TransB, Float2 alpha, Allocation A,
2414                       Allocation B, Float2 beta, Allocation C) {
2415         validateTranspose(TransA);
2416         validateTranspose(TransB);
2417         validateL3(Element.F32_2(mRS), TransA, TransB, 0, A, B, C);
2418         int M = -1, N = -1, K = -1;
2419         if (TransA != NO_TRANSPOSE) {
2420             M = A.getType().getX();
2421             K = A.getType().getY();
2422         } else {
2423             M = A.getType().getY();
2424             K = A.getType().getX();
2425         }
2426         if (TransB != NO_TRANSPOSE) {
2427             N = B.getType().getY();
2428         } else {
2429             N = B.getType().getX();
2430         }
2431         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cgemm, TransA, TransB, 0, 0, 0, M, N, K,  alpha.x, alpha.y, A.getID(mRS), B.getID(mRS),
2432                                          beta.x, beta.y, C.getID(mRS), 0, 0, 0, 0);
2433     }
2434 
2435     /**
2436      * ZGEMM performs one of the matrix-matrix operations
2437      * C := alpha*op(A)*op(B) + beta*C   where op(X) is one of op(X) = X  or  op(X) = X**T  or  op(X) = X**H
2438      *
2439      * Details: http://www.netlib.org/lapack/explore-html/d7/d76/zgemm_8f.html
2440      *
2441      * @param TransA The type of transpose applied to matrix A.
2442      * @param TransB The type of transpose applied to matrix B.
2443      * @param alpha The scalar alpha.
2444      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
2445      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
2446      * @param beta The scalar beta.
2447      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
2448      */
ZGEMM(@ranspose int TransA, @Transpose int TransB, Double2 alpha, Allocation A, Allocation B, Double2 beta, Allocation C)2449     public void ZGEMM(@Transpose int TransA, @Transpose int TransB, Double2 alpha, Allocation A,
2450                       Allocation B, Double2 beta, Allocation C) {
2451         validateTranspose(TransA);
2452         validateTranspose(TransB);
2453         validateL3(Element.F64_2(mRS), TransA, TransB, 0, A, B, C);
2454         int M = -1, N = -1, K = -1;
2455         if (TransA != NO_TRANSPOSE) {
2456             M = A.getType().getX();
2457             K = A.getType().getY();
2458         } else {
2459             M = A.getType().getY();
2460             K = A.getType().getX();
2461         }
2462         if (TransB != NO_TRANSPOSE) {
2463             N = B.getType().getY();
2464         } else {
2465             N = B.getType().getX();
2466         }
2467         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zgemm, TransA, TransB, 0, 0, 0, M, N, K,  alpha.x, alpha.y, A.getID(mRS), B.getID(mRS),
2468                                    beta.x, beta.y, C.getID(mRS), 0, 0, 0, 0);
2469     }
2470 
2471     /**
2472      * SSYMM performs one of the matrix-matrix operations
2473      * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
2474      *
2475      * Details: http://www.netlib.org/lapack/explore-html/d7/d42/ssymm_8f.html
2476      *
2477      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
2478      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
2479      * @param alpha The scalar alpha.
2480      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
2481      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32}.
2482      * @param beta The scalar beta.
2483      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32}.
2484      */
SSYMM(@ide int Side, @Uplo int Uplo, float alpha, Allocation A, Allocation B, float beta, Allocation C)2485     public void SSYMM(@Side int Side, @Uplo int Uplo, float alpha, Allocation A,
2486                       Allocation B, float beta, Allocation C) {
2487         validateSide(Side);
2488         validateUplo(Uplo);
2489         //For SYMM, Matrix A should be symmetric
2490         if (A.getType().getX() != A.getType().getY()) {
2491             throw new RSRuntimeException("Matrix A is not symmetric");
2492         }
2493         validateL3(Element.F32(mRS), 0, 0, Side, A, B, C);
2494         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssymm, 0, 0, Side, Uplo, 0, C.getType().getY(), C.getType().getX(), 0, alpha, A.getID(mRS), B.getID(mRS),
2495                                         beta, C.getID(mRS), 0, 0, 0, 0);
2496     }
2497 
2498     /**
2499      * DSYMM performs one of the matrix-matrix operations
2500      * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
2501      *
2502      * Details: http://www.netlib.org/lapack/explore-html/d8/db0/dsymm_8f.html
2503      *
2504      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
2505      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
2506      * @param alpha The scalar alpha.
2507      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
2508      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64}.
2509      * @param beta The scalar beta.
2510      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64}.
2511      */
DSYMM(@ide int Side, @Uplo int Uplo, double alpha, Allocation A, Allocation B, double beta, Allocation C)2512     public void DSYMM(@Side int Side, @Uplo int Uplo, double alpha, Allocation A,
2513                       Allocation B, double beta, Allocation C) {
2514         validateSide(Side);
2515         validateUplo(Uplo);
2516         if (A.getType().getX() != A.getType().getY()) {
2517             throw new RSRuntimeException("Matrix A is not symmetric");
2518         }
2519         validateL3(Element.F64(mRS), 0, 0, Side, A, B, C);
2520         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsymm, 0, 0, Side, Uplo, 0, C.getType().getY(), C.getType().getX(), 0, alpha, A.getID(mRS), B.getID(mRS),
2521                                         beta, C.getID(mRS), 0, 0, 0, 0);
2522     }
2523 
2524     /**
2525      * CSYMM performs one of the matrix-matrix operations
2526      * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
2527      *
2528      * Details: http://www.netlib.org/lapack/explore-html/db/d59/csymm_8f.html
2529      *
2530      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
2531      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
2532      * @param alpha The scalar alpha.
2533      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
2534      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
2535      * @param beta The scalar beta.
2536      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
2537      */
CSYMM(@ide int Side, @Uplo int Uplo, Float2 alpha, Allocation A, Allocation B, Float2 beta, Allocation C)2538     public void CSYMM(@Side int Side, @Uplo int Uplo, Float2 alpha, Allocation A,
2539                       Allocation B, Float2 beta, Allocation C) {
2540         validateSide(Side);
2541         validateUplo(Uplo);
2542         if (A.getType().getX() != A.getType().getY()) {
2543             throw new RSRuntimeException("Matrix A is not symmetric");
2544         }
2545         validateL3(Element.F32_2(mRS), 0, 0, Side, A, B, C);
2546         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_csymm, 0, 0, Side, Uplo, 0, C.getType().getY(), C.getType().getX(), 0, alpha.x, alpha.y, A.getID(mRS), B.getID(mRS),
2547                                          beta.x, beta.y, C.getID(mRS), 0, 0, 0, 0);
2548     }
2549 
2550     /**
2551      * ZSYMM performs one of the matrix-matrix operations
2552      * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
2553      *
2554      * Details: http://www.netlib.org/lapack/explore-html/df/d51/zsymm_8f.html
2555      *
2556      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
2557      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
2558      * @param alpha The scalar alpha.
2559      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
2560      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
2561      * @param beta The scalar beta.
2562      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
2563      */
ZSYMM(@ide int Side, @Uplo int Uplo, Double2 alpha, Allocation A, Allocation B, Double2 beta, Allocation C)2564     public void ZSYMM(@Side int Side, @Uplo int Uplo, Double2 alpha, Allocation A,
2565                       Allocation B, Double2 beta, Allocation C) {
2566         validateSide(Side);
2567         validateUplo(Uplo);
2568         if (A.getType().getX() != A.getType().getY()) {
2569             throw new RSRuntimeException("Matrix A is not symmetric");
2570         }
2571         validateL3(Element.F64_2(mRS), 0, 0, Side, A, B, C);
2572         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zsymm, 0, 0, Side, Uplo, 0, C.getType().getY(), C.getType().getX(), 0, alpha.x, alpha.y, A.getID(mRS), B.getID(mRS),
2573                                    beta.x, beta.y, C.getID(mRS), 0, 0, 0, 0);
2574     }
2575 
2576     /**
2577      * SSYRK performs one of the symmetric rank k operations
2578      * C := alpha*A*A**T + beta*C   or   C := alpha*A**T*A + beta*C
2579      *
2580      * Details: http://www.netlib.org/lapack/explore-html/d0/d40/ssyrk_8f.html
2581      *
2582      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
2583      * @param Trans The type of transpose applied to the operation.
2584      * @param alpha The scalar alpha.
2585      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
2586      * @param beta The scalar beta.
2587      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32}.
2588      */
SSYRK(@plo int Uplo, @Transpose int Trans, float alpha, Allocation A, float beta, Allocation C)2589     public void SSYRK(@Uplo int Uplo, @Transpose int Trans, float alpha, Allocation A, float beta, Allocation C) {
2590         validateTranspose(Trans);
2591         validateUplo(Uplo);
2592         validateL3(Element.F32(mRS), Trans, 0, 0, A, null, C);
2593         int K = -1;
2594         if (Trans != NO_TRANSPOSE) {
2595             K = A.getType().getY();
2596         } else {
2597             K = A.getType().getX();
2598         }
2599 
2600         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssyrk, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha, A.getID(mRS), 0, beta, C.getID(mRS), 0, 0, 0, 0);
2601     }
2602 
2603     /**
2604      * DSYRK performs one of the symmetric rank k operations
2605      * C := alpha*A*A**T + beta*C   or   C := alpha*A**T*A + beta*C
2606      *
2607      * Details: http://www.netlib.org/lapack/explore-html/dc/d05/dsyrk_8f.html
2608      *
2609      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
2610      * @param Trans The type of transpose applied to the operation.
2611      * @param alpha The scalar alpha.
2612      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
2613      * @param beta The scalar beta.
2614      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64}.
2615      */
DSYRK(@plo int Uplo, @Transpose int Trans, double alpha, Allocation A, double beta, Allocation C)2616     public void DSYRK(@Uplo int Uplo, @Transpose int Trans, double alpha, Allocation A, double beta, Allocation C) {
2617         validateTranspose(Trans);
2618         validateUplo(Uplo);
2619         validateL3(Element.F64(mRS), Trans, 0, 0, A, null, C);
2620         int K = -1;
2621         if (Trans != NO_TRANSPOSE) {
2622             K = A.getType().getY();
2623         } else {
2624             K = A.getType().getX();
2625         }
2626         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsyrk, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha, A.getID(mRS), 0, beta, C.getID(mRS), 0, 0, 0, 0);
2627     }
2628 
2629     /**
2630      * CSYRK performs one of the symmetric rank k operations
2631      * C := alpha*A*A**T + beta*C   or   C := alpha*A**T*A + beta*C
2632      *
2633      * Details: http://www.netlib.org/lapack/explore-html/d3/d6a/csyrk_8f.html
2634      *
2635      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
2636      * @param Trans The type of transpose applied to the operation.
2637      * @param alpha The scalar alpha.
2638      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
2639      * @param beta The scalar beta.
2640      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
2641      */
CSYRK(@plo int Uplo, @Transpose int Trans, Float2 alpha, Allocation A, Float2 beta, Allocation C)2642     public void CSYRK(@Uplo int Uplo, @Transpose int Trans, Float2 alpha, Allocation A, Float2 beta, Allocation C) {
2643         validateTranspose(Trans);
2644         validateUplo(Uplo);
2645         validateL3(Element.F32_2(mRS), Trans, 0, 0, A, null, C);
2646         int K = -1;
2647         if (Trans != NO_TRANSPOSE) {
2648             K = A.getType().getY();
2649         } else {
2650             K = A.getType().getX();
2651         }
2652         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_csyrk, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha.x, alpha.y, A.getID(mRS), 0, beta.x, beta.y,
2653                                          C.getID(mRS), 0, 0, 0, 0);
2654     }
2655 
2656     /**
2657      * ZSYRK performs one of the symmetric rank k operations
2658      * C := alpha*A*A**T + beta*C   or   C := alpha*A**T*A + beta*C
2659      *
2660      * Details: http://www.netlib.org/lapack/explore-html/de/d54/zsyrk_8f.html
2661      *
2662      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
2663      * @param Trans The type of transpose applied to the operation.
2664      * @param alpha The scalar alpha.
2665      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
2666      * @param beta The scalar beta.
2667      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
2668      */
ZSYRK(@plo int Uplo, @Transpose int Trans, Double2 alpha, Allocation A, Double2 beta, Allocation C)2669     public void ZSYRK(@Uplo int Uplo, @Transpose int Trans, Double2 alpha, Allocation A, Double2 beta, Allocation C) {
2670         validateTranspose(Trans);
2671         validateUplo(Uplo);
2672         validateL3(Element.F64_2(mRS), Trans, 0, 0, A, null, C);
2673         int K = -1;
2674         if (Trans != NO_TRANSPOSE) {
2675             K = A.getType().getY();
2676         } else {
2677             K = A.getType().getX();
2678         }
2679         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zsyrk, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha.x, alpha.y, A.getID(mRS), 0, beta.x, beta.y,
2680                                    C.getID(mRS), 0, 0, 0, 0);
2681     }
2682 
validateSYR2K(Element e, @Transpose int Trans, Allocation A, Allocation B, Allocation C)2683     static void validateSYR2K(Element e, @Transpose int Trans, Allocation A, Allocation B, Allocation C) {
2684         validateTranspose(Trans);
2685         if (!A.getType().getElement().isCompatible(e) ||
2686             !B.getType().getElement().isCompatible(e) ||
2687             !C.getType().getElement().isCompatible(e)) {
2688             throw new RSRuntimeException("Called BLAS with wrong Element type");
2689         }
2690         int Cdim = -1;
2691         // A is n x k if no transpose, k x n if transpose
2692         // C is n x n
2693         if (Trans == TRANSPOSE) {
2694             // check columns versus C
2695             Cdim = A.getType().getX();
2696         } else {
2697             // check rows versus C
2698             Cdim = A.getType().getY();
2699         }
2700         if (C.getType().getX() != Cdim || C.getType().getY() != Cdim) {
2701             throw new RSRuntimeException("Invalid symmetric matrix in SYR2K");
2702         }
2703         // A dims == B dims
2704         if (A.getType().getX() != B.getType().getX() || A.getType().getY() != B.getType().getY()) {
2705             throw new RSRuntimeException("Invalid A and B in SYR2K");
2706         }
2707     }
2708 
2709     /**
2710      * SSYR2K performs one of the symmetric rank 2k operations
2711      * C := alpha*A*B**T + alpha*B*A**T + beta*C   or   C := alpha*A**T*B + alpha*B**T*A + beta*C
2712      *
2713      * Details: http://www.netlib.org/lapack/explore-html/df/d3d/ssyr2k_8f.html
2714      *
2715      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
2716      * @param Trans The type of transpose applied to the operation.
2717      * @param alpha The scalar alpha.
2718      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
2719      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32}.
2720      * @param beta The scalar beta.
2721      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32}.
2722      */
SSYR2K(@plo int Uplo, @Transpose int Trans, float alpha, Allocation A, Allocation B, float beta, Allocation C)2723     public void SSYR2K(@Uplo int Uplo, @Transpose int Trans, float alpha, Allocation A, Allocation B, float beta, Allocation C) {
2724         validateUplo(Uplo);
2725         validateSYR2K(Element.F32(mRS), Trans, A, B, C);
2726         int K = -1;
2727         if (Trans != NO_TRANSPOSE) {
2728             K = A.getType().getY();
2729         } else {
2730             K = A.getType().getX();
2731         }
2732         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssyr2k, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha, A.getID(mRS), B.getID(mRS), beta, C.getID(mRS), 0, 0, 0, 0);
2733     }
2734 
2735     /**
2736      * DSYR2K performs one of the symmetric rank 2k operations
2737      * C := alpha*A*B**T + alpha*B*A**T + beta*C   or   C := alpha*A**T*B + alpha*B**T*A + beta*C
2738      *
2739      * Details: http://www.netlib.org/lapack/explore-html/d1/dec/dsyr2k_8f.html
2740      *
2741      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
2742      * @param Trans The type of transpose applied to the operation.
2743      * @param alpha The scalar alpha.
2744      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
2745      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64}.
2746      * @param beta The scalar beta.
2747      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64}.
2748      */
DSYR2K(@plo int Uplo, @Transpose int Trans, double alpha, Allocation A, Allocation B, double beta, Allocation C)2749     public void DSYR2K(@Uplo int Uplo, @Transpose int Trans, double alpha, Allocation A, Allocation B, double beta, Allocation C) {
2750         validateUplo(Uplo);
2751         validateSYR2K(Element.F64(mRS), Trans, A, B, C);
2752         int K = -1;
2753         if (Trans != NO_TRANSPOSE) {
2754             K = A.getType().getY();
2755         } else {
2756             K = A.getType().getX();
2757         }
2758         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsyr2k, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha, A.getID(mRS), B.getID(mRS), beta, C.getID(mRS), 0, 0, 0, 0);
2759     }
2760 
2761     /**
2762      * CSYR2K performs one of the symmetric rank 2k operations
2763      * C := alpha*A*B**T + alpha*B*A**T + beta*C   or   C := alpha*A**T*B + alpha*B**T*A + beta*C
2764      *
2765      * Details: http://www.netlib.org/lapack/explore-html/de/d7e/csyr2k_8f.html
2766      *
2767      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
2768      * @param Trans The type of transpose applied to the operation.
2769      * @param alpha The scalar alpha.
2770      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
2771      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
2772      * @param beta The scalar beta.
2773      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
2774      */
CSYR2K(@plo int Uplo, @Transpose int Trans, Float2 alpha, Allocation A, Allocation B, Float2 beta, Allocation C)2775     public void CSYR2K(@Uplo int Uplo, @Transpose int Trans, Float2 alpha, Allocation A, Allocation B, Float2 beta, Allocation C) {
2776         validateUplo(Uplo);
2777         validateSYR2K(Element.F32_2(mRS), Trans, A, B, C);
2778         int K = -1;
2779         if (Trans != NO_TRANSPOSE) {
2780             K = A.getType().getY();
2781         } else {
2782             K = A.getType().getX();
2783         }
2784         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_csyr2k, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha.x, alpha.y, A.getID(mRS), B.getID(mRS), beta.x, beta.y, C.getID(mRS), 0, 0, 0, 0);
2785     }
2786 
2787     /**
2788      * ZSYR2K performs one of the symmetric rank 2k operations
2789      * C := alpha*A*B**T + alpha*B*A**T + beta*C   or   C := alpha*A**T*B + alpha*B**T*A + beta*C
2790      *
2791      * Details: http://www.netlib.org/lapack/explore-html/df/d20/zsyr2k_8f.html
2792      *
2793      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
2794      * @param Trans The type of transpose applied to the operation.
2795      * @param alpha The scalar alpha.
2796      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
2797      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
2798      * @param beta The scalar beta.
2799      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
2800      */
ZSYR2K(@plo int Uplo, @Transpose int Trans, Double2 alpha, Allocation A, Allocation B, Double2 beta, Allocation C)2801     public void ZSYR2K(@Uplo int Uplo, @Transpose int Trans, Double2 alpha, Allocation A, Allocation B, Double2 beta, Allocation C) {
2802         validateUplo(Uplo);
2803         validateSYR2K(Element.F64_2(mRS), Trans, A, B, C);
2804         int K = -1;
2805         if (Trans != NO_TRANSPOSE) {
2806             K = A.getType().getY();
2807         } else {
2808             K = A.getType().getX();
2809         }
2810         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zsyr2k, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha.x, alpha.y, A.getID(mRS), B.getID(mRS), beta.x, beta.y, C.getID(mRS), 0, 0, 0, 0);
2811     }
2812 
validateTRMM(Element e, @Side int Side, @Transpose int TransA, Allocation A, Allocation B)2813     static void validateTRMM(Element e, @Side int Side, @Transpose int TransA, Allocation A, Allocation B) {
2814         validateSide(Side);
2815         validateTranspose(TransA);
2816         int aM = -1, aN = -1, bM = -1, bN = -1;
2817         if (!A.getType().getElement().isCompatible(e) ||
2818             !B.getType().getElement().isCompatible(e)) {
2819             throw new RSRuntimeException("Called BLAS with wrong Element type");
2820         }
2821 
2822         aM = A.getType().getY();
2823         aN = A.getType().getX();
2824         if (aM != aN) {
2825             throw new RSRuntimeException("Called TRMM with a non-symmetric matrix A");
2826         }
2827 
2828         bM = B.getType().getY();
2829         bN = B.getType().getX();
2830         if (Side == LEFT) {
2831             if (aN != bM) {
2832                 throw new RSRuntimeException("Called TRMM with invalid matrices");
2833             }
2834         } else {
2835             if (bN != aM) {
2836                 throw new RSRuntimeException("Called TRMM with invalid matrices");
2837             }
2838         }
2839     }
2840 
2841     /**
2842      * STRMM performs one of the matrix-matrix operations
2843      * B := alpha*op(A)*B   or   B := alpha*B*op(A)
2844      * op(A) is one of  op(A) = A  or  op(A) = A**T
2845      *
2846      * Details: http://www.netlib.org/lapack/explore-html/df/d01/strmm_8f.html
2847      *
2848      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
2849      * @param Uplo Specifies whether matrix A is upper or lower triangular.
2850      * @param TransA The type of transpose applied to matrix A.
2851      * @param Diag Specifies whether or not A is unit triangular.
2852      * @param alpha The scalar alpha.
2853      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
2854      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32}.
2855      */
STRMM(@ide int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, float alpha, Allocation A, Allocation B)2856     public void STRMM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, float alpha, Allocation A, Allocation B) {
2857         validateUplo(Uplo);
2858         validateDiag(Diag);
2859         validateTRMM(Element.F32(mRS), Side, TransA, A, B);
2860         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_strmm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
2861                                         alpha, A.getID(mRS), B.getID(mRS), 0.f, 0, 0, 0, 0, 0);
2862     }
2863 
2864     /**
2865      * DTRMM performs one of the matrix-matrix operations
2866      * B := alpha*op(A)*B   or   B := alpha*B*op(A)
2867      * op(A) is one of  op(A) = A  or  op(A) = A**T
2868      *
2869      * Details: http://www.netlib.org/lapack/explore-html/dd/d19/dtrmm_8f.html
2870      *
2871      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
2872      * @param Uplo Specifies whether matrix A is upper or lower triangular.
2873      * @param TransA The type of transpose applied to matrix A.
2874      * @param Diag Specifies whether or not A is unit triangular.
2875      * @param alpha The scalar alpha.
2876      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
2877      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64}.
2878      */
DTRMM(@ide int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, double alpha, Allocation A, Allocation B)2879     public void DTRMM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, double alpha, Allocation A, Allocation B) {
2880         validateUplo(Uplo);
2881         validateDiag(Diag);
2882         validateTRMM(Element.F64(mRS), Side, TransA, A, B);
2883         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtrmm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
2884                                         alpha, A.getID(mRS), B.getID(mRS), 0, 0, 0, 0, 0, 0);
2885     }
2886 
2887     /**
2888      * CTRMM performs one of the matrix-matrix operations
2889      * B := alpha*op(A)*B   or   B := alpha*B*op(A)
2890      * op(A) is one of  op(A) = A  or  op(A) = A**T  or  op(A) = A**H
2891      *
2892      * Details: http://www.netlib.org/lapack/explore-html/d4/d9b/ctrmm_8f.html
2893      *
2894      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
2895      * @param Uplo Specifies whether matrix A is upper or lower triangular.
2896      * @param TransA The type of transpose applied to matrix A.
2897      * @param Diag Specifies whether or not A is unit triangular.
2898      * @param alpha The scalar alpha.
2899      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
2900      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
2901      */
CTRMM(@ide int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Float2 alpha, Allocation A, Allocation B)2902     public void CTRMM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Float2 alpha, Allocation A, Allocation B) {
2903         validateUplo(Uplo);
2904         validateDiag(Diag);
2905         validateTRMM(Element.F32_2(mRS), Side, TransA, A, B);
2906         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctrmm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
2907                                          alpha.x, alpha.y, A.getID(mRS), B.getID(mRS), 0, 0, 0, 0, 0, 0, 0);
2908     }
2909 
2910     /**
2911      * ZTRMM performs one of the matrix-matrix operations
2912      * B := alpha*op(A)*B   or   B := alpha*B*op(A)
2913      * op(A) is one of  op(A) = A  or  op(A) = A**T  or  op(A) = A**H
2914      *
2915      * Details: http://www.netlib.org/lapack/explore-html/d8/de1/ztrmm_8f.html
2916      *
2917      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
2918      * @param Uplo Specifies whether matrix A is upper or lower triangular.
2919      * @param TransA The type of transpose applied to matrix A.
2920      * @param Diag Specifies whether or not A is unit triangular.
2921      * @param alpha The scalar alpha.
2922      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
2923      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
2924      */
ZTRMM(@ide int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Double2 alpha, Allocation A, Allocation B)2925     public void ZTRMM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Double2 alpha, Allocation A, Allocation B) {
2926         validateUplo(Uplo);
2927         validateDiag(Diag);
2928         validateTRMM(Element.F64_2(mRS), Side, TransA, A, B);
2929         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztrmm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
2930                                    alpha.x, alpha.y, A.getID(mRS), B.getID(mRS), 0, 0, 0, 0, 0, 0, 0);
2931     }
2932 
validateTRSM(Element e, @Side int Side, @Transpose int TransA, Allocation A, Allocation B)2933     static void validateTRSM(Element e, @Side int Side, @Transpose int TransA, Allocation A, Allocation B) {
2934         int adim = -1, bM = -1, bN = -1;
2935         validateSide(Side);
2936         validateTranspose(TransA);
2937         if (!A.getType().getElement().isCompatible(e) ||
2938             !B.getType().getElement().isCompatible(e)) {
2939             throw new RSRuntimeException("Called BLAS with wrong Element type");
2940         }
2941         adim = A.getType().getX();
2942         if (adim != A.getType().getY()) {
2943             // this may be unnecessary, the restriction could potentially be relaxed
2944             // A needs to contain at least that symmetric matrix but could theoretically be larger
2945             // for now we assume adapters are sufficient, will reevaluate in the future
2946             throw new RSRuntimeException("Called TRSM with a non-symmetric matrix A");
2947         }
2948         bM = B.getType().getY();
2949         bN = B.getType().getX();
2950         if (Side == LEFT) {
2951             // A is M*M
2952             if (adim != bM) {
2953                 throw new RSRuntimeException("Called TRSM with invalid matrix dimensions");
2954             }
2955         } else {
2956             // A is N*N
2957             if (adim != bN) {
2958                 throw new RSRuntimeException("Called TRSM with invalid matrix dimensions");
2959             }
2960         }
2961     }
2962 
2963     /**
2964      * STRSM solves one of the matrix equations
2965      * op(A)*X := alpha*B   or   X*op(A) := alpha*B
2966      * op(A) is one of  op(A) = A  or  op(A) = A**T
2967      *
2968      * Details: http://www.netlib.org/lapack/explore-html/d2/d8b/strsm_8f.html
2969      *
2970      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
2971      * @param Uplo Specifies whether matrix A is upper or lower triangular.
2972      * @param TransA The type of transpose applied to matrix A.
2973      * @param Diag Specifies whether or not A is unit triangular.
2974      * @param alpha The scalar alpha.
2975      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
2976      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32}.
2977      */
STRSM(@ide int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, float alpha, Allocation A, Allocation B)2978     public void STRSM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, float alpha, Allocation A, Allocation B) {
2979         validateUplo(Uplo);
2980         validateDiag(Diag);
2981         validateTRSM(Element.F32(mRS), Side, TransA, A, B);
2982         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_strsm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
2983                                         alpha, A.getID(mRS), B.getID(mRS), 0, 0, 0, 0, 0, 0);
2984     }
2985 
2986     /**
2987      * DTRSM solves one of the matrix equations
2988      * op(A)*X := alpha*B   or   X*op(A) := alpha*B
2989      * op(A) is one of  op(A) = A  or  op(A) = A**T
2990      *
2991      * Details: http://www.netlib.org/lapack/explore-html/de/da7/dtrsm_8f.html
2992      *
2993      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
2994      * @param Uplo Specifies whether matrix A is upper or lower triangular.
2995      * @param TransA The type of transpose applied to matrix A.
2996      * @param Diag Specifies whether or not A is unit triangular.
2997      * @param alpha The scalar alpha.
2998      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
2999      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64}.
3000      */
DTRSM(@ide int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, double alpha, Allocation A, Allocation B)3001     public void DTRSM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, double alpha, Allocation A, Allocation B) {
3002         validateUplo(Uplo);
3003         validateDiag(Diag);
3004         validateTRSM(Element.F64(mRS), Side, TransA, A, B);
3005         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtrsm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
3006                                         alpha, A.getID(mRS), B.getID(mRS), 0, 0, 0, 0, 0, 0);
3007     }
3008 
3009     /**
3010      * CTRSM solves one of the matrix equations
3011      * op(A)*X := alpha*B   or   X*op(A) := alpha*B
3012      * op(A) is one of  op(A) = A  or  op(A) = A**T  or  op(A) = A**H
3013      *
3014      * Details: http://www.netlib.org/lapack/explore-html/de/d30/ctrsm_8f.html
3015      *
3016      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
3017      * @param Uplo Specifies whether matrix A is upper or lower triangular.
3018      * @param TransA The type of transpose applied to matrix A.
3019      * @param Diag Specifies whether or not A is unit triangular.
3020      * @param alpha The scalar alpha.
3021      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
3022      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
3023      */
CTRSM(@ide int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Float2 alpha, Allocation A, Allocation B)3024     public void CTRSM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Float2 alpha, Allocation A, Allocation B) {
3025         validateUplo(Uplo);
3026         validateDiag(Diag);
3027         validateTRSM(Element.F32_2(mRS), Side, TransA, A, B);
3028         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctrsm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
3029                                          alpha.x, alpha.y, A.getID(mRS), B.getID(mRS), 0, 0, 0, 0, 0, 0, 0);
3030     }
3031 
3032     /**
3033      * ZTRSM solves one of the matrix equations
3034      * op(A)*X := alpha*B   or   X*op(A) := alpha*B
3035      * op(A) is one of  op(A) = A  or  op(A) = A**T  or  op(A) = A**H
3036      *
3037      * Details: http://www.netlib.org/lapack/explore-html/d1/d39/ztrsm_8f.html
3038      *
3039      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
3040      * @param Uplo Specifies whether matrix A is upper or lower triangular.
3041      * @param TransA The type of transpose applied to matrix A.
3042      * @param Diag Specifies whether or not A is unit triangular.
3043      * @param alpha The scalar alpha.
3044      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
3045      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
3046      */
ZTRSM(@ide int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Double2 alpha, Allocation A, Allocation B)3047     public void ZTRSM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Double2 alpha, Allocation A, Allocation B) {
3048         validateUplo(Uplo);
3049         validateDiag(Diag);
3050         validateTRSM(Element.F64_2(mRS), Side, TransA, A, B);
3051         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztrsm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
3052                                    alpha.x, alpha.y, A.getID(mRS), B.getID(mRS), 0, 0, 0, 0, 0, 0, 0);
3053     }
3054 
validateHEMM(Element e, @Side int Side, Allocation A, Allocation B, Allocation C)3055     static void validateHEMM(Element e, @Side int Side, Allocation A, Allocation B, Allocation C) {
3056         validateSide(Side);
3057 
3058         if (!A.getType().getElement().isCompatible(e) ||
3059             !B.getType().getElement().isCompatible(e) ||
3060             !C.getType().getElement().isCompatible(e)) {
3061             throw new RSRuntimeException("Called BLAS with wrong Element type");
3062         }
3063 
3064         // A must be square; can potentially be relaxed similar to TRSM
3065         int adim = A.getType().getX();
3066         if (adim != A.getType().getY()) {
3067             throw new RSRuntimeException("Called HEMM with non-square A");
3068         }
3069         if ((Side == LEFT && adim != B.getType().getY()) ||
3070             (Side == RIGHT && adim != B.getType().getX())) {
3071             throw new RSRuntimeException("Called HEMM with invalid B");
3072         }
3073         if (B.getType().getX() != C.getType().getX() ||
3074             B.getType().getY() != C.getType().getY()) {
3075             throw new RSRuntimeException("Called HEMM with mismatched B and C");
3076         }
3077     }
3078 
3079     /**
3080      * CHEMM performs one of the matrix-matrix operations
3081      * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
3082      *
3083      * Details: http://www.netlib.org/lapack/explore-html/d3/d66/chemm_8f.html
3084      *
3085      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
3086      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
3087      * @param alpha The scalar alpha.
3088      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
3089      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
3090      * @param beta The scalar beta.
3091      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
3092      */
CHEMM(@ide int Side, @Uplo int Uplo, Float2 alpha, Allocation A, Allocation B, Float2 beta, Allocation C)3093     public void CHEMM(@Side int Side, @Uplo int Uplo, Float2 alpha, Allocation A, Allocation B, Float2 beta, Allocation C) {
3094         validateUplo(Uplo);
3095         validateHEMM(Element.F32_2(mRS), Side, A, B, C);
3096         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_chemm, 0, 0, Side, Uplo, 0, C.getType().getY(), C.getType().getX(), 0,
3097                                          alpha.x, alpha.y, A.getID(mRS), B.getID(mRS), beta.x, beta.y, C.getID(mRS), 0, 0, 0, 0);
3098     }
3099 
3100     /**
3101      * ZHEMM performs one of the matrix-matrix operations
3102      * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
3103      *
3104      * Details: http://www.netlib.org/lapack/explore-html/d6/d3e/zhemm_8f.html
3105      *
3106      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
3107      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
3108      * @param alpha The scalar alpha.
3109      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
3110      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
3111      * @param beta The scalar beta.
3112      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
3113      */
ZHEMM(@ide int Side, @Uplo int Uplo, Double2 alpha, Allocation A, Allocation B, Double2 beta, Allocation C)3114     public void ZHEMM(@Side int Side, @Uplo int Uplo, Double2 alpha, Allocation A, Allocation B, Double2 beta, Allocation C) {
3115         validateUplo(Uplo);
3116         validateHEMM(Element.F64_2(mRS), Side, A, B, C);
3117         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zhemm, 0, 0, Side, Uplo, 0, C.getType().getY(), C.getType().getX(), 0,
3118                                    alpha.x, alpha.y, A.getID(mRS), B.getID(mRS), beta.x, beta.y, C.getID(mRS), 0, 0, 0, 0);
3119     }
3120 
validateHERK(Element e, @Transpose int Trans, Allocation A, Allocation C)3121     static void validateHERK(Element e, @Transpose int Trans, Allocation A, Allocation C) {
3122         if (!A.getType().getElement().isCompatible(e) ||
3123             !C.getType().getElement().isCompatible(e)) {
3124             throw new RSRuntimeException("Called BLAS with wrong Element type");
3125         }
3126         validateConjTranspose(Trans);
3127         int cdim = C.getType().getX();
3128         if (cdim != C.getType().getY()) {
3129             throw new RSRuntimeException("Called HERK with non-square C");
3130         }
3131         if (Trans == NO_TRANSPOSE) {
3132             if (cdim != A.getType().getY()) {
3133                 throw new RSRuntimeException("Called HERK with invalid A");
3134             }
3135         } else {
3136             if (cdim != A.getType().getX()) {
3137                 throw new RSRuntimeException("Called HERK with invalid A");
3138             }
3139         }
3140     }
3141 
3142     /**
3143      * CHERK performs one of the hermitian rank k operations
3144      * C := alpha*A*A**H + beta*C   or   C := alpha*A**H*A + beta*C
3145      *
3146      * Details: http://www.netlib.org/lapack/explore-html/d8/d52/cherk_8f.html
3147      *
3148      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
3149      * @param Trans The type of transpose applied to the operation.
3150      * @param alpha The scalar alpha.
3151      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
3152      * @param beta The scalar beta.
3153      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
3154      */
CHERK(@plo int Uplo, @Transpose int Trans, float alpha, Allocation A, float beta, Allocation C)3155     public void CHERK(@Uplo int Uplo, @Transpose int Trans, float alpha, Allocation A, float beta, Allocation C) {
3156         validateUplo(Uplo);
3157         validateHERK(Element.F32_2(mRS), Trans, A, C);
3158         int k = 0;
3159         if (Trans == CONJ_TRANSPOSE) {
3160             k = A.getType().getY();
3161         } else {
3162             k = A.getType().getX();
3163         }
3164         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cherk, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), k,
3165                                          alpha, 0, A.getID(mRS), 0, beta, 0, C.getID(mRS), 0, 0, 0, 0);
3166     }
3167 
3168     /**
3169      * ZHERK performs one of the hermitian rank k operations
3170      * C := alpha*A*A**H + beta*C   or   C := alpha*A**H*A + beta*C
3171      *
3172      * Details: http://www.netlib.org/lapack/explore-html/d1/db1/zherk_8f.html
3173      *
3174      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
3175      * @param Trans The type of transpose applied to the operation.
3176      * @param alpha The scalar alpha.
3177      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
3178      * @param beta The scalar beta.
3179      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
3180      */
ZHERK(@plo int Uplo, @Transpose int Trans, double alpha, Allocation A, double beta, Allocation C)3181     public void ZHERK(@Uplo int Uplo, @Transpose int Trans, double alpha, Allocation A, double beta, Allocation C) {
3182         validateUplo(Uplo);
3183         validateHERK(Element.F64_2(mRS), Trans, A, C);
3184         int k = 0;
3185         if (Trans == CONJ_TRANSPOSE) {
3186             k = A.getType().getY();
3187         } else {
3188             k = A.getType().getX();
3189         }
3190         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zherk, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), k,
3191                                    alpha, 0, A.getID(mRS), 0, beta, 0, C.getID(mRS), 0, 0, 0, 0);
3192     }
3193 
validateHER2K(Element e, @Transpose int Trans, Allocation A, Allocation B, Allocation C)3194     static void validateHER2K(Element e, @Transpose int Trans, Allocation A, Allocation B, Allocation C) {
3195         if (!A.getType().getElement().isCompatible(e) ||
3196             !B.getType().getElement().isCompatible(e) ||
3197             !C.getType().getElement().isCompatible(e)) {
3198             throw new RSRuntimeException("Called BLAS with wrong Element type");
3199         }
3200         validateConjTranspose(Trans);
3201         int cdim = C.getType().getX();
3202         if (cdim != C.getType().getY()) {
3203             throw new RSRuntimeException("Called HER2K with non-square C");
3204         }
3205         if (Trans == NO_TRANSPOSE) {
3206             if (A.getType().getY() != cdim) {
3207                 throw new RSRuntimeException("Called HER2K with invalid matrices");
3208             }
3209         } else {
3210             if (A.getType().getX() != cdim) {
3211                 throw new RSRuntimeException("Called HER2K with invalid matrices");
3212             }
3213         }
3214         if (A.getType().getX() != B.getType().getX() || A.getType().getY() != B.getType().getY()) {
3215             throw new RSRuntimeException("Called HER2K with invalid A and B matrices");
3216         }
3217     }
3218 
3219     /**
3220      * CHER2K performs one of the hermitian rank 2k operations
3221      * C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C   or   C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C
3222      *
3223      * Details: http://www.netlib.org/lapack/explore-html/d1/d82/cher2k_8f.html
3224      *
3225      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
3226      * @param Trans The type of transpose applied to the operation.
3227      * @param alpha The scalar alpha.
3228      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
3229      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
3230      * @param beta The scalar beta.
3231      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
3232      */
CHER2K(@plo int Uplo, @Transpose int Trans, Float2 alpha, Allocation A, Allocation B, float beta, Allocation C)3233     public void CHER2K(@Uplo int Uplo, @Transpose int Trans, Float2 alpha, Allocation A, Allocation B, float beta, Allocation C) {
3234         validateUplo(Uplo);
3235         validateHER2K(Element.F32_2(mRS), Trans, A, B, C);
3236         int k = 0;
3237         if (Trans == NO_TRANSPOSE) {
3238             k = A.getType().getX();
3239         } else {
3240             k = A.getType().getY();
3241         }
3242         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cher2k, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), k, alpha.x, alpha.y,
3243                                          A.getID(mRS), B.getID(mRS), beta, 0, C.getID(mRS), 0, 0, 0, 0);
3244     }
3245 
3246     /**
3247      * ZHER2K performs one of the hermitian rank 2k operations
3248      * C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C   or   C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C
3249      *
3250      * Details: http://www.netlib.org/lapack/explore-html/d7/dfa/zher2k_8f.html
3251      *
3252      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
3253      * @param Trans The type of transpose applied to the operation.
3254      * @param alpha The scalar alpha.
3255      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
3256      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
3257      * @param beta The scalar beta.
3258      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
3259      */
ZHER2K(@plo int Uplo, @Transpose int Trans, Double2 alpha, Allocation A, Allocation B, double beta, Allocation C)3260     public void ZHER2K(@Uplo int Uplo, @Transpose int Trans, Double2 alpha, Allocation A, Allocation B, double beta, Allocation C) {
3261         validateUplo(Uplo);
3262         validateHER2K(Element.F64_2(mRS), Trans, A, B, C);
3263         int k = 0;
3264         if (Trans == NO_TRANSPOSE) {
3265             k = A.getType().getX();
3266         } else {
3267             k = A.getType().getY();
3268         }
3269         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zher2k, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), k, alpha.x, alpha.y,
3270                                    A.getID(mRS), B.getID(mRS), beta, 0, C.getID(mRS), 0, 0, 0, 0);
3271     }
3272 
3273 
3274     /**
3275      * 8-bit GEMM-like operation for neural networks: C = A * Transpose(B)
3276      * Calculations are done in 1.10.21 fixed-point format for the final output,
3277      * just before there's a shift down to drop the fractional parts. The output
3278      * values are gated to 0 to 255 to fit in a byte, but the 10-bit format
3279      * gives some headroom to avoid wrapping around on small overflows.
3280      *
3281      * @param A The input allocation contains matrix A, supported elements type {@link Element#U8}.
3282      * @param a_offset The offset for all values in matrix A, e.g A[i,j] = A[i,j] - a_offset. Value should be from 0 to 255.
3283      * @param B The input allocation contains matrix B, supported elements type {@link Element#U8}.
3284      * @param b_offset The offset for all values in matrix B, e.g B[i,j] = B[i,j] - b_offset. Value should be from 0 to 255.
3285      * @param C The input allocation contains matrix C, supported elements type {@link Element#U8}.
3286      * @param c_offset The offset for all values in matrix C.
3287      * @param c_mult The multiplier for all values in matrix C, e.g C[i,j] = (C[i,j] + c_offset) * c_mult.
3288      **/
BNNM(Allocation A, int a_offset, Allocation B, int b_offset, Allocation C, int c_offset, int c_mult)3289     public void BNNM(Allocation A, int a_offset, Allocation B, int b_offset, Allocation C, int c_offset, int c_mult) {
3290         validateL3(Element.U8(mRS), NO_TRANSPOSE, TRANSPOSE, 0, A, B, C);
3291 
3292         if (a_offset < 0 || a_offset > 255) {
3293             throw new RSRuntimeException("Invalid a_offset passed to BNNM");
3294         }
3295         if (b_offset < 0 || b_offset > 255) {
3296             throw new RSRuntimeException("Invalid b_offset passed to BNNM");
3297         }
3298         int M = -1, N = -1, K = -1;
3299         M = A.getType().getY();
3300         N = B.getType().getY();
3301         K = A.getType().getX();
3302 
3303 
3304         mRS.nScriptIntrinsicBLAS_BNNM(getID(mRS), M, N, K, A.getID(mRS), a_offset, B.getID(mRS), b_offset, C.getID(mRS), c_offset, c_mult);
3305 
3306     }
3307 
3308 }
3309