1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 
18 #include "rsCpuIntrinsic.h"
19 #include "rsCpuIntrinsicInlines.h"
20 #include "rsCpuBLASDispatch.h"
21 #include "eight_bit_int_gemm.h"
22 
23 namespace android {
24 namespace renderscript {
25 
26 
27 class RsdCpuScriptIntrinsicBLAS : public RsdCpuScriptIntrinsic {
28 public:
29     void invokeForEach(uint32_t slot,
30                        const Allocation ** ain,
31                        uint32_t inLen,
32                        Allocation * aout,
33                        const void * usr,
34                        uint32_t usrLen,
35                        const RsScriptCall *sc) override;
36     void populateScript(Script *) override;
37     ~RsdCpuScriptIntrinsicBLAS() override;
38     RsdCpuScriptIntrinsicBLAS(RsdCpuReferenceImpl *ctx, const Script *s);
39 
40 protected:
41 
42     uint8_t a_offset = 0;
43     uint8_t b_offset = 0;
44     uint8_t c_offset = 0;
45 
46 #ifdef RS_COMPATIBILITY_LIB
47     bool isBlasLibInitialized = false;
48 #endif
49     static void kernelBNNM(size_t m, size_t n, size_t k,
50                            const uint8_t* a, uint8_t a_offset, size_t lda,
51                            const uint8_t* b, uint8_t b_offset, size_t ldb,
52                            uint8_t* c, int32_t c_offset, size_t ldc,
53                            int32_t c_mult_int);
54 
55 
56 
57 };
58 
populateScript(Script * s)59 void RsdCpuScriptIntrinsicBLAS::populateScript(Script *s) {
60     s->mHal.info.exportedVariableCount = 0;
61 }
62 
initABC(const Allocation ** ain,size_t size,void ** A,void ** B,void ** C,int * lda,int * ldb,int * ldc)63 static void initABC(const Allocation ** ain,
64                     size_t size,
65                     void** A,
66                     void** B,
67                     void** C,
68                     int* lda,
69                     int* ldb,
70                     int* ldc)
71 {
72     if (ain[0]) {
73         *A = ain[0]->mHal.drvState.lod[0].mallocPtr;
74         *lda = (int)(ain[0]->mHal.drvState.lod[0].stride/size);
75     }
76     if (ain[1]) {
77         *B = ain[1]->mHal.drvState.lod[0].mallocPtr;
78         *ldb = (int)(ain[1]->mHal.drvState.lod[0].stride/size);
79     }
80     if (ain[2]) {
81         *C = ain[2]->mHal.drvState.lod[0].mallocPtr;
82         *ldc = (int)(ain[2]->mHal.drvState.lod[0].stride/size);
83     }
84 }
85 
86 // Routine to setup LaunchStruct for GEMM callback.
setupGEMM(MTLaunchStructForEachBlas * mtls,const Allocation ** ain,RsBlasCall * call,RsdCpuReferenceImpl * ctx)87 static void setupGEMM(MTLaunchStructForEachBlas *mtls, const Allocation **ain, RsBlasCall* call,
88                       RsdCpuReferenceImpl *ctx) {
89     uint32_t mm, nn, kk;
90     mm = call->M;
91     nn = call->N;
92     kk = call->K;
93 
94     memset(mtls, 0, sizeof(MTLaunchStructForEachBlas));
95     mtls->rs        = ctx;
96     mtls->sc        = call;
97     mtls->dimPtr    = &mtls->fep.dim;
98     mtls->fep.dim.x = nn;
99     mtls->fep.dim.y = mm;
100     mtls->fep.dim.z = kk;
101     if (ain) {
102         memcpy(mtls->ains, ain, 3 * sizeof(ain[0]));
103     }
104     uint32_t elementBytes = 4;
105     if (ain[0]) {
106         elementBytes = ain[0]->getType()->getElement()->getSizeBytes();
107     }
108     const uint32_t MIN_SIZE_TO_TILE = 64 * 1024 / elementBytes;
109     const uint32_t MAX_WORK_PER_THREAD = 512 / elementBytes;
110     const uint32_t THREAD_COUNT = ctx->getThreadCount();
111     uint32_t tileSizeN = 0;
112     uint32_t tileSizeM = 0;
113 
114     // Do not tile the matrix if:
115     // 1. It is too small comparing to the other matrix.
116     // 2. It is too small comparing to MIN_SIZE_TO_TILE .
117     if (nn * kk > MIN_SIZE_TO_TILE && nn * THREAD_COUNT > mm) {
118         tileSizeN = rsMin(nn / THREAD_COUNT, MAX_WORK_PER_THREAD);
119     }
120     if (mm * kk > MIN_SIZE_TO_TILE && mm * THREAD_COUNT > nn) {
121         tileSizeM = rsMin(mm / THREAD_COUNT, MAX_WORK_PER_THREAD);
122     }
123     mtls->numTileM = 1;
124     mtls->numTileN = 1;
125     mtls->tileSizeM = mm;
126     mtls->tileSizeN = nn;
127 
128     // If tiling is needed, compute the number of slices for A & B.
129     mtls->isThreadable = (tileSizeM > 0 || tileSizeN > 0);
130     if (tileSizeM) {
131         mtls->numTileM += (mm - 1) / tileSizeM;
132         mtls->tileSizeM = tileSizeM;
133     }
134     if (tileSizeN) {
135         mtls->numTileN += (nn - 1) / tileSizeN;
136         mtls->tileSizeN = tileSizeN;
137     }
138 
139     mtls->mSliceNum  = 0;
140 }
141 
142 // Generic GEMM callback routine.
143 template <typename T_data, typename T_param, typename Func>
walk_tiled_gemm(Func blasFunc,T_param alpha,T_param beta,int vecSize,RsBlasCall * call,MTLaunchStructForEachBlas * mtls)144 static void walk_tiled_gemm(Func blasFunc, T_param alpha, T_param beta, int vecSize,
145                             RsBlasCall* call, MTLaunchStructForEachBlas *mtls) {
146     // setup BLAS enum args
147     enum CBLAS_TRANSPOSE TransA = (enum CBLAS_TRANSPOSE)call->transA;
148     enum CBLAS_TRANSPOSE TransB = (enum CBLAS_TRANSPOSE)call->transB;
149 
150     void *A = nullptr;
151     void *B = nullptr;
152     void *C = nullptr;
153 
154     int lda = 0, ldb = 0, ldc = 0;
155 
156     const Allocation *ain[RS_KERNEL_INPUT_LIMIT];
157     ain[0] = mtls->ains[0];
158     ain[1] = mtls->ains[1];
159     ain[2] = mtls->ains[2];
160 
161     initABC(ain, sizeof(T_data) * vecSize, &A, &B, &C, &lda, &ldb, &ldc);
162 
163     // Determin the stride of the tiled matrices.
164     int mStride = (TransA == CblasNoTrans) ? lda : 1;
165     int nStride = (TransB == CblasNoTrans) ? 1 : ldb;
166     while (1) {
167         uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
168 
169         uint32_t mStart = (slice % mtls->numTileM) * mtls->tileSizeM;
170         uint32_t mEnd   = mStart + mtls->tileSizeM;
171         mEnd = rsMin(mEnd, (uint32_t)call->M);
172         if (mEnd <= mStart) {
173             return;
174         }
175 
176         uint32_t nStart = (slice / mtls->numTileM) * mtls->tileSizeN;
177         uint32_t nEnd   = nStart + mtls->tileSizeN;
178         nEnd = rsMin(nEnd, (uint32_t)call->N);
179         if (nEnd <= nStart) {
180             return;
181         }
182 
183         blasFunc(CblasRowMajor, TransA, TransB,
184                  mEnd - mStart, nEnd - nStart, call->K, alpha,
185                  (T_data *)A + mStart * mStride * vecSize, lda,
186                  (T_data *)B + nStart * nStride * vecSize, ldb, beta,
187                  (T_data *)C + (mStart * ldc + nStart) * vecSize, ldc);
188     }
189 }
190 
191 // SGEMM callback
walk_2d_sgemm(void * usr,uint32_t idx)192 static void walk_2d_sgemm(void *usr, uint32_t idx) {
193     MTLaunchStructForEachBlas *mtls = (MTLaunchStructForEachBlas *)usr;
194     RsBlasCall* call = (RsBlasCall*) mtls->sc;
195 
196     float alpha = call->alpha.f;
197     float beta = call->beta.f;
198 
199     walk_tiled_gemm<float, float, FnPtr_cblas_sgemm>(cblas_sgemm, alpha, beta, 1, call, mtls);
200 }
201 
202 // DGEMM callback
walk_2d_dgemm(void * usr,uint32_t idx)203 static void walk_2d_dgemm(void *usr, uint32_t idx) {
204     MTLaunchStructForEachBlas *mtls = (MTLaunchStructForEachBlas *)usr;
205     RsBlasCall* call = (RsBlasCall*) mtls->sc;
206 
207     double alpha = call->alpha.d;
208     double beta = call->beta.d;
209 
210     walk_tiled_gemm<double, double, FnPtr_cblas_dgemm>(cblas_dgemm, alpha, beta, 1, call, mtls);
211 }
212 
213 // CGEMM callback
walk_2d_cgemm(void * usr,uint32_t idx)214 static void walk_2d_cgemm(void *usr, uint32_t idx) {
215     MTLaunchStructForEachBlas *mtls = (MTLaunchStructForEachBlas *)usr;
216     RsBlasCall* call = (RsBlasCall*) mtls->sc;
217 
218     void * alpha = (void *)&call->alpha.c;
219     void * beta = (void *)&call->beta.c;
220 
221     walk_tiled_gemm<float, void *, FnPtr_cblas_cgemm>(cblas_cgemm, alpha, beta, 2, call, mtls);
222 }
223 
224 // ZGEMM callback
walk_2d_zgemm(void * usr,uint32_t idx)225 static void walk_2d_zgemm(void *usr, uint32_t idx) {
226     MTLaunchStructForEachBlas *mtls = (MTLaunchStructForEachBlas *)usr;
227     RsBlasCall* call = (RsBlasCall*) mtls->sc;
228 
229     void * alpha = (void *)&call->alpha.z;
230     void * beta = (void *)&call->beta.z;
231 
232     walk_tiled_gemm<double, void *, FnPtr_cblas_zgemm>(cblas_zgemm, alpha, beta, 2, call, mtls);
233 }
234 
235 
invokeForEach(uint32_t slot,const Allocation ** ain,uint32_t inLen,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)236 void RsdCpuScriptIntrinsicBLAS::invokeForEach(uint32_t slot,
237                                               const Allocation ** ain,
238                                               uint32_t inLen,
239                                               Allocation * aout,
240                                               const void * usr,
241                                               uint32_t usrLen,
242                                               const RsScriptCall *sc) {
243     RsBlasCall* call = (RsBlasCall*) usr;
244     // setup BLAS enum args
245     enum CBLAS_TRANSPOSE TransA = (enum CBLAS_TRANSPOSE)call->transA;
246     enum CBLAS_TRANSPOSE TransB = (enum CBLAS_TRANSPOSE)call->transB;
247     enum CBLAS_UPLO Uplo = (enum CBLAS_UPLO)call->uplo;
248     enum CBLAS_DIAG Diag = (enum CBLAS_DIAG)call->diag;
249     enum CBLAS_SIDE Side = (enum CBLAS_SIDE)call->side;
250 
251     void *A = nullptr;
252     void *B = nullptr;
253     void *C = nullptr;
254     void *X = nullptr;
255     void *Y = nullptr;
256 
257     int lda = 0, ldb = 0, ldc = 0;
258 
259     MTLaunchStructForEachBlas mtls;
260 
261 #ifdef RS_COMPATIBILITY_LIB
262     // Allow BNNM even without libblas
263     if (call->func != RsBlas_bnnm && !isBlasLibInitialized) {
264         if (!loadBLASLib()) {
265             ALOGE("Failed to load the BLAS lib, IntrinsicBLAS NOT supported!\n");
266             return;
267         }
268         isBlasLibInitialized = true;
269     }
270 #endif
271 
272     switch (call->func) {
273 
274     // Level 1 BLAS: returns into a 1D Allocation
275 
276 
277     // Level 2 BLAS
278     case (RsBlas_sgemv):
279         initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
280         cblas_sgemv(CblasRowMajor, TransA, call->M, call->N, call->alpha.f, (float*)A,
281                     lda, (float*)X, call->incX, call->beta.f, (float*)Y, call->incY);
282         break;
283     case (RsBlas_sgbmv):
284         initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
285         cblas_sgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
286                     call->alpha.f, (float*)A, lda, (float*)X, call->incX,
287                     call->beta.f, (float*)Y, call->incY);
288         break;
289     case (RsBlas_strmv):
290         initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
291         cblas_strmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A,
292                     lda, (float*)X, call->incX);
293         break;
294     case (RsBlas_stbmv):
295         initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
296         cblas_stbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (float*)A,
297                     lda, (float*)X, call->incX);
298         break;
299     // stpmv takes a packed 1D Allocation only
300     case (RsBlas_stpmv):
301         initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
302         cblas_stpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A,
303                     (float*)X, call->incX);
304         break;
305     case (RsBlas_strsv):
306         initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
307         cblas_strsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A, lda,
308                     (float*)X, call->incX);
309         break;
310     case (RsBlas_stbsv):
311         initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
312         cblas_stbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (float*)A,
313                     lda, (float*)X, call->incX);
314         break;
315     case (RsBlas_stpsv):
316         initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
317         cblas_stpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A,
318                     (float*)X, call->incX);
319         break;
320     case (RsBlas_dgemv):
321         initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
322         cblas_dgemv(CblasRowMajor, TransA, call->M, call->N, call->alpha.d, (double*)A,
323                     lda, (double*)X, call->incX, call->beta.d, (double*)Y, call->incY);
324         break;
325     case (RsBlas_dgbmv):
326         initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
327         cblas_dgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
328                     call->alpha.d, (double*)A, lda, (double*)X, call->incX,
329                     call->beta.d, (double*)Y, call->incY);
330         break;
331     case (RsBlas_dtrmv):
332         initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
333         cblas_dtrmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A,
334                     lda, (double*)X, call->incX);
335         break;
336     case (RsBlas_dtbmv):
337         initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
338         cblas_dtbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (double*)A,
339                     lda, (double*)X, call->incX);
340         break;
    // dtpmv takes a packed 1D Allocation only
342     case (RsBlas_dtpmv):
343         initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
344         cblas_dtpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A,
345                     (double*)X, call->incX);
346         break;
347     case (RsBlas_dtrsv):
348         initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
349         cblas_dtrsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A, lda,
350                     (double*)X, call->incX);
351         break;
352     case (RsBlas_dtbsv):
353         initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
354         cblas_dtbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (double*)A,
355                     lda, (double*)X, call->incX);
356         break;
357     case (RsBlas_dtpsv):
358         initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
359         cblas_dtpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A,
360                     (double*)X, call->incX);
361         break;
362     case (RsBlas_cgemv):
363         initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
364         cblas_cgemv(CblasRowMajor, TransA, call->M, call->N, (void*)&call->alpha.c, (void*)A,
365                     lda, (void*)X, call->incX, (void*)&call->beta.c, (void*)Y, call->incY);
366         break;
367     case (RsBlas_cgbmv):
368         initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
369         cblas_cgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
370                     (void*)&call->alpha.c, (void*)A, lda, (void*)X, call->incX,
371                     (void*)&call->beta.c, (void*)Y, call->incY);
372         break;
373     case (RsBlas_ctrmv):
374         initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
375         cblas_ctrmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
376                     lda, (void*)X, call->incX);
377         break;
378     case (RsBlas_ctbmv):
379         initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
380         cblas_ctbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
381                     lda, (void*)X, call->incX);
382         break;
    // ctpmv takes a packed 1D Allocation only
384     case (RsBlas_ctpmv):
385         initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
386         cblas_ctpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
387                     (void*)X, call->incX);
388         break;
389     case (RsBlas_ctrsv):
390         initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
391         cblas_ctrsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A, lda,
392                     (void*)X, call->incX);
393         break;
394     case (RsBlas_ctbsv):
395         initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
396         cblas_ctbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
397                     lda, (void*)X, call->incX);
398         break;
399     case (RsBlas_ctpsv):
400         initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
401         cblas_ctpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
402                     (void*)X, call->incX);
403         break;
404     case (RsBlas_zgemv):
405         initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
406         cblas_zgemv(CblasRowMajor, TransA, call->M, call->N, (void*)&call->alpha.z, (void*)A,
407                     lda, (void*)X, call->incX, (void*)&call->beta.z, (void*)Y, call->incY);
408         break;
409     case (RsBlas_zgbmv):
410         initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
411         cblas_zgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
412                     (void*)&call->alpha.z, (void*)A, lda, (void*)X, call->incX,
413                     (void*)&call->beta.z, (void*)Y, call->incY);
414         break;
415     case (RsBlas_ztrmv):
416         initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
417         cblas_ztrmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
418                     lda, (void*)X, call->incX);
419         break;
420     case (RsBlas_ztbmv):
421         initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
422         cblas_ztbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
423                     lda, (void*)X, call->incX);
424         break;
    // ztpmv takes a packed 1D Allocation only
426     case (RsBlas_ztpmv):
427         initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
428         cblas_ztpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
429                     (void*)X, call->incX);
430         break;
431     case (RsBlas_ztrsv):
432         initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
433         cblas_ztrsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A, lda,
434                     (void*)X, call->incX);
435         break;
436     case (RsBlas_ztbsv):
437         initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
438         cblas_ztbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
439                     lda, (void*)X, call->incX);
440         break;
441     case (RsBlas_ztpsv):
442         initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
443         cblas_ztpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
444                     (void*)X, call->incX);
445         break;
446 
447 
448     // S and D only
449     case (RsBlas_ssymv):
450         initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
451         cblas_ssymv(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)A, lda,
452                     (float*)X, call->incX, call->beta.f, (float*)Y, call->incY);
453         break;
454     case (RsBlas_ssbmv):
455         initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
456         cblas_ssbmv(CblasRowMajor, Uplo, call->N, call->K, call->alpha.f,
457                     (float*)A, lda, (float*)X, call->incX, call->beta.f,
458                     (float*)Y, call->incY);
459         break;
460     //sspmv requires a packed 1D Allocation
461     case (RsBlas_sspmv):
462         initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
463         cblas_sspmv(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)A,
464                     (float*)X, call->incX, call->beta.f, (float*)Y, call->incY);
465         break;
466     // following calls have init reordered because A is output matrix
467     case (RsBlas_sger):
468         initABC(ain, sizeof(float), &X, &Y, &A, &ldb, &ldc, &lda);
469         cblas_sger(CblasRowMajor, call->M, call->N, call->alpha.f, (float*)X,
470                    call->incX, (float*)Y, call->incY, (float*)A, lda);
471         break;
472     case (RsBlas_ssyr):
473         initABC(ain, sizeof(float), &X, &A, nullptr, &ldb, &lda, nullptr);
474         cblas_ssyr(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
475                    (float*)A, lda);
476         break;
477     // sspr is packed 1D Allocation A only
478     case (RsBlas_sspr):
479         initABC(ain, sizeof(float), &X, &A, nullptr, &ldb, &lda, nullptr);
480         cblas_sspr(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
481                    (float*)A);
482         break;
483     case (RsBlas_ssyr2):
484         initABC(ain, sizeof(float), &X, &Y, &A, &ldb, &ldc, &lda);
485         cblas_ssyr2(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
486                     (float*)Y, call->incY, (float*)A, lda);
487         break;
488     // sspr2 is packed 1D Allocation A only
489     case (RsBlas_sspr2):
490         initABC(ain, sizeof(float), &X, &Y, &A, &ldb, &ldc, &lda);
491         cblas_sspr2(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
492                     (float*)Y, call->incY, (float*)A);
493         break;
494     case (RsBlas_dsymv):
495         initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
496         cblas_dsymv(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)A, lda,
497                     (double*)X, call->incX, call->beta.d, (double*)Y, call->incY);
498         break;
499     case (RsBlas_dsbmv):
500         initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
501         cblas_dsbmv(CblasRowMajor, Uplo, call->N, call->K, call->alpha.d,
502                     (double*)A, lda, (double*)X, call->incX, call->beta.d,
503                     (double*)Y, call->incY);
504         break;
505     // dspmv requires a packed 1D Allocation
506     case (RsBlas_dspmv):
507         initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
508         cblas_dspmv(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)A,
509                     (double*)X, call->incX, call->beta.d, (double*)Y, call->incY);
510         break;
511     // following calls have init reordered because A is output matrix
512     case (RsBlas_dger):
513         initABC(ain, sizeof(double), &X, &Y, &A, &ldb, &ldc, &lda);
514         cblas_dger(CblasRowMajor, call->M, call->N, call->alpha.d, (double*)X,
515                    call->incX, (double*)Y, call->incY, (double*)A, lda);
516         break;
517     case (RsBlas_dsyr):
518         initABC(ain, sizeof(double), &X, &A, nullptr, &ldb, &lda, nullptr);
519         cblas_dsyr(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
520                    (double*)A, lda);
521         break;
522     // dspr is packed 1D Allocation A only
523     case (RsBlas_dspr):
524         initABC(ain, sizeof(double), &X, &A, nullptr, &ldb, &lda, nullptr);
525         cblas_dspr(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
526                    (double*)A);
527         break;
528     case (RsBlas_dsyr2):
529         initABC(ain, sizeof(double), &X, &Y, &A, &ldb, &ldc, &lda);
530         cblas_dsyr2(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
531                     (double*)Y, call->incY, (double*)A, lda);
532         break;
533     // dspr2 is packed 1D Allocation A only
534     case (RsBlas_dspr2):
535         initABC(ain, sizeof(double), &X, &Y, &A, &ldb, &ldc, &lda);
536         cblas_dspr2(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
537                     (double*)Y, call->incY, (double*)A);
538         break;
539 
540     // C and Z only
541     case (RsBlas_chemv):
542         initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
543         cblas_chemv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, A, lda,
544                     X, call->incX, (void*)&call->beta.c, Y, call->incY);
545         break;
546     case (RsBlas_chbmv):
547         initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
548         cblas_chbmv(CblasRowMajor, Uplo, call->N, call->K, (void*)&call->alpha.c,
549                     A, lda, X, call->incX, (void*)&call->beta.c, Y, call->incY);
550         break;
551     case (RsBlas_chpmv):
552         initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
553         cblas_chpmv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, A,
554                     X, call->incX, (void*)&call->beta.c, Y, call->incY);
555         break;
556     case (RsBlas_cgeru):
557         initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
558         cblas_cgeru(CblasRowMajor, call->M, call->N, (void*)&call->alpha.c,
559                     X, call->incX, Y, call->incY, A, lda);
560         break;
561     case (RsBlas_cgerc):
562         initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
563         cblas_cgerc(CblasRowMajor, call->M, call->N, (void*)&call->alpha.c,
564                     X, call->incX, Y, call->incY, A, lda);
565         break;
566     case (RsBlas_cher):
567         initABC(ain, sizeof(float)*2, &X, nullptr, &A, &ldb, nullptr, &lda);
568         cblas_cher(CblasRowMajor, Uplo, call->N, call->alpha.f,
569                    X, call->incX, A, lda);
570         break;
571     // packed 1D Allocations only
572     case (RsBlas_chpr):
573         initABC(ain, sizeof(float)*2, &X, nullptr, &A, &ldb, nullptr, &lda);
574         cblas_chpr(CblasRowMajor, Uplo, call->N, call->alpha.f, X,
575                    call->incX, A);
576         break;
577     case (RsBlas_cher2):
578         initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
579         cblas_cher2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c,
580                    X, call->incX, Y, call->incY, A, lda);
581         break;
582     // packed 1D Allocations only
583     case (RsBlas_chpr2):
584         initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
585         cblas_chpr2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, X,
586                    call->incX, Y, call->incY, A);
587         break;
588     case (RsBlas_zhemv):
589         initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
590         cblas_zhemv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, A, lda,
591                     X, call->incX, (void*)&call->beta.z, Y, call->incY);
592         break;
593     case (RsBlas_zhbmv):
594         initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
595         cblas_zhbmv(CblasRowMajor, Uplo, call->N, call->K, (void*)&call->alpha.z,
596                     A, lda, X, call->incX, (void*)&call->beta.z, Y, call->incY);
597         break;
598     case (RsBlas_zhpmv):
599         initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
600         cblas_zhpmv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, A,
601                     X, call->incX, (void*)&call->beta.z, Y, call->incY);
602         break;
603     case (RsBlas_zgeru):
604         initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
605         cblas_zgeru(CblasRowMajor, call->M, call->N, (void*)&call->alpha.z,
606                     X, call->incX, Y, call->incY, A, lda);
607         break;
608     case (RsBlas_zgerc):
609         initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
610         cblas_zgerc(CblasRowMajor, call->M, call->N, (void*)&call->alpha.z,
611                     X, call->incX, Y, call->incY, A, lda);
612         break;
613     case (RsBlas_zher):
614         initABC(ain, sizeof(double)*2, &X, nullptr, &A, &ldb, nullptr, &lda);
615         cblas_zher(CblasRowMajor, Uplo, call->N, call->alpha.d,
616                    X, call->incX, A, lda);
617         break;
618     // packed 1D Allocations only
619     case (RsBlas_zhpr):
620         initABC(ain, sizeof(double)*2, &X, nullptr, &A, &ldb, nullptr, &lda);
621         cblas_zhpr(CblasRowMajor, Uplo, call->N, call->alpha.d, X,
622                    call->incX, A);
623         break;
624     case (RsBlas_zher2):
625         initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
626         cblas_zher2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z,
627                    X, call->incX, Y, call->incY, A, lda);
628         break;
629     // packed 1D Allocations only
630     case (RsBlas_zhpr2):
631         initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
632         cblas_zhpr2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, X,
633                    call->incX, Y, call->incY, A);
634         break;
635 
636     // Level 3 BLAS
637     case (RsBlas_sgemm):
638         setupGEMM(&mtls, ain, call, mCtx);
639         if (mtls.isThreadable) {
640             mCtx->launchThreads(walk_2d_sgemm, &mtls);
641         } else {
642             initABC(ain, sizeof(float), &A, &B, &C, &lda, &ldb, &ldc);
643             cblas_sgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, call->alpha.f,
644                         (float*)A, lda, (float*)B, ldb, call->beta.f, (float*)C, ldc);
645         }
646         break;
647     case (RsBlas_ssymm):
648         initABC(ain, sizeof(float), &A, &B, &C, &lda, &ldb, &ldc);
649         cblas_ssymm(CblasRowMajor, Side, Uplo, call->M, call->N, call->alpha.f, (float*)A,
650                     lda, (float*)B, ldb, call->beta.f, (float*)C, ldc);
651         break;
652     case (RsBlas_ssyrk):
653         initABC(ain, sizeof(float), &A, nullptr, &C, &lda, nullptr, &ldc);
654         cblas_ssyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.f, (float*)A,
655                     lda, call->beta.f, (float*)C, ldc);
656         break;
657     case (RsBlas_ssyr2k):
658         initABC(ain, sizeof(float), &A, &B, &C, &lda, &ldb, &ldc);
659         cblas_ssyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.f, (float*)A,
660                      lda, (float*)B, ldb, call->beta.f, (float*)C, ldc);
661         break;
662     case (RsBlas_strmm):
663         initABC(ain, sizeof(float), &A, &B, nullptr, &lda, &ldb, nullptr);
664         cblas_strmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.f,
665                     (float*)A, lda, (float*)B, ldb);
666         break;
667     case (RsBlas_strsm):
668         initABC(ain, sizeof(float), &A, &B, nullptr, &lda, &ldb, nullptr);
669         cblas_strsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.f,
670                     (float*)A, lda, (float*)B, ldb);
671         break;
672 
673 
674     case (RsBlas_dgemm):
675         setupGEMM(&mtls, ain, call, mCtx);
676         if (mtls.isThreadable) {
677             mCtx->launchThreads(walk_2d_dgemm, &mtls);
678         } else {
679             initABC(ain, sizeof(double), &A, &B, &C, &lda, &ldb, &ldc);
680             cblas_dgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, call->alpha.d,
681                         (double*)A, lda, (double*)B, ldb, call->beta.d, (double*)C, ldc);
682         }
683         break;
684     case (RsBlas_dsymm):
685         initABC(ain, sizeof(double), &A, &B, &C, &lda, &ldb, &ldc);
686         cblas_dsymm(CblasRowMajor, Side, Uplo, call->M, call->N, call->alpha.d, (double*)A,
687                     lda, (double*)B, ldb, call->beta.d, (double*)C, ldc);
688         break;
689     case (RsBlas_dsyrk):
690         initABC(ain, sizeof(double), &A, nullptr, &C, &lda, nullptr, &ldc);
691         cblas_dsyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.d, (double*)A,
692                     lda, call->beta.d, (double*)C, ldc);
693         break;
694     case (RsBlas_dsyr2k):
695         initABC(ain, sizeof(double), &A, &B, &C, &lda, &ldb, &ldc);
696         cblas_dsyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.d, (double*)A,
697                      lda, (double*)B, ldb, call->beta.d, (double*)C, ldc);
698         break;
699     case (RsBlas_dtrmm):
700         initABC(ain, sizeof(double), &A, &B, nullptr, &lda, &ldb, nullptr);
701         cblas_dtrmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.d,
702                     (double*)A, lda, (double*)B, ldb);
703         break;
704     case (RsBlas_dtrsm):
705         initABC(ain, sizeof(double), &A, &B, nullptr, &lda, &ldb, nullptr);
706         cblas_dtrsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.d,
707                     (double*)A, lda, (double*)B, ldb);
708         break;
709 
710     case (RsBlas_cgemm):
711         setupGEMM(&mtls, ain, call, mCtx);
712         if (mtls.isThreadable) {
713             mCtx->launchThreads(walk_2d_cgemm, &mtls);
714         } else {
715             initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
716             cblas_cgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, (void*)&call->alpha.c,
717                         A, lda, B, ldb, (void*)&call->beta.c, C, ldc);
718         }
719         break;
720     case (RsBlas_csymm):
721         initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
722         cblas_csymm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.c, A,
723                     lda, B, ldb, (void*)&call->beta.c, C, ldc);
724         break;
725     case (RsBlas_csyrk):
726         initABC(ain, sizeof(float)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
727         cblas_csyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.c, A,
728                     lda, (void*)&call->beta.c, C, ldc);
729         break;
730     case (RsBlas_csyr2k):
731         initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
732         cblas_csyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.c, A,
733                      lda, B, ldb, (void*)&call->beta.c, C, ldc);
734         break;
735     case (RsBlas_ctrmm):
736         initABC(ain, sizeof(float)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
737         cblas_ctrmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.c,
738                     A, lda, B, ldb);
739         break;
740     case (RsBlas_ctrsm):
741         initABC(ain, sizeof(float)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
742         cblas_ctrsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.c,
743                     A, lda, B, ldb);
744         break;
745 
746     case (RsBlas_zgemm):
747         setupGEMM(&mtls, ain, call, mCtx);
748         if (mtls.isThreadable) {
749             mCtx->launchThreads(walk_2d_zgemm, &mtls);
750         } else {
751             initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
752             cblas_zgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, (void*)&call->alpha.z,
753                         A, lda, B, ldb, (void*)&call->beta.z, C, ldc);
754         }
755         break;
756     case (RsBlas_zsymm):
757         initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
758         cblas_zsymm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.z, A,
759                     lda, B, ldb, (void*)&call->beta.z, C, ldc);
760         break;
761     case (RsBlas_zsyrk):
762         initABC(ain, sizeof(double)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
763         cblas_zsyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.z, A,
764                     lda, (void*)&call->beta.z, C, ldc);
765         break;
766     case (RsBlas_zsyr2k):
767         initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
768         cblas_zsyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.z, A,
769                      lda, B, ldb, (void*)&call->beta.z, C, ldc);
770         break;
771     case (RsBlas_ztrmm):
772         initABC(ain, sizeof(double)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
773         cblas_ztrmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.z,
774                     A, lda, B, ldb);
775         break;
776     case (RsBlas_ztrsm):
777         initABC(ain, sizeof(double)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
778         cblas_ztrsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.z,
779                     A, lda, B, ldb);
780         break;
781 
782     // Level 3 C and Z only
783     case (RsBlas_chemm):
784         initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
785         cblas_chemm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.c, A, lda,
786                     B, ldb, (void*)&call->beta.c, C, ldc);
787         break;
788     case (RsBlas_cherk):
789         initABC(ain, sizeof(float)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
790         cblas_cherk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.f, A, lda,
791                     call->beta.f, C, ldc);
792         break;
793     case (RsBlas_cher2k):
794         initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
795         cblas_cher2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.c, A, lda,
796                      B, ldb, call->beta.f, C, ldc);
797         break;
798 
799     case (RsBlas_zhemm):
800         initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
801         cblas_zhemm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.z, A, lda,
802                     B, ldb, (void*)&call->beta.z, C, ldc);
803         break;
804     case (RsBlas_zherk):
805         initABC(ain, sizeof(double)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
806         cblas_zherk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.d, A, lda,
807                     call->beta.d, C, ldc);
808         break;
809     case (RsBlas_zher2k):
810         initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
811         cblas_zher2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.z, A, lda,
812                      B, ldb, call->beta.d, C, ldc);
813         break;
814 
815 
816     case (RsBlas_bnnm):
817         initABC(ain, sizeof(uint8_t), &A, &B, &C, &lda, &ldb, &ldc);
818         kernelBNNM(call->M, call->N, call->K,
819                     (const uint8_t*)A, call->a_offset, lda,
820                     (const uint8_t*)B, call->b_offset, ldb,
821                     (uint8_t*)C, call->c_offset, ldc,
822                     call->c_mult_int);
823 
824         break;
825 
826     default:
827         ALOGE("unimplemented\n");
828     }
829 
830 
831 }
832 
kernelBNNM(size_t m,size_t n,size_t k,const uint8_t * a,uint8_t a_offset,size_t lda,const uint8_t * b,uint8_t b_offset,size_t ldb,uint8_t * c,int32_t c_offset,size_t ldc,int32_t c_mult_int)833 void RsdCpuScriptIntrinsicBLAS::kernelBNNM(size_t m, size_t n, size_t k,
834                                            const uint8_t* a, uint8_t a_offset, size_t lda,
835                                            const uint8_t* b, uint8_t b_offset, size_t ldb,
836                                            uint8_t* c, int32_t c_offset, size_t ldc,
837                                            int32_t c_mult_int) {
838     const int c_shift = 21;
839 #if defined(ARCH_ARM_HAVE_VFP) || defined(ARCH_ARM_USE_INTRINSICS)
840     // Non-optimized path for ARMv7 devices without SIMD instructions.
841     if (!gArchUseSIMD) {
842         /*
843          * Calculations are done in 1.10.21 fixed-point format for the final output,
844          * just before there's a shift down to drop the fractional parts. The output
845          * values are gated to 0 to 255 to fit in a byte, but the 10-bit format
846          * gives some headroom to avoid wrapping around on small overflows.
847          */
848         size_t i = 0, j = 0, l = 0;
849         for (j = 0; j < n; j++) {
850             for (i = 0; i < m; i++) {
851                 int32_t total = 0;
852                 for (l = 0; l < k; l++) {
853                     const int a_index = ((i * lda) + l);
854                     const uint8_t a_as_byte = a[a_index];
855                     const int32_t a_as_int = (((int32_t)(a_as_byte)) - a_offset);
856                     const int b_index = ((j * ldb) + l);
857                     const uint8_t b_as_byte = b[b_index];
858                     const int32_t b_as_int = (((int32_t)(b_as_byte)) - b_offset);
859                     const int32_t mult_as_int = (a_as_int * b_as_int);
860                     total += mult_as_int;
861                 }
862                 const int c_index = ((ldc * i) + j);
863                 int32_t output =
864                     ((((total + c_offset) * c_mult_int) + (1 << (c_shift - 1)))
865                      >> c_shift);
866                 if (output > 255) {
867                     output = 255;
868                 }
869                 if (output < 0) {
870                     output = 0;
871                 }
872                 c[c_index] = (uint8_t)(output);
873             }
874         }
875         return;
876     }
877 #endif
878 
879     // Using gemmlowp to calculate the low precision 8 bit GEMM.
880     // Set MaxNumThreads to 0. The value 0 lets the implementation query
881     // the system to determine the number of hardware threads
882     gemmlowp::eight_bit_int_gemm::SetMaxNumThreads(0);
883 
884     bool transpose_a = true;
885     bool transpose_b = false;
886     bool transpose_c = true;
887     gemmlowp::eight_bit_int_gemm::EightBitIntGemm(transpose_a, transpose_b, transpose_c,
888                                                   m, n, k, a, -a_offset, lda,
889                                                   b, -b_offset, ldb, c, c_offset,
890                                                   c_mult_int, c_shift, ldc,
891                                                   gemmlowp::eight_bit_int_gemm::BitDepthSetting::A8B8);
892 
893 }
894 
895 
896 
897 
898 
RsdCpuScriptIntrinsicBLAS(RsdCpuReferenceImpl * ctx,const Script * s)899 RsdCpuScriptIntrinsicBLAS::RsdCpuScriptIntrinsicBLAS(RsdCpuReferenceImpl *ctx,
900                                                    const Script *s)
901             : RsdCpuScriptIntrinsic(ctx, s, nullptr, RS_SCRIPT_INTRINSIC_ID_BLAS) {
902 
903 
904 }
905 
~RsdCpuScriptIntrinsicBLAS()906 RsdCpuScriptIntrinsicBLAS::~RsdCpuScriptIntrinsicBLAS() {
907 }
908 
rsdIntrinsic_BLAS(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)909 RsdCpuScriptImpl * rsdIntrinsic_BLAS(RsdCpuReferenceImpl *ctx,
910                                     const Script *s, const Element *e) {
911 
912     return new RsdCpuScriptIntrinsicBLAS(ctx, s);
913 }
914 
915 } // namespace renderscript
916 } // namespace android
917