/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "rsCpuIntrinsic.h"
#include "rsCpuIntrinsicInlines.h"
#include "rsCpuBLASDispatch.h"
#include "eight_bit_int_gemm.h"

namespace android {
namespace renderscript {

// CPU reference implementation of the BLAS script intrinsic. invokeForEach()
// dispatches on RsBlasCall::func to the matching cblas_* routine, or to
// kernelBNNM() for the 8-bit quantized matrix multiply.
class RsdCpuScriptIntrinsicBLAS : public RsdCpuScriptIntrinsic {
public:
    void invokeForEach(uint32_t slot,
                       const Allocation ** ain,
                       uint32_t inLen,
                       Allocation * aout,
                       const void * usr,
                       uint32_t usrLen,
                       const RsScriptCall *sc) override;
    void populateScript(Script *) override;
    ~RsdCpuScriptIntrinsicBLAS() override;
    RsdCpuScriptIntrinsicBLAS(RsdCpuReferenceImpl *ctx, const Script *s);

protected:
    uint8_t a_offset = 0;
    uint8_t b_offset = 0;
    uint8_t c_offset = 0;

#ifdef RS_COMPATIBILITY_LIB
    // Set once loadBLASLib() has succeeded, so the library is loaded only once.
    bool isBlasLibInitialized = false;
#endif
    static void kernelBNNM(size_t m, size_t n, size_t k,
                           const uint8_t* a, uint8_t a_offset, size_t lda,
                           const uint8_t* b, uint8_t b_offset, size_t ldb,
                           uint8_t* c, int32_t c_offset, size_t ldc,
                           int32_t c_mult_int);
};

// The intrinsic exports no script variables.
void RsdCpuScriptIntrinsicBLAS::populateScript(Script *s) {
    s->mHal.info.exportedVariableCount = 0;
}

// Fetches the base pointer and leading dimension (row stride in elements of
// `size` bytes) of LOD 0 of one allocation. Does nothing when `alloc` is
// null; each output is written only when its pointer is non-null.
static void initOperand(const Allocation *alloc, size_t size, void **ptr, int *ld) {
    if (alloc == nullptr) {
        return;
    }
    if (ptr != nullptr) {
        *ptr = alloc->mHal.drvState.lod[0].mallocPtr;
    }
    if (ld != nullptr) {
        *ld = (int)(alloc->mHal.drvState.lod[0].stride / size);
    }
}

// Resolves up to three input allocations into data pointers and leading
// dimensions.
//
//   ain   array of (at least) three allocation pointers; entries may be null
//   size  size in bytes of one matrix element
//   A/B/C receive the data pointer of ain[0]/ain[1]/ain[2]
//   lda/ldb/ldc receive the row stride of ain[0]/ain[1]/ain[2], in elements
//
// Outputs that belong to a null allocation are left untouched. Callers pass
// nullptr for the outputs of operands they do not use, so every output
// pointer is checked before it is written.
static void initABC(const Allocation ** ain,
                    size_t size,
                    void** A,
                    void** B,
                    void** C,
                    int* lda,
                    int* ldb,
                    int* ldc) {
    initOperand(ain[0], size, A, lda);
    initOperand(ain[1], size, B, ldb);
    initOperand(ain[2], size, C, ldc);
}

// Routine to setup LaunchStruct for GEMM callback.
//
// Zeroes *mtls, records the context, the call and the M/N/K dimensions, and
// decides whether the product should be split into tiles that worker threads
// can process independently. On return mtls->isThreadable tells the caller
// whether to launch threads or to make a single cblas call.
static void setupGEMM(MTLaunchStructForEachBlas *mtls, const Allocation **ain,
                      RsBlasCall* call, RsdCpuReferenceImpl *ctx) {
    const uint32_t mm = call->M;
    const uint32_t nn = call->N;
    const uint32_t kk = call->K;

    memset(mtls, 0, sizeof(MTLaunchStructForEachBlas));
    mtls->rs        = ctx;
    mtls->sc        = call;
    mtls->dimPtr    = &mtls->fep.dim;
    mtls->fep.dim.x = nn;
    mtls->fep.dim.y = mm;
    mtls->fep.dim.z = kk;

    // Element size drives the tiling thresholds. Fall back to 4 bytes when it
    // cannot be determined; a zero size is rejected so the divisions below
    // can never divide by zero.
    uint32_t elementBytes = 4;
    if (ain) {
        memcpy(mtls->ains, ain, 3 * sizeof(ain[0]));
        if (ain[0]) {
            const uint32_t bytes = ain[0]->getType()->getElement()->getSizeBytes();
            if (bytes != 0) {
                elementBytes = bytes;
            }
        }
    }

    const uint32_t MIN_SIZE_TO_TILE = 64 * 1024 / elementBytes;
    const uint32_t MAX_WORK_PER_THREAD = 512 / elementBytes;
    const uint32_t THREAD_COUNT = ctx->getThreadCount();
    uint32_t tileSizeN = 0;
    uint32_t tileSizeM = 0;

    // Do not tile the matrix if:
    // 1. It is too small compared to the other matrix.
    // 2. It is too small compared to MIN_SIZE_TO_TILE.
    // The products are evaluated in 64 bits so that very large dimensions
    // cannot wrap around and wrongly disable tiling.
    if ((uint64_t)nn * kk > MIN_SIZE_TO_TILE && (uint64_t)nn * THREAD_COUNT > mm) {
        tileSizeN = rsMin(nn / THREAD_COUNT, MAX_WORK_PER_THREAD);
    }
    if ((uint64_t)mm * kk > MIN_SIZE_TO_TILE && (uint64_t)mm * THREAD_COUNT > nn) {
        tileSizeM = rsMin(mm / THREAD_COUNT, MAX_WORK_PER_THREAD);
    }

    // Default: a single tile covering the whole matrix in each dimension.
    mtls->numTileM = 1;
    mtls->numTileN = 1;
    mtls->tileSizeM = mm;
    mtls->tileSizeN = nn;

    // If tiling is needed, compute the number of slices for A & B.
    mtls->isThreadable = (tileSizeM > 0 || tileSizeN > 0);
    if (tileSizeM) {
        // 1 + (mm - 1) / tileSizeM == ceil(mm / tileSizeM)
        mtls->numTileM += (mm - 1) / tileSizeM;
        mtls->tileSizeM = tileSizeM;
    }
    if (tileSizeN) {
        mtls->numTileN += (nn - 1) / tileSizeN;
        mtls->tileSizeN = tileSizeN;
    }

    mtls->mSliceNum = 0;
}

// Generic GEMM callback routine.
template static void walk_tiled_gemm(Func blasFunc, T_param alpha, T_param beta, int vecSize, RsBlasCall* call, MTLaunchStructForEachBlas *mtls) { // setup BLAS enum args enum CBLAS_TRANSPOSE TransA = (enum CBLAS_TRANSPOSE)call->transA; enum CBLAS_TRANSPOSE TransB = (enum CBLAS_TRANSPOSE)call->transB; void *A = nullptr; void *B = nullptr; void *C = nullptr; int lda = 0, ldb = 0, ldc = 0; const Allocation *ain[RS_KERNEL_INPUT_LIMIT]; ain[0] = mtls->ains[0]; ain[1] = mtls->ains[1]; ain[2] = mtls->ains[2]; initABC(ain, sizeof(T_data) * vecSize, &A, &B, &C, &lda, &ldb, &ldc); // Determin the stride of the tiled matrices. int mStride = (TransA == CblasNoTrans) ? lda : 1; int nStride = (TransB == CblasNoTrans) ? 1 : ldb; while (1) { uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1); uint32_t mStart = (slice % mtls->numTileM) * mtls->tileSizeM; uint32_t mEnd = mStart + mtls->tileSizeM; mEnd = rsMin(mEnd, (uint32_t)call->M); if (mEnd <= mStart) { return; } uint32_t nStart = (slice / mtls->numTileM) * mtls->tileSizeN; uint32_t nEnd = nStart + mtls->tileSizeN; nEnd = rsMin(nEnd, (uint32_t)call->N); if (nEnd <= nStart) { return; } blasFunc(CblasRowMajor, TransA, TransB, mEnd - mStart, nEnd - nStart, call->K, alpha, (T_data *)A + mStart * mStride * vecSize, lda, (T_data *)B + nStart * nStride * vecSize, ldb, beta, (T_data *)C + (mStart * ldc + nStart) * vecSize, ldc); } } // SGEMM callback static void walk_2d_sgemm(void *usr, uint32_t idx) { MTLaunchStructForEachBlas *mtls = (MTLaunchStructForEachBlas *)usr; RsBlasCall* call = (RsBlasCall*) mtls->sc; float alpha = call->alpha.f; float beta = call->beta.f; walk_tiled_gemm(cblas_sgemm, alpha, beta, 1, call, mtls); } // DGEMM callback static void walk_2d_dgemm(void *usr, uint32_t idx) { MTLaunchStructForEachBlas *mtls = (MTLaunchStructForEachBlas *)usr; RsBlasCall* call = (RsBlasCall*) mtls->sc; double alpha = call->alpha.d; double beta = call->beta.d; walk_tiled_gemm(cblas_dgemm, alpha, beta, 1, call, mtls); } 
// CGEMM callback static void walk_2d_cgemm(void *usr, uint32_t idx) { MTLaunchStructForEachBlas *mtls = (MTLaunchStructForEachBlas *)usr; RsBlasCall* call = (RsBlasCall*) mtls->sc; void * alpha = (void *)&call->alpha.c; void * beta = (void *)&call->beta.c; walk_tiled_gemm(cblas_cgemm, alpha, beta, 2, call, mtls); } // ZGEMM callback static void walk_2d_zgemm(void *usr, uint32_t idx) { MTLaunchStructForEachBlas *mtls = (MTLaunchStructForEachBlas *)usr; RsBlasCall* call = (RsBlasCall*) mtls->sc; void * alpha = (void *)&call->alpha.z; void * beta = (void *)&call->beta.z; walk_tiled_gemm(cblas_zgemm, alpha, beta, 2, call, mtls); } void RsdCpuScriptIntrinsicBLAS::invokeForEach(uint32_t slot, const Allocation ** ain, uint32_t inLen, Allocation * aout, const void * usr, uint32_t usrLen, const RsScriptCall *sc) { RsBlasCall* call = (RsBlasCall*) usr; // setup BLAS enum args enum CBLAS_TRANSPOSE TransA = (enum CBLAS_TRANSPOSE)call->transA; enum CBLAS_TRANSPOSE TransB = (enum CBLAS_TRANSPOSE)call->transB; enum CBLAS_UPLO Uplo = (enum CBLAS_UPLO)call->uplo; enum CBLAS_DIAG Diag = (enum CBLAS_DIAG)call->diag; enum CBLAS_SIDE Side = (enum CBLAS_SIDE)call->side; void *A = nullptr; void *B = nullptr; void *C = nullptr; void *X = nullptr; void *Y = nullptr; int lda = 0, ldb = 0, ldc = 0; MTLaunchStructForEachBlas mtls; #ifdef RS_COMPATIBILITY_LIB // Allow BNNM even without libblas if (call->func != RsBlas_bnnm && !isBlasLibInitialized) { if (!loadBLASLib()) { ALOGE("Failed to load the BLAS lib, IntrinsicBLAS NOT supported!\n"); return; } isBlasLibInitialized = true; } #endif switch (call->func) { // Level 1 BLAS: returns into a 1D Allocation // Level 2 BLAS case (RsBlas_sgemv): initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc); cblas_sgemv(CblasRowMajor, TransA, call->M, call->N, call->alpha.f, (float*)A, lda, (float*)X, call->incX, call->beta.f, (float*)Y, call->incY); break; case (RsBlas_sgbmv): initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc); 
cblas_sgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU, call->alpha.f, (float*)A, lda, (float*)X, call->incX, call->beta.f, (float*)Y, call->incY); break; case (RsBlas_strmv): initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr); cblas_strmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A, lda, (float*)X, call->incX); break; case (RsBlas_stbmv): initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr); cblas_stbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (float*)A, lda, (float*)X, call->incX); break; // stpmv takes a packed 1D Allocation only case (RsBlas_stpmv): initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr); cblas_stpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A, (float*)X, call->incX); break; case (RsBlas_strsv): initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr); cblas_strsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A, lda, (float*)X, call->incX); break; case (RsBlas_stbsv): initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr); cblas_stbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (float*)A, lda, (float*)X, call->incX); break; case (RsBlas_stpsv): initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr); cblas_stpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A, (float*)X, call->incX); break; case (RsBlas_dgemv): initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc); cblas_dgemv(CblasRowMajor, TransA, call->M, call->N, call->alpha.d, (double*)A, lda, (double*)X, call->incX, call->beta.d, (double*)Y, call->incY); break; case (RsBlas_dgbmv): initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc); cblas_dgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU, call->alpha.d, (double*)A, lda, (double*)X, call->incX, call->beta.d, (double*)Y, call->incY); break; case (RsBlas_dtrmv): initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr); cblas_dtrmv(CblasRowMajor, Uplo, TransA, Diag, 
call->N, (double*)A, lda, (double*)X, call->incX); break; case (RsBlas_dtbmv): initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr); cblas_dtbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (double*)A, lda, (double*)X, call->incX); break; // stpmv takes a packed 1D Allocation only case (RsBlas_dtpmv): initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr); cblas_dtpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A, (double*)X, call->incX); break; case (RsBlas_dtrsv): initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr); cblas_dtrsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A, lda, (double*)X, call->incX); break; case (RsBlas_dtbsv): initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr); cblas_dtbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (double*)A, lda, (double*)X, call->incX); break; case (RsBlas_dtpsv): initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr); cblas_dtpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A, (double*)X, call->incX); break; case (RsBlas_cgemv): initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc); cblas_cgemv(CblasRowMajor, TransA, call->M, call->N, (void*)&call->alpha.c, (void*)A, lda, (void*)X, call->incX, (void*)&call->beta.c, (void*)Y, call->incY); break; case (RsBlas_cgbmv): initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc); cblas_cgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU, (void*)&call->alpha.c, (void*)A, lda, (void*)X, call->incX, (void*)&call->beta.c, (void*)Y, call->incY); break; case (RsBlas_ctrmv): initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr); cblas_ctrmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A, lda, (void*)X, call->incX); break; case (RsBlas_ctbmv): initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr); cblas_ctbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A, lda, (void*)X, call->incX); break; // stpmv takes 
a packed 1D Allocation only case (RsBlas_ctpmv): initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr); cblas_ctpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A, (void*)X, call->incX); break; case (RsBlas_ctrsv): initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr); cblas_ctrsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A, lda, (void*)X, call->incX); break; case (RsBlas_ctbsv): initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr); cblas_ctbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A, lda, (void*)X, call->incX); break; case (RsBlas_ctpsv): initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr); cblas_ctpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A, (void*)X, call->incX); break; case (RsBlas_zgemv): initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc); cblas_zgemv(CblasRowMajor, TransA, call->M, call->N, (void*)&call->alpha.z, (void*)A, lda, (void*)X, call->incX, (void*)&call->beta.z, (void*)Y, call->incY); break; case (RsBlas_zgbmv): initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc); cblas_zgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU, (void*)&call->alpha.z, (void*)A, lda, (void*)X, call->incX, (void*)&call->beta.z, (void*)Y, call->incY); break; case (RsBlas_ztrmv): initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr); cblas_ztrmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A, lda, (void*)X, call->incX); break; case (RsBlas_ztbmv): initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr); cblas_ztbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A, lda, (void*)X, call->incX); break; // stpmv takes a packed 1D Allocation only case (RsBlas_ztpmv): initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr); cblas_ztpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A, (void*)X, call->incX); break; case (RsBlas_ztrsv): initABC(ain, sizeof(double)*2, &A, &X, nullptr, 
&lda, &ldb, nullptr); cblas_ztrsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A, lda, (void*)X, call->incX); break; case (RsBlas_ztbsv): initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr); cblas_ztbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A, lda, (void*)X, call->incX); break; case (RsBlas_ztpsv): initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr); cblas_ztpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A, (void*)X, call->incX); break; // S and D only case (RsBlas_ssymv): initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc); cblas_ssymv(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)A, lda, (float*)X, call->incX, call->beta.f, (float*)Y, call->incY); break; case (RsBlas_ssbmv): initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc); cblas_ssbmv(CblasRowMajor, Uplo, call->N, call->K, call->alpha.f, (float*)A, lda, (float*)X, call->incX, call->beta.f, (float*)Y, call->incY); break; //sspmv requires a packed 1D Allocation case (RsBlas_sspmv): initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc); cblas_sspmv(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)A, (float*)X, call->incX, call->beta.f, (float*)Y, call->incY); break; // following calls have init reordered because A is output matrix case (RsBlas_sger): initABC(ain, sizeof(float), &X, &Y, &A, &ldb, &ldc, &lda); cblas_sger(CblasRowMajor, call->M, call->N, call->alpha.f, (float*)X, call->incX, (float*)Y, call->incY, (float*)A, lda); break; case (RsBlas_ssyr): initABC(ain, sizeof(float), &X, &A, nullptr, &ldb, &lda, nullptr); cblas_ssyr(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX, (float*)A, lda); break; // sspr is packed 1D Allocation A only case (RsBlas_sspr): initABC(ain, sizeof(float), &X, &A, nullptr, &ldb, &lda, nullptr); cblas_sspr(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX, (float*)A); break; case (RsBlas_ssyr2): initABC(ain, sizeof(float), &X, &Y, &A, &ldb, &ldc, 
&lda); cblas_ssyr2(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX, (float*)Y, call->incY, (float*)A, lda); break; // sspr2 is packed 1D Allocation A only case (RsBlas_sspr2): initABC(ain, sizeof(float), &X, &Y, &A, &ldb, &ldc, &lda); cblas_sspr2(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX, (float*)Y, call->incY, (float*)A); break; case (RsBlas_dsymv): initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc); cblas_dsymv(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)A, lda, (double*)X, call->incX, call->beta.d, (double*)Y, call->incY); break; case (RsBlas_dsbmv): initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc); cblas_dsbmv(CblasRowMajor, Uplo, call->N, call->K, call->alpha.d, (double*)A, lda, (double*)X, call->incX, call->beta.d, (double*)Y, call->incY); break; // dspmv requires a packed 1D Allocation case (RsBlas_dspmv): initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc); cblas_dspmv(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)A, (double*)X, call->incX, call->beta.d, (double*)Y, call->incY); break; // following calls have init reordered because A is output matrix case (RsBlas_dger): initABC(ain, sizeof(double), &X, &Y, &A, &ldb, &ldc, &lda); cblas_dger(CblasRowMajor, call->M, call->N, call->alpha.d, (double*)X, call->incX, (double*)Y, call->incY, (double*)A, lda); break; case (RsBlas_dsyr): initABC(ain, sizeof(double), &X, &A, nullptr, &ldb, &lda, nullptr); cblas_dsyr(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX, (double*)A, lda); break; // dspr is packed 1D Allocation A only case (RsBlas_dspr): initABC(ain, sizeof(double), &X, &A, nullptr, &ldb, &lda, nullptr); cblas_dspr(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX, (double*)A); break; case (RsBlas_dsyr2): initABC(ain, sizeof(double), &X, &Y, &A, &ldb, &ldc, &lda); cblas_dsyr2(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX, (double*)Y, call->incY, (double*)A, lda); 
break; // dspr2 is packed 1D Allocation A only case (RsBlas_dspr2): initABC(ain, sizeof(double), &X, &Y, &A, &ldb, &ldc, &lda); cblas_dspr2(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX, (double*)Y, call->incY, (double*)A); break; // C and Z only case (RsBlas_chemv): initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc); cblas_chemv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, A, lda, X, call->incX, (void*)&call->beta.c, Y, call->incY); break; case (RsBlas_chbmv): initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc); cblas_chbmv(CblasRowMajor, Uplo, call->N, call->K, (void*)&call->alpha.c, A, lda, X, call->incX, (void*)&call->beta.c, Y, call->incY); break; case (RsBlas_chpmv): initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc); cblas_chpmv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, A, X, call->incX, (void*)&call->beta.c, Y, call->incY); break; case (RsBlas_cgeru): initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda); cblas_cgeru(CblasRowMajor, call->M, call->N, (void*)&call->alpha.c, X, call->incX, Y, call->incY, A, lda); break; case (RsBlas_cgerc): initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda); cblas_cgerc(CblasRowMajor, call->M, call->N, (void*)&call->alpha.c, X, call->incX, Y, call->incY, A, lda); break; case (RsBlas_cher): initABC(ain, sizeof(float)*2, &X, nullptr, &A, &ldb, nullptr, &lda); cblas_cher(CblasRowMajor, Uplo, call->N, call->alpha.f, X, call->incX, A, lda); break; // packed 1D Allocations only case (RsBlas_chpr): initABC(ain, sizeof(float)*2, &X, nullptr, &A, &ldb, nullptr, &lda); cblas_chpr(CblasRowMajor, Uplo, call->N, call->alpha.f, X, call->incX, A); break; case (RsBlas_cher2): initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda); cblas_cher2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, X, call->incX, Y, call->incY, A, lda); break; // packed 1D Allocations only case (RsBlas_chpr2): initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda); 
cblas_chpr2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, X, call->incX, Y, call->incY, A); break; case (RsBlas_zhemv): initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc); cblas_zhemv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, A, lda, X, call->incX, (void*)&call->beta.z, Y, call->incY); break; case (RsBlas_zhbmv): initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc); cblas_zhbmv(CblasRowMajor, Uplo, call->N, call->K, (void*)&call->alpha.z, A, lda, X, call->incX, (void*)&call->beta.z, Y, call->incY); break; case (RsBlas_zhpmv): initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc); cblas_zhpmv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, A, X, call->incX, (void*)&call->beta.z, Y, call->incY); break; case (RsBlas_zgeru): initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda); cblas_zgeru(CblasRowMajor, call->M, call->N, (void*)&call->alpha.z, X, call->incX, Y, call->incY, A, lda); break; case (RsBlas_zgerc): initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda); cblas_zgerc(CblasRowMajor, call->M, call->N, (void*)&call->alpha.z, X, call->incX, Y, call->incY, A, lda); break; case (RsBlas_zher): initABC(ain, sizeof(double)*2, &X, nullptr, &A, &ldb, nullptr, &lda); cblas_zher(CblasRowMajor, Uplo, call->N, call->alpha.d, X, call->incX, A, lda); break; // packed 1D Allocations only case (RsBlas_zhpr): initABC(ain, sizeof(double)*2, &X, nullptr, &A, &ldb, nullptr, &lda); cblas_zhpr(CblasRowMajor, Uplo, call->N, call->alpha.d, X, call->incX, A); break; case (RsBlas_zher2): initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda); cblas_zher2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, X, call->incX, Y, call->incY, A, lda); break; // packed 1D Allocations only case (RsBlas_zhpr2): initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda); cblas_zhpr2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, X, call->incX, Y, call->incY, A); break; // Level 3 BLAS case (RsBlas_sgemm): setupGEMM(&mtls, 
ain, call, mCtx); if (mtls.isThreadable) { mCtx->launchThreads(walk_2d_sgemm, &mtls); } else { initABC(ain, sizeof(float), &A, &B, &C, &lda, &ldb, &ldc); cblas_sgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, call->alpha.f, (float*)A, lda, (float*)B, ldb, call->beta.f, (float*)C, ldc); } break; case (RsBlas_ssymm): initABC(ain, sizeof(float), &A, &B, &C, &lda, &ldb, &ldc); cblas_ssymm(CblasRowMajor, Side, Uplo, call->M, call->N, call->alpha.f, (float*)A, lda, (float*)B, ldb, call->beta.f, (float*)C, ldc); break; case (RsBlas_ssyrk): initABC(ain, sizeof(float), &A, nullptr, &C, &lda, nullptr, &ldc); cblas_ssyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.f, (float*)A, lda, call->beta.f, (float*)C, ldc); break; case (RsBlas_ssyr2k): initABC(ain, sizeof(float), &A, &B, &C, &lda, &ldb, &ldc); cblas_ssyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.f, (float*)A, lda, (float*)B, ldb, call->beta.f, (float*)C, ldc); break; case (RsBlas_strmm): initABC(ain, sizeof(float), &A, &B, nullptr, &lda, &ldb, nullptr); cblas_strmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.f, (float*)A, lda, (float*)B, ldb); break; case (RsBlas_strsm): initABC(ain, sizeof(float), &A, &B, nullptr, &lda, &ldb, nullptr); cblas_strsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.f, (float*)A, lda, (float*)B, ldb); break; case (RsBlas_dgemm): setupGEMM(&mtls, ain, call, mCtx); if (mtls.isThreadable) { mCtx->launchThreads(walk_2d_dgemm, &mtls); } else { initABC(ain, sizeof(double), &A, &B, &C, &lda, &ldb, &ldc); cblas_dgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, call->alpha.d, (double*)A, lda, (double*)B, ldb, call->beta.d, (double*)C, ldc); } break; case (RsBlas_dsymm): initABC(ain, sizeof(double), &A, &B, &C, &lda, &ldb, &ldc); cblas_dsymm(CblasRowMajor, Side, Uplo, call->M, call->N, call->alpha.d, (double*)A, lda, (double*)B, ldb, call->beta.d, (double*)C, ldc); break; case 
(RsBlas_dsyrk): initABC(ain, sizeof(double), &A, nullptr, &C, &lda, nullptr, &ldc); cblas_dsyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.d, (double*)A, lda, call->beta.d, (double*)C, ldc); break; case (RsBlas_dsyr2k): initABC(ain, sizeof(double), &A, &B, &C, &lda, &ldb, &ldc); cblas_dsyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.d, (double*)A, lda, (double*)B, ldb, call->beta.d, (double*)C, ldc); break; case (RsBlas_dtrmm): initABC(ain, sizeof(double), &A, &B, nullptr, &lda, &ldb, nullptr); cblas_dtrmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.d, (double*)A, lda, (double*)B, ldb); break; case (RsBlas_dtrsm): initABC(ain, sizeof(double), &A, &B, nullptr, &lda, &ldb, nullptr); cblas_dtrsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.d, (double*)A, lda, (double*)B, ldb); break; case (RsBlas_cgemm): setupGEMM(&mtls, ain, call, mCtx); if (mtls.isThreadable) { mCtx->launchThreads(walk_2d_cgemm, &mtls); } else { initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc); cblas_cgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, (void*)&call->alpha.c, A, lda, B, ldb, (void*)&call->beta.c, C, ldc); } break; case (RsBlas_csymm): initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc); cblas_csymm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.c, A, lda, B, ldb, (void*)&call->beta.c, C, ldc); break; case (RsBlas_csyrk): initABC(ain, sizeof(float)*2, &A, nullptr, &C, &lda, nullptr, &ldc); cblas_csyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.c, A, lda, (void*)&call->beta.c, C, ldc); break; case (RsBlas_csyr2k): initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc); cblas_csyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.c, A, lda, B, ldb, (void*)&call->beta.c, C, ldc); break; case (RsBlas_ctrmm): initABC(ain, sizeof(float)*2, &A, &B, nullptr, &lda, &ldb, nullptr); cblas_ctrmm(CblasRowMajor, Side, 
Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.c, A, lda, B, ldb); break; case (RsBlas_ctrsm): initABC(ain, sizeof(float)*2, &A, &B, nullptr, &lda, &ldb, nullptr); cblas_ctrsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.c, A, lda, B, ldb); break; case (RsBlas_zgemm): setupGEMM(&mtls, ain, call, mCtx); if (mtls.isThreadable) { mCtx->launchThreads(walk_2d_zgemm, &mtls); } else { initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc); cblas_zgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, (void*)&call->alpha.z, A, lda, B, ldb, (void*)&call->beta.z, C, ldc); } break; case (RsBlas_zsymm): initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc); cblas_zsymm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.z, A, lda, B, ldb, (void*)&call->beta.z, C, ldc); break; case (RsBlas_zsyrk): initABC(ain, sizeof(double)*2, &A, nullptr, &C, &lda, nullptr, &ldc); cblas_zsyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.z, A, lda, (void*)&call->beta.z, C, ldc); break; case (RsBlas_zsyr2k): initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc); cblas_zsyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.z, A, lda, B, ldb, (void*)&call->beta.z, C, ldc); break; case (RsBlas_ztrmm): initABC(ain, sizeof(double)*2, &A, &B, nullptr, &lda, &ldb, nullptr); cblas_ztrmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.z, A, lda, B, ldb); break; case (RsBlas_ztrsm): initABC(ain, sizeof(double)*2, &A, &B, nullptr, &lda, &ldb, nullptr); cblas_ztrsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.z, A, lda, B, ldb); break; // Level 3 C and Z only case (RsBlas_chemm): initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc); cblas_chemm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.c, A, lda, B, ldb, (void*)&call->beta.c, C, ldc); break; case (RsBlas_cherk): initABC(ain, 
sizeof(float)*2, &A, nullptr, &C, &lda, nullptr, &ldc); cblas_cherk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.f, A, lda, call->beta.f, C, ldc); break; case (RsBlas_cher2k): initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc); cblas_cher2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.c, A, lda, B, ldb, call->beta.f, C, ldc); break; case (RsBlas_zhemm): initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc); cblas_zhemm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.z, A, lda, B, ldb, (void*)&call->beta.z, C, ldc); break; case (RsBlas_zherk): initABC(ain, sizeof(double)*2, &A, nullptr, &C, &lda, nullptr, &ldc); cblas_zherk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.d, A, lda, call->beta.d, C, ldc); break; case (RsBlas_zher2k): initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc); cblas_zher2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.z, A, lda, B, ldb, call->beta.d, C, ldc); break; case (RsBlas_bnnm): initABC(ain, sizeof(uint8_t), &A, &B, &C, &lda, &ldb, &ldc); kernelBNNM(call->M, call->N, call->K, (const uint8_t*)A, call->a_offset, lda, (const uint8_t*)B, call->b_offset, ldb, (uint8_t*)C, call->c_offset, ldc, call->c_mult_int); break; default: ALOGE("unimplemented\n"); } } void RsdCpuScriptIntrinsicBLAS::kernelBNNM(size_t m, size_t n, size_t k, const uint8_t* a, uint8_t a_offset, size_t lda, const uint8_t* b, uint8_t b_offset, size_t ldb, uint8_t* c, int32_t c_offset, size_t ldc, int32_t c_mult_int) { const int c_shift = 21; #if defined(ARCH_ARM_HAVE_VFP) || defined(ARCH_ARM_USE_INTRINSICS) // Non-optimized path for ARMv7 devices without SIMD instructions. if (!gArchUseSIMD) { /* * Calculations are done in 1.10.21 fixed-point format for the final output, * just before there's a shift down to drop the fractional parts. 
The output * values are gated to 0 to 255 to fit in a byte, but the 10-bit format * gives some headroom to avoid wrapping around on small overflows. */ size_t i = 0, j = 0, l = 0; for (j = 0; j < n; j++) { for (i = 0; i < m; i++) { int32_t total = 0; for (l = 0; l < k; l++) { const int a_index = ((i * lda) + l); const uint8_t a_as_byte = a[a_index]; const int32_t a_as_int = (((int32_t)(a_as_byte)) - a_offset); const int b_index = ((j * ldb) + l); const uint8_t b_as_byte = b[b_index]; const int32_t b_as_int = (((int32_t)(b_as_byte)) - b_offset); const int32_t mult_as_int = (a_as_int * b_as_int); total += mult_as_int; } const int c_index = ((ldc * i) + j); int32_t output = ((((total + c_offset) * c_mult_int) + (1 << (c_shift - 1))) >> c_shift); if (output > 255) { output = 255; } if (output < 0) { output = 0; } c[c_index] = (uint8_t)(output); } } return; } #endif // Using gemmlowp to calculate the low precision 8 bit GEMM. // Set MaxNumThreads to 0. The value 0 lets the implementation query // the system to determine the number of hardware threads gemmlowp::eight_bit_int_gemm::SetMaxNumThreads(0); bool transpose_a = true; bool transpose_b = false; bool transpose_c = true; gemmlowp::eight_bit_int_gemm::EightBitIntGemm(transpose_a, transpose_b, transpose_c, m, n, k, a, -a_offset, lda, b, -b_offset, ldb, c, c_offset, c_mult_int, c_shift, ldc, gemmlowp::eight_bit_int_gemm::BitDepthSetting::A8B8); } RsdCpuScriptIntrinsicBLAS::RsdCpuScriptIntrinsicBLAS(RsdCpuReferenceImpl *ctx, const Script *s) : RsdCpuScriptIntrinsic(ctx, s, nullptr, RS_SCRIPT_INTRINSIC_ID_BLAS) { } RsdCpuScriptIntrinsicBLAS::~RsdCpuScriptIntrinsicBLAS() { } RsdCpuScriptImpl * rsdIntrinsic_BLAS(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) { return new RsdCpuScriptIntrinsicBLAS(ctx, s); } } // namespace renderscript } // namespace android