__cxa_finalize LIBC libc.so libRSDriverArm.so __cxa_atexit __register_atfork _Z19isAllocationCpuOnlyPKN7android12renderscript10AllocationE _Z19rsdGpuIntrinsic_LUTPKN7android12renderscript7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _Z20rsdGpuIntrinsic_BlurPKN7android12renderscript7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _Z21rsdGpuIntrinsic_3DLUTPKN7android12renderscript7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _Z25rsdGpuIntrinsic_HistogramPKN7android12renderscript7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID dladdr libdl.so _Z27rsdGpuIntrinsic_ColorMatrixPKN7android12renderscript7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID dl_iterate_phdr _Z27rsdGpuIntrinsic_Convolve3x3PKN7android12renderscript7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _Z29rsdClAllocationReleaseHostPtrPKN7android12renderscript7ContextEPKNS0_10AllocationE _ZN7android12renderscript21rsdGpuScriptIntrinsic11getNumSlotsEv _ZN7android12renderscript21rsdGpuScriptIntrinsic12launchKernelEP10_cl_kernel _ZN7android12renderscript21rsdGpuScriptIntrinsic12setGlobalObjEjPNS0_10ObjectBaseE _ZN7android12renderscript21rsdGpuScriptIntrinsic12setGlobalVarEjPKvj _ZN7android12renderscript21rsdGpuScriptIntrinsic13prepareLaunchEPK12RsScriptCallPPKNS0_10AllocationEj _ZN7android12renderscript21rsdGpuScriptIntrinsic13scaleWorkSizeEi _ZN7android12renderscript21rsdGpuScriptIntrinsic13setGlobalBindEjPNS0_10AllocationE _ZN7android12renderscript21rsdGpuScriptIntrinsic15createIntrinsicEPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZN7android12renderscript21rsdGpuScriptIntrinsic24setGlobalVarWithElemDimsEjPKvjPKNS0_7ElementEPKjj _ZN7android12renderscript21rsdGpuScriptIntrinsic7gpuRootEPKNS0_7ContextEPNS0_6ScriptEjPKNS0_10AllocationEPS7_PKvjPK12RsScriptCall _ZN7android12renderscript21rsdGpuScriptIntrinsic7initGpuEP9RsdHalRecPNS0_6ScriptE _ZN7android12renderscript21rsdGpuScriptIntrinsic8getClMemEPKNS0_10AllocationE _ZN7android12renderscript21rsdGpuScriptIntrinsicC1EPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZN7android12renderscript21rsdGpuScriptIntrinsicC2EPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZN7android12renderscript27rsdGpuScriptIntrinsicBuffer3mapEv _ZN7android12renderscript27rsdGpuScriptIntrinsicBuffer5unmapEv _ZN7android12renderscript27rsdGpuScriptIntrinsicBufferC1EP8RsdCLRecjyPv _ZN7android12renderscript27rsdGpuScriptIntrinsicBufferC2EP8RsdCLRecjyPv _ZN7android12renderscript27rsdGpuScriptIntrinsicBufferD0Ev _ZN7android12renderscript27rsdGpuScriptIntrinsicBufferD1Ev _ZN7android12renderscript27rsdGpuScriptIntrinsicBufferD2Ev _ZNK7android12renderscript7Context8setErrorE7RsErrorPKc _ZTVN7android12renderscript21rsdGpuScriptIntrinsicE _ZTVN7android12renderscript27rsdGpuScriptIntrinsicBufferE _ZdlPv __android_log_print __stack_chk_fail __stack_chk_guard _Z26rsdClSetAllocationArgumentPKN7android12renderscript7ContextEPKNS0_6ScriptEP13maliKernelRecjPKNS0_10AllocationE _ZN7android12renderscript26rsdGpuScriptIntrinsic3DLUT11getNumSlotsEv _ZN7android12renderscript26rsdGpuScriptIntrinsic3DLUT12gpuRunHelperEP13maliKernelRecPKNS0_10AllocationEPS4_ _ZN7android12renderscript26rsdGpuScriptIntrinsic3DLUT12setGlobalObjEjPNS0_10ObjectBaseE _ZN7android12renderscript26rsdGpuScriptIntrinsic3DLUT7gpuRootEPKNS0_7ContextEPNS0_6ScriptEjPKNS0_10AllocationEPS7_PKvjPK12RsScriptCall _ZN7android12renderscript26rsdGpuScriptIntrinsic3DLUT7initGpuEP9RsdHalRecPNS0_6ScriptE _ZN7android12renderscript26rsdGpuScriptIntrinsic3DLUTC1EPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZN7android12renderscript26rsdGpuScriptIntrinsic3DLUTC2EPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZNK7android12renderscript10ObjectBase9decSysRefEv _ZNK7android12renderscript10ObjectBase9incSysRefEv _ZTVN7android12renderscript26rsdGpuScriptIntrinsic3DLUTE _Znwj calloc src_kernels_lut3d _Z21rsdGpuIntrinsic_BlendPKN7android12renderscript7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZN7android12renderscript26rsdGpuScriptIntrinsicBlend11getNumSlotsEv __vsnprintf_chk _ZN7android12renderscript26rsdGpuScriptIntrinsicBlend12gpuRunHelperEiPKNS0_10AllocationEPS2_PK12RsScriptCalli _ZN7android12renderscript26rsdGpuScriptIntrinsicBlend7gpuRootEPKNS0_7ContextEPNS0_6ScriptEjPKNS0_10AllocationEPS7_PKvjPK12RsScriptCall _ZN7android12renderscript26rsdGpuScriptIntrinsicBlend7initGpuEP9RsdHalRecPNS0_6ScriptE _ZN7android12renderscript26rsdGpuScriptIntrinsicBlendC1EPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZN7android12renderscript26rsdGpuScriptIntrinsicBlendC2EPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZTVN7android12renderscript26rsdGpuScriptIntrinsicBlendE src_kernels_blend _ZN7android12renderscript25rsdGpuScriptIntrinsicBlur11getNumSlotsEv _ZN7android12renderscript25rsdGpuScriptIntrinsicBlur11gpuFirstRunEP13maliKernelRecPKNS0_10AllocationE _ZN7android12renderscript25rsdGpuScriptIntrinsicBlur12gpuSecondRunEP13maliKernelRecPKNS0_10AllocationE _ZN7android12renderscript25rsdGpuScriptIntrinsicBlur12setGlobalObjEjPNS0_10ObjectBaseE _ZN7android12renderscript25rsdGpuScriptIntrinsicBlur12setGlobalVarEjPKvj _ZN7android12renderscript25rsdGpuScriptIntrinsicBlur22ComputeGaussianWeightsEv _ZN7android12renderscript25rsdGpuScriptIntrinsicBlur7gpuRootEPKNS0_7ContextEPNS0_6ScriptEjPKNS0_10AllocationEPS7_PKvjPK12RsScriptCall _ZN7android12renderscript25rsdGpuScriptIntrinsicBlur7initGpuEP9RsdHalRecPNS0_6ScriptE _ZN7android12renderscript25rsdGpuScriptIntrinsicBlurC1EPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZN7android12renderscript25rsdGpuScriptIntrinsicBlurC2EPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZTVN7android12renderscript25rsdGpuScriptIntrinsicBlurE __aeabi_memclr8 LIBC_N __aeabi_memcpy powf libm.so src_kernels_blur _ZN7android12renderscript32rsdGpuScriptIntrinsicColorMatrix11getNumSlotsEv _ZN7android12renderscript32rsdGpuScriptIntrinsicColorMatrix12gpuRunHelperEP13maliKernelRecPKNS0_10AllocationEPS4_i _ZN7android12renderscript32rsdGpuScriptIntrinsicColorMatrix12setGlobalVarEjPKvj _ZN7android12renderscript32rsdGpuScriptIntrinsicColorMatrix14float2half_rtzEf _ZN7android12renderscript32rsdGpuScriptIntrinsicColorMatrix18gpuRunHelperSimpleEP13maliKernelRecPKNS0_10AllocationEPS4_i _ZN7android12renderscript32rsdGpuScriptIntrinsicColorMatrix22gpuRunHelperGreyU84_U8EP13maliKernelRecPKNS0_10AllocationEPS4_i _ZN7android12renderscript32rsdGpuScriptIntrinsicColorMatrix7gpuRootEPKNS0_7ContextEPNS0_6ScriptEjPKNS0_10AllocationEPS7_PKvjPK12RsScriptCall _ZN7android12renderscript32rsdGpuScriptIntrinsicColorMatrix7initGpuEP9RsdHalRecPNS0_6ScriptE _ZN7android12renderscript32rsdGpuScriptIntrinsicColorMatrix8checkVarEv _ZN7android12renderscript32rsdGpuScriptIntrinsicColorMatrixC1EPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZN7android12renderscript32rsdGpuScriptIntrinsicColorMatrixC2EPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZTVN7android12renderscript32rsdGpuScriptIntrinsicColorMatrixE src_kernels_colormatrix _ZN7android12renderscript32rsdGpuScriptIntrinsicConvolve3x311findPatternEv _ZN7android12renderscript32rsdGpuScriptIntrinsicConvolve3x311getNumSlotsEv _ZN7android12renderscript32rsdGpuScriptIntrinsicConvolve3x312gpuRunHelperENS1_11c3x3_kernelEPNS0_10AllocationE _ZN7android12renderscript32rsdGpuScriptIntrinsicConvolve3x312setGlobalObjEjPNS0_10ObjectBaseE _ZN7android12renderscript32rsdGpuScriptIntrinsicConvolve3x312setGlobalVarEjPKvj _ZN7android12renderscript32rsdGpuScriptIntrinsicConvolve3x37gpuRootEPKNS0_7ContextEPNS0_6ScriptEjPKNS0_10AllocationEPS7_PKvjPK12RsScriptCall _ZN7android12renderscript32rsdGpuScriptIntrinsicConvolve3x37initGpuEP9RsdHalRecPNS0_6ScriptE fprintf _ZN7android12renderscript32rsdGpuScriptIntrinsicConvolve3x3C1EPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID fflush _ZN7android12renderscript32rsdGpuScriptIntrinsicConvolve3x3C2EPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZTVN7android12renderscript32rsdGpuScriptIntrinsicConvolve3x3E src_kernels_convolve3x3 _Z27rsdGpuIntrinsic_Convolve5x5PKN7android12renderscript7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZN7android12renderscript32rsdGpuScriptIntrinsicConvolve5x511getNumSlotsEv _ZN7android12renderscript32rsdGpuScriptIntrinsicConvolve5x512gpuRunHelperEiPNS0_10AllocationEPK12RsScriptCalli _ZN7android12renderscript32rsdGpuScriptIntrinsicConvolve5x512setGlobalObjEjPNS0_10ObjectBaseE _ZN7android12renderscript32rsdGpuScriptIntrinsicConvolve5x512setGlobalVarEjPKvj __sF _ZN7android12renderscript32rsdGpuScriptIntrinsicConvolve5x57gpuRootEPKNS0_7ContextEPNS0_6ScriptEjPKNS0_10AllocationEPS7_PKvjPK12RsScriptCall _ZN7android12renderscript32rsdGpuScriptIntrinsicConvolve5x57initGpuEP9RsdHalRecPNS0_6ScriptE _ZN7android12renderscript32rsdGpuScriptIntrinsicConvolve5x5C1EPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZN7android12renderscript32rsdGpuScriptIntrinsicConvolve5x5C2EPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZTVN7android12renderscript32rsdGpuScriptIntrinsicConvolve5x5E src_kernels_convolve5x5 _Z29rsdClAllocationAcquireHostPtrPKN7android12renderscript7ContextEPKNS0_10AllocationEb _ZN7android12renderscript30rsdGpuScriptIntrinsicHistogram11getNumSlotsEv _ZN7android12renderscript30rsdGpuScriptIntrinsicHistogram12setGlobalObjEjPNS0_10ObjectBaseE _ZN7android12renderscript30rsdGpuScriptIntrinsicHistogram12setGlobalVarEjPKvj _ZN7android12renderscript30rsdGpuScriptIntrinsicHistogram17gpuRunHelper_fullEPKNS0_10AllocationE _ZN7android12renderscript30rsdGpuScriptIntrinsicHistogram22gpuRunHelper_remainderEPKNS0_10AllocationE _ZN7android12renderscript30rsdGpuScriptIntrinsicHistogram7gpuRootEPKNS0_7ContextEPNS0_6ScriptEjPKNS0_10AllocationEPS7_PKvjPK12RsScriptCall _ZN7android12renderscript30rsdGpuScriptIntrinsicHistogram7initGpuEP9RsdHalRecPNS0_6ScriptE _ZN7android12renderscript30rsdGpuScriptIntrinsicHistogramC1EPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZN7android12renderscript30rsdGpuScriptIntrinsicHistogramC2EPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZN7android12renderscript30rsdGpuScriptIntrinsicHistogramD0Ev _ZN7android12renderscript30rsdGpuScriptIntrinsicHistogramD1Ev _ZN7android12renderscript30rsdGpuScriptIntrinsicHistogramD2Ev _ZTVN7android12renderscript30rsdGpuScriptIntrinsicHistogramE __aeabi_memclr src_kernels_histogram _ZN7android12renderscript24rsdGpuScriptIntrinsicLUT11getNumSlotsEv _ZN7android12renderscript24rsdGpuScriptIntrinsicLUT12setGlobalObjEjPNS0_10ObjectBaseE _ZN7android12renderscript24rsdGpuScriptIntrinsicLUT7gpuRootEPKNS0_7ContextEPNS0_6ScriptEjPKNS0_10AllocationEPS7_PKvjPK12RsScriptCall _ZN7android12renderscript24rsdGpuScriptIntrinsicLUT7initGpuEP9RsdHalRecPNS0_6ScriptE _ZN7android12renderscript24rsdGpuScriptIntrinsicLUTC1EPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZN7android12renderscript24rsdGpuScriptIntrinsicLUTC2EPKNS0_7ContextEPNS0_6ScriptEPKNS0_7ElementE19RsScriptIntrinsicID _ZTVN7android12renderscript24rsdGpuScriptIntrinsicLUTE src_kernels_lut _Z13rsdCLShutdownPKN7android12renderscript7ContextE _Z9rsdCLInitPKN7android12renderscript7ContextE _ZN7android12renderscript12property_getEPKcPcS2_ dlerror dlopen dlsym free malloc pthread_mutex_destroy pthread_mutex_init strcmp strncmp _Z17rsdAllocationInitPKN7android12renderscript7ContextEPNS0_10AllocationEb _Z19rsdAllocationData1DPKN7android12renderscript7ContextEPKNS0_10AllocationEjjjPKvj _Z19rsdAllocationData2DPKN7android12renderscript7ContextEPKNS0_10AllocationEjjj23RsAllocationCubemapFacejjPKvjj _Z19rsdAllocationData3DPKN7android12renderscript7ContextEPKNS0_10AllocationEjjjjjjjPKvjj _Z19rsdAllocationLock1DPKN7android12renderscript7ContextEPKNS0_10AllocationE _Z19rsdAllocationRead1DPKN7android12renderscript7ContextEPKNS0_10AllocationEjjjPvj _Z19rsdAllocationRead2DPKN7android12renderscript7ContextEPKNS0_10AllocationEjjj23RsAllocationCubemapFacejjPvjj _Z19rsdAllocationRead3DPKN7android12renderscript7ContextEPKNS0_10AllocationEjjjjjjjPvjj _Z19rsdAllocationResizePKN7android12renderscript7ContextEPKNS0_10AllocationEPKNS0_4TypeEb _Z19rsdClAllocationInitPKN7android12renderscript7ContextEPNS0_10AllocationEb _Z20rsdAllocationDestroyPKN7android12renderscript7ContextEPNS0_10AllocationE _Z20rsdAllocationSyncAllPKN7android12renderscript7ContextEPKNS0_10AllocationE21RsAllocationUsageType _Z21rsdAllocationUnlock1DPKN7android12renderscript7ContextEPKNS0_10AllocationE _Z21rsdClAllocationData1DPKN7android12renderscript7ContextEPKNS0_10AllocationEjjjPKvj _Z21rsdClAllocationData2DPKN7android12renderscript7ContextEPKNS0_10AllocationEjjj23RsAllocationCubemapFacejjPKvjj _Z21rsdClAllocationData3DPKN7android12renderscript7ContextEPKNS0_10AllocationEjjjjjjjPKvjj _Z21rsdClAllocationLock1DPKN7android12renderscript7ContextEPKNS0_10AllocationE _Z21rsdClAllocationMapAllPKN7android12renderscript7ContextE _Z21rsdClAllocationRead1DPKN7android12renderscript7ContextEPKNS0_10AllocationEjjjPvj _Z21rsdClAllocationRead2DPKN7android12renderscript7ContextEPKNS0_10AllocationEjjj23RsAllocationCubemapFacejjPvjj _Z21rsdClAllocationRead3DPKN7android12renderscript7ContextEPKNS0_10AllocationEjjjjjjjPvjj _Z21rsdClAllocationResizePKN7android12renderscript7ContextEPKNS0_10AllocationEPKNS0_4TypeEb _Z22rsdAllocationMarkDirtyPKN7android12renderscript7ContextEPKNS0_10AllocationE _Z22rsdClAllocationDestroyPKN7android12renderscript7ContextEPNS0_10AllocationE _Z22rsdClAllocationSyncAllPKN7android12renderscript7ContextEPKNS0_10AllocationE21RsAllocationUsageType _Z23rsdAllocationSetSurfacePKN7android12renderscript7ContextEPNS0_10AllocationEP13ANativeWindow _Z23rsdClAllocationShutdownPKN7android12renderscript7ContextE _Z23rsdClAllocationUnlock1DPKN7android12renderscript7ContextEPKNS0_10AllocationE _Z24rsdAllocationAdapterInitPKN7android12renderscript7ContextEPNS0_10AllocationE _Z24rsdAllocationElementDataPKN7android12renderscript7ContextEPKNS0_10AllocationEjjjPKvjj _Z24rsdAllocationElementReadPKN7android12renderscript7ContextEPKNS0_10AllocationEjjjPvjj _Z24rsdClAllocationMarkDirtyPKN7android12renderscript7ContextEPKNS0_10AllocationE _Z25rsdAllocationData1D_allocPKN7android12renderscript7ContextEPKNS0_10AllocationEjjjS6_jj _Z25rsdAllocationData2D_allocPKN7android12renderscript7ContextEPKNS0_10AllocationEjjj23RsAllocationCubemapFacejjS6_jjjS7_ _Z25rsdAllocationData3D_allocPKN7android12renderscript7ContextEPKNS0_10AllocationEjjjjjjjS6_jjjj _Z25rsdClAllocationSetSurfacePKN7android12renderscript7ContextEPNS0_10AllocationEP13ANativeWindow _Z26rsdClAllocationAdapterInitPKN7android12renderscript7ContextEPNS0_10AllocationE _Z26rsdClAllocationElementDataPKN7android12renderscript7ContextEPKNS0_10AllocationEjjjPKvjj _Z26rsdClAllocationElementReadPKN7android12renderscript7ContextEPKNS0_10AllocationEjjjPvjj _Z27rsdClAllocationData1D_allocPKN7android12renderscript7ContextEPKNS0_10AllocationEjjjS6_jj _Z27rsdClAllocationData2D_allocPKN7android12renderscript7ContextEPKNS0_10AllocationEjjj23RsAllocationCubemapFacejjS6_jjjS7_ _Z27rsdClAllocationData3D_allocPKN7android12renderscript7ContextEPKNS0_10AllocationEjjjjjjjS6_jjjj _Z27rsdClAllocationSwitchToHostPKN7android12renderscript7ContextEPKNS0_10AllocationE _Z28rsdAllocationGenerateMipmapsPKN7android12renderscript7ContextEPKNS0_10AllocationE _Z29rsdClAllocationSwitchToDevicePKN7android12renderscript7ContextEPKNS0_10AllocationE _Z30rsdClAllocationGenerateMipmapsPKN7android12renderscript7ContextEPKNS0_10AllocationE _ZNK7android12renderscript7Element11getSizeBitsEv _ZNSt3__16vectorIPKN7android12renderscript10AllocationENS_9allocatorIS5_EEE21__push_back_slow_pathIS5_EEvOT_ _ZNSt3__16vectorIPKN7android12renderscript10AllocationENS_9allocatorIS5_EEE26__swap_out_circular_bufferERNS_14__split_bufferIS5_RS7_EE __aeabi_memmove abort pthread_mutex_lock pthread_mutex_unlock _Z13rsdScriptInitPKN7android12renderscript7ContextEPNS0_7ScriptCEPKcS7_PKhjj _Z15rsdClScriptInitPKN7android12renderscript7ContextEPNS0_7ScriptCEPKcS7_PKhjj _Z16rsdInitIntrinsicPKN7android12renderscript7ContextEPNS0_6ScriptE19RsScriptIntrinsicIDPNS0_7ElementE _Z16rsdScriptDestroyPKN7android12renderscript7ContextEPNS0_6ScriptE _Z18rsdClInitIntrinsicPKN7android12renderscript7ContextEPNS0_6ScriptE19RsScriptIntrinsicIDPNS0_7ElementE _Z18rsdClScriptDestroyPKN7android12renderscript7ContextEPNS0_6ScriptE _Z19rsdClSetPtrArgumentPKN7android12renderscript7ContextEPKNS0_6ScriptEP13maliKernelRecjPv _Z19rsdClSetPtrArgumentPKN7android12renderscript7ContextEPKNS0_6ScriptEP13maliKernelRecjPvj _Z19rsdScriptInvokeInitPKN7android12renderscript7ContextEPNS0_6ScriptE _Z19rsdScriptInvokeRootPKN7android12renderscript7ContextEPNS0_6ScriptE _Z21rsdClScriptInvokeInitPKN7android12renderscript7ContextEPNS0_6ScriptE _Z21rsdClScriptInvokeRootPKN7android12renderscript7ContextEPNS0_6ScriptE _Z21rsdScriptGroupExecutePKN7android12renderscript7ContextEPKNS0_15ScriptGroupBaseE _Z21rsdScriptSetGlobalObjPKN7android12renderscript7ContextEPKNS0_6ScriptEjPNS0_10ObjectBaseE _Z21rsdScriptSetGlobalVarPKN7android12renderscript7ContextEPKNS0_6ScriptEjPvj _Z22rsdClSetAllocationAddrPKN7android12renderscript7ContextEPKNS0_10AllocationEPj _Z22rsdScriptInvokeForEachPKN7android12renderscript7ContextEPNS0_6ScriptEjPKNS0_10AllocationEPS6_PKvjPK12RsScriptCall _Z23rsdClScriptGroupExecutePKN7android12renderscript7ContextEPKNS0_15ScriptGroupBaseE _Z23rsdClScriptSetGlobalObjPKN7android12renderscript7ContextEPKNS0_6ScriptEjPNS0_10ObjectBaseE _Z23rsdClScriptSetGlobalVarPKN7android12renderscript7ContextEPKNS0_6ScriptEjPvj _Z23rsdScriptInvokeFunctionPKN7android12renderscript7ContextEPNS0_6ScriptEjPKvj _Z24rsdClRegisterTranslationPKN7android12renderscript7ContextEPKNS0_6ScriptEPv _Z24rsdClScriptInvokeForEachPKN7android12renderscript7ContextEPNS0_6ScriptEjPKNS0_10AllocationEPS6_PKvjPK12RsScriptCall _Z25rsdClScriptInvokeFunctionPKN7android12renderscript7ContextEPNS0_6ScriptEjPKvj _Z26rsdClDeclareUsedAllocationPKN7android12renderscript7ContextEPKNS0_6ScriptEP13maliKernelRecPNS0_10AllocationE _Z27rsdScriptInvokeForEachMultiPKN7android12renderscript7ContextEPNS0_6ScriptEjPPKNS0_10AllocationEjPS6_PKvjPK12RsScriptCall _Z29rsdClScriptInvokeForEachMultiPKN7android12renderscript7ContextEPNS0_6ScriptEjPPKNS0_10AllocationEjPS6_PKvjPK12RsScriptCall _ZN7android12renderscript15RsdCpuReference15getThreadTLSKeyEv _ZNSt3__112basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEE21__grow_by_and_replaceEjjjjjjPKc _ZNSt3__112basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEE6appendEPKcj _ZNSt3__113__vector_baseIPN7android12renderscript10AllocationENS_9allocatorIS4_EEED2Ev _ZNSt3__16vectorIPN7android12renderscript10AllocationENS_9allocatorIS4_EEE21__push_back_slow_pathIRKS4_EEvOT_ _ZNSt3__16vectorIPN7android12renderscript10AllocationENS_9allocatorIS4_EEE26__swap_out_circular_bufferERNS_14__split_bufferIS4_RS6_EE _ZNSt3__1plIcNS_11char_traitsIcEENS_9allocatorIcEEEENS_12basic_stringIT_T0_T1_EERKS9_PKS6_ fclose fopen fwrite pthread_getspecific strlen _Z11rsdShutdownPN7android12renderscript7ContextE _Z11rsdTypeInitPKN7android12renderscript7ContextEPKNS0_4TypeE _Z12rsdClHalInitPvjj _Z13rsdClHalAbortPv _Z14rsdElementInitPKN7android12renderscript7ContextEPKNS0_7ElementE _Z14rsdSamplerInitPKN7android12renderscript7ContextEPKNS0_7SamplerE _Z14rsdSetPriorityPKN7android12renderscript7ContextEi _Z14rsdTypeDestroyPKN7android12renderscript7ContextEPKNS0_4TypeE _Z16rsdClHalQueryHalN7android12renderscript14RsHalInitEnumsEPPv _Z17rsdElementDestroyPKN7android12renderscript7ContextEPKNS0_7ElementE _Z17rsdFreeRuntimeMemPv _Z17rsdSamplerDestroyPKN7android12renderscript7ContextEPKNS0_7SamplerE _Z18rsdAllocRuntimeMemjj _Z18rsdScriptGroupInitPKN7android12renderscript7ContextEPNS0_15ScriptGroupBaseE _Z19rsdAllocationIoSendPKN7android12renderscript7ContextEPNS0_10AllocationE _Z21rsdScriptGetGlobalVarPKN7android12renderscript7ContextEPKNS0_6ScriptEjPvj _Z21rsdScriptGroupDestroyPKN7android12renderscript7ContextEPKNS0_15ScriptGroupBaseE _Z21rsdScriptInvokeReducePKN7android12renderscript7ContextEPNS0_6ScriptEjPPKNS0_10AllocationEjPS6_PK12RsScriptCall _Z22rsdAllocationIoReceivePKN7android12renderscript7ContextEPNS0_10AllocationE _Z22rsdScriptGroupSetInputPKN7android12renderscript7ContextEPKNS0_11ScriptGroupEPKNS0_14ScriptKernelIDEPNS0_10AllocationE _Z22rsdScriptSetGlobalBindPKN7android12renderscript7ContextEPKNS0_6ScriptEjPNS0_10AllocationE _Z23rsdScriptGroupSetOutputPKN7android12renderscript7ContextEPKNS0_11ScriptGroupEPKNS0_14ScriptKernelIDEPNS0_10AllocationE _Z24rsdAllocationGrallocBitsPKN7android12renderscript7ContextEPNS0_10AllocationE _Z25rsdTypeUpdateCachedObjectPKN7android12renderscript7ContextEPKNS0_4TypeEPNS0_7rs_typeE _Z26rsdAllocationAdapterOffsetPKN7android12renderscript7ContextEPKNS0_10AllocationE _Z27rsdScriptInvokeFreeChildrenPKN7android12renderscript7ContextEPNS0_6ScriptE _Z27rsdScriptUpdateCachedObjectPKN7android12renderscript7ContextEPKNS0_6ScriptEPNS0_9rs_scriptE _Z28rsdElementUpdateCachedObjectPKN7android12renderscript7ContextEPKNS0_7ElementEPNS0_10rs_elementE _Z28rsdSamplerUpdateCachedObjectPKN7android12renderscript7ContextEPKNS0_7SamplerEPNS0_10rs_samplerE _Z31rsdAllocationUpdateCachedObjectPKN7android12renderscript7ContextEPKNS0_10AllocationEPNS0_13rs_allocationE _Z33rsdScriptSetGlobalVarWithElemDimsPKN7android12renderscript7ContextEPKNS0_6ScriptEjPvjPKNS0_7ElementEPKjj _ZN6bcinfo17MetadataExtractor7extractEv libbcinfo libbcinfo.so _ZN6bcinfo17MetadataExtractorC1EPKcj _ZN6bcinfo17MetadataExtractorD1Ev _ZN7android12renderscript15RsdCpuReference6createEPNS0_7ContextEjjPFPKNS1_9CpuSymbolES3_PKcEPFPNS1_9CpuScriptES3_PKNS0_6ScriptEEPFS8_S8_jES8_ _ZN7android12renderscript16RsdCpuScriptImpl12BCC_EXE_PATHE atoi _ZN7android12renderscript15RsdCpuReference12getTlsScriptEv _ZN7android12renderscript15RsdCpuReference13getTlsContextEv rsdDeclareUsedAllocation rsdRegisterTranslated rsdSetAllocationAddr rsdSetAllocationArg rsdSetPtrArg AHardwareBuffer_lock AHardwareBuffer_unlock ANativeWindowBuffer_getHardwareBuffer ANativeWindow_cancelBuffer ANativeWindow_dequeueBuffer ANativeWindow_queueBuffer ANativeWindow_release ANativeWindow_setBuffersGeometry _Z12GetOffsetPtrPKN7android12renderscript10AllocationEjjjj23RsAllocationCubemapFace _Z15rsdTypeToGLType10RsDataType _Z17rsdKindToGLFormat10RsDataKind _Z24rsdAllocationInitStridedPKN7android12renderscript7ContextEPNS0_10AllocationEbj _Z32rsdAllocationData2D_alloc_scriptPKN7android12renderscript7ContextEPKNS0_10AllocationEjjj23RsAllocationCubemapFacejjS6_jjjS7_ _Z32rsdAllocationData3D_alloc_scriptPKN7android12renderscript7ContextEPKNS0_10AllocationEjjjjjjjS6_jjjj _ZN7android12renderscript7rsRoundIjEET_S2_j _ZNK7android12renderscript10Allocation7decRefsEPKvjj _ZNK7android12renderscript10Allocation7incRefsEPKvjj _ZNK7android12renderscript7Element7decRefsEPKv _ZNK7android12renderscript7Element7incRefsEPKv close memalign realloc _Z32rsdScriptGetAllocationForPointerPKN7android12renderscript7ContextEPKNS0_6ScriptEPKv rsdHalAbort rsdHalInit rsdHalQueryHal rsdHalQueryVersion _Z10rsIsObject10rs_element _Z10rsIsObject10rs_sampler _Z10rsIsObject13rs_allocation _Z10rsIsObject16rs_program_store _Z10rsIsObject17rs_program_raster _Z10rsIsObject17rs_program_vertex _Z10rsIsObject19rs_program_fragment _Z10rsIsObject7rs_font _Z10rsIsObject7rs_mesh _Z10rsIsObject7rs_type _Z10rsIsObject9rs_script _Z11rsLocaltimeP5rs_tmPKi _Z11rsSetObjectP10rs_elementS_ _Z11rsSetObjectP10rs_samplerS_ _Z11rsSetObjectP13rs_allocationS_ _Z11rsSetObjectP16rs_program_storeS_ _Z11rsSetObjectP17rs_program_rasterS_ _Z11rsSetObjectP17rs_program_vertexS_ _Z11rsSetObjectP19rs_program_fragmentS_ _Z11rsSetObjectP7rs_fontS_ _Z11rsSetObjectP7rs_meshS_ _Z11rsSetObjectP7rs_typeS_ _Z11rsSetObjectP9rs_scriptS_ _Z12rsCreateType10rs_elementjjjbb13rs_yuv_format _Z13rsClearObjectP10rs_element _Z13rsClearObjectP10rs_sampler _Z13rsClearObjectP13rs_allocation _Z13rsClearObjectP16rs_program_store _Z13rsClearObjectP17rs_program_raster _Z13rsClearObjectP17rs_program_vertex _Z13rsClearObjectP19rs_program_fragment _Z13rsClearObjectP7rs_font _Z13rsClearObjectP7rs_mesh _Z13rsClearObjectP7rs_type _Z13rsClearObjectP9rs_script _Z13rsUptimeNanosv _Z14rsGetElementAt13rs_allocationj _Z14rsGetElementAt13rs_allocationjj _Z14rsGetElementAt13rs_allocationjjj _Z14rsSendToClienti _Z14rsSendToClientiPKvj _Z14rsSetElementAt13rs_allocationPKvj _Z14rsSetElementAt13rs_allocationPKvjj _Z14rsSetElementAt13rs_allocationPKvjjj _Z14rsUptimeMillisv _Z15rsCreateElementiibj _Z15rsGetAllocationPKv _Z17rsForEachInternaliP14rs_script_calliiP13rs_allocation _Z18rsAllocationIoSend13rs_allocation _Z18rsCreateAllocation7rs_type28rs_allocation_mipmap_controljPv _Z18rsGetElementAt_int13rs_allocationPij _Z18rsGetElementAt_int13rs_allocationPijj _Z18rsGetElementAt_int13rs_allocationPijjj _Z18rsSetElementAt_int13rs_allocationPKij _Z18rsSetElementAt_int13rs_allocationPKijj _Z18rsSetElementAt_int13rs_allocationPKijjj _Z19rsGetElementAt_char13rs_allocationPcj _Z19rsGetElementAt_char13rs_allocationPcjj _Z19rsGetElementAt_char13rs_allocationPcjjj _Z19rsGetElementAt_half13rs_allocationPDhj _Z19rsGetElementAt_half13rs_allocationPDhjj _Z19rsGetElementAt_half13rs_allocationPDhjjj _Z19rsGetElementAt_int213rs_allocationPDv2_ij _Z19rsGetElementAt_int213rs_allocationPDv2_ijj _Z19rsGetElementAt_int213rs_allocationPDv2_ijjj _Z19rsGetElementAt_int313rs_allocationPDv3_ij _Z19rsGetElementAt_int313rs_allocationPDv3_ijj _Z19rsGetElementAt_int313rs_allocationPDv3_ijjj _Z19rsGetElementAt_int413rs_allocationPDv4_ij _Z19rsGetElementAt_int413rs_allocationPDv4_ijj _Z19rsGetElementAt_int413rs_allocationPDv4_ijjj _Z19rsGetElementAt_long13rs_allocationPlj _Z19rsGetElementAt_long13rs_allocationPljj _Z19rsGetElementAt_long13rs_allocationPljjj _Z19rsGetElementAt_long13rs_allocationPxj _Z19rsGetElementAt_long13rs_allocationPxjj _Z19rsGetElementAt_long13rs_allocationPxjjj _Z19rsGetElementAt_uint13rs_allocationPjj _Z19rsGetElementAt_uint13rs_allocationPjjj _Z19rsGetElementAt_uint13rs_allocationPjjjj _Z19rsSetElementAt_char13rs_allocationPKcj _Z19rsSetElementAt_char13rs_allocationPKcjj _Z19rsSetElementAt_char13rs_allocationPKcjjj _Z19rsSetElementAt_half13rs_allocationPKDhj _Z19rsSetElementAt_half13rs_allocationPKDhjj _Z19rsSetElementAt_half13rs_allocationPKDhjjj _Z19rsSetElementAt_int213rs_allocationPKDv2_ij _Z19rsSetElementAt_int213rs_allocationPKDv2_ijj _Z19rsSetElementAt_int213rs_allocationPKDv2_ijjj _Z19rsSetElementAt_int313rs_allocationPKDv3_ij _Z19rsSetElementAt_int313rs_allocationPKDv3_ijj _Z19rsSetElementAt_int313rs_allocationPKDv3_ijjj _Z19rsSetElementAt_int413rs_allocationPKDv4_ij _Z19rsSetElementAt_int413rs_allocationPKDv4_ijj _Z19rsSetElementAt_int413rs_allocationPKDv4_ijjj _Z19rsSetElementAt_long13rs_allocationPKlj _Z19rsSetElementAt_long13rs_allocationPKljj _Z19rsSetElementAt_long13rs_allocationPKljjj _Z19rsSetElementAt_long13rs_allocationPKxj _Z19rsSetElementAt_long13rs_allocationPKxjj _Z19rsSetElementAt_long13rs_allocationPKxjjj _Z19rsSetElementAt_uint13rs_allocationPKjj _Z19rsSetElementAt_uint13rs_allocationPKjjj _Z19rsSetElementAt_uint13rs_allocationPKjjjj _Z20rsGetElementAt_char213rs_allocationPDv2_cj _Z20rsGetElementAt_char213rs_allocationPDv2_cjj _Z20rsGetElementAt_char213rs_allocationPDv2_cjjj _Z20rsGetElementAt_char313rs_allocationPDv3_cj _Z20rsGetElementAt_char313rs_allocationPDv3_cjj _Z20rsGetElementAt_char313rs_allocationPDv3_cjjj _Z20rsGetElementAt_char413rs_allocationPDv4_cj _Z20rsGetElementAt_char413rs_allocationPDv4_cjj _Z20rsGetElementAt_char413rs_allocationPDv4_cjjj _Z20rsGetElementAt_float13rs_allocationPfj _Z20rsGetElementAt_float13rs_allocationPfjj _Z20rsGetElementAt_float13rs_allocationPfjjj _Z20rsGetElementAt_half213rs_allocationPDv2_Dhj _Z20rsGetElementAt_half213rs_allocationPDv2_Dhjj _Z20rsGetElementAt_half213rs_allocationPDv2_Dhjjj _Z20rsGetElementAt_half313rs_allocationPDv3_Dhj _Z20rsGetElementAt_half313rs_allocationPDv3_Dhjj _Z20rsGetElementAt_half313rs_allocationPDv3_Dhjjj _Z20rsGetElementAt_half413rs_allocationPDv4_Dhj _Z20rsGetElementAt_half413rs_allocationPDv4_Dhjj _Z20rsGetElementAt_half413rs_allocationPDv4_Dhjjj _Z20rsGetElementAt_long213rs_allocationPDv2_lj _Z20rsGetElementAt_long213rs_allocationPDv2_ljj _Z20rsGetElementAt_long213rs_allocationPDv2_ljjj _Z20rsGetElementAt_long213rs_allocationPDv2_xj _Z20rsGetElementAt_long213rs_allocationPDv2_xjj _Z20rsGetElementAt_long213rs_allocationPDv2_xjjj _Z20rsGetElementAt_long313rs_allocationPDv3_lj _Z20rsGetElementAt_long313rs_allocationPDv3_ljj _Z20rsGetElementAt_long313rs_allocationPDv3_ljjj _Z20rsGetElementAt_long313rs_allocationPDv3_xj _Z20rsGetElementAt_long313rs_allocationPDv3_xjj _Z20rsGetElementAt_long313rs_allocationPDv3_xjjj _Z20rsGetElementAt_long413rs_allocationPDv4_lj _Z20rsGetElementAt_long413rs_allocationPDv4_ljj _Z20rsGetElementAt_long413rs_allocationPDv4_ljjj _Z20rsGetElementAt_long413rs_allocationPDv4_xj _Z20rsGetElementAt_long413rs_allocationPDv4_xjj _Z20rsGetElementAt_long413rs_allocationPDv4_xjjj _Z20rsGetElementAt_short13rs_allocationPsj _Z20rsGetElementAt_short13rs_allocationPsjj _Z20rsGetElementAt_short13rs_allocationPsjjj _Z20rsGetElementAt_uchar13rs_allocationPhj _Z20rsGetElementAt_uchar13rs_allocationPhjj _Z20rsGetElementAt_uchar13rs_allocationPhjjj _Z20rsGetElementAt_uint213rs_allocationPDv2_jj _Z20rsGetElementAt_uint213rs_allocationPDv2_jjj _Z20rsGetElementAt_uint213rs_allocationPDv2_jjjj _Z20rsGetElementAt_uint313rs_allocationPDv3_jj _Z20rsGetElementAt_uint313rs_allocationPDv3_jjj _Z20rsGetElementAt_uint313rs_allocationPDv3_jjjj _Z20rsGetElementAt_uint413rs_allocationPDv4_jj _Z20rsGetElementAt_uint413rs_allocationPDv4_jjj _Z20rsGetElementAt_uint413rs_allocationPDv4_jjjj _Z20rsGetElementAt_ulong13rs_allocationPmj _Z20rsGetElementAt_ulong13rs_allocationPmjj _Z20rsGetElementAt_ulong13rs_allocationPmjjj _Z20rsGetElementAt_ulong13rs_allocationPyj _Z20rsGetElementAt_ulong13rs_allocationPyjj _Z20rsGetElementAt_ulong13rs_allocationPyjjj _Z20rsSetElementAt_char213rs_allocationPKDv2_cj _Z20rsSetElementAt_char213rs_allocationPKDv2_cjj _Z20rsSetElementAt_char213rs_allocationPKDv2_cjjj _Z20rsSetElementAt_char313rs_allocationPKDv3_cj _Z20rsSetElementAt_char313rs_allocationPKDv3_cjj _Z20rsSetElementAt_char313rs_allocationPKDv3_cjjj _Z20rsSetElementAt_char413rs_allocationPKDv4_cj _Z20rsSetElementAt_char413rs_allocationPKDv4_cjj _Z20rsSetElementAt_char413rs_allocationPKDv4_cjjj _Z20rsSetElementAt_float13rs_allocationPKfj _Z20rsSetElementAt_float13rs_allocationPKfjj _Z20rsSetElementAt_float13rs_allocationPKfjjj _Z20rsSetElementAt_half213rs_allocationPKDv2_Dhj _Z20rsSetElementAt_half213rs_allocationPKDv2_Dhjj _Z20rsSetElementAt_half213rs_allocationPKDv2_Dhjjj _Z20rsSetElementAt_half313rs_allocationPKDv3_Dhj _Z20rsSetElementAt_half313rs_allocationPKDv3_Dhjj _Z20rsSetElementAt_half313rs_allocationPKDv3_Dhjjj _Z20rsSetElementAt_half413rs_allocationPKDv4_Dhj _Z20rsSetElementAt_half413rs_allocationPKDv4_Dhjj _Z20rsSetElementAt_half413rs_allocationPKDv4_Dhjjj _Z20rsSetElementAt_long213rs_allocationPKDv2_lj _Z20rsSetElementAt_long213rs_allocationPKDv2_ljj _Z20rsSetElementAt_long213rs_allocationPKDv2_ljjj _Z20rsSetElementAt_long213rs_allocationPKDv2_xj _Z20rsSetElementAt_long213rs_allocationPKDv2_xjj _Z20rsSetElementAt_long213rs_allocationPKDv2_xjjj _Z20rsSetElementAt_long313rs_allocationPKDv3_lj _Z20rsSetElementAt_long313rs_allocationPKDv3_ljj _Z20rsSetElementAt_long313rs_allocationPKDv3_ljjj _Z20rsSetElementAt_long313rs_allocationPKDv3_xj _Z20rsSetElementAt_long313rs_allocationPKDv3_xjj _Z20rsSetElementAt_long313rs_allocationPKDv3_xjjj _Z20rsSetElementAt_long413rs_allocationPKDv4_lj _Z20rsSetElementAt_long413rs_allocationPKDv4_ljj _Z20rsSetElementAt_long413rs_allocationPKDv4_ljjj _Z20rsSetElementAt_long413rs_allocationPKDv4_xj _Z20rsSetElementAt_long413rs_allocationPKDv4_xjj _Z20rsSetElementAt_long413rs_allocationPKDv4_xjjj _Z20rsSetElementAt_short13rs_allocationPKsj _Z20rsSetElementAt_short13rs_allocationPKsjj _Z20rsSetElementAt_short13rs_allocationPKsjjj _Z20rsSetElementAt_uchar13rs_allocationPKhj _Z20rsSetElementAt_uchar13rs_allocationPKhjj _Z20rsSetElementAt_uchar13rs_allocationPKhjjj _Z20rsSetElementAt_uint213rs_allocationPKDv2_jj _Z20rsSetElementAt_uint213rs_allocationPKDv2_jjj _Z20rsSetElementAt_uint213rs_allocationPKDv2_jjjj _Z20rsSetElementAt_uint313rs_allocationPKDv3_jj _Z20rsSetElementAt_uint313rs_allocationPKDv3_jjj _Z20rsSetElementAt_uint313rs_allocationPKDv3_jjjj _Z20rsSetElementAt_uint413rs_allocationPKDv4_jj _Z20rsSetElementAt_uint413rs_allocationPKDv4_jjj _Z20rsSetElementAt_uint413rs_allocationPKDv4_jjjj _Z20rsSetElementAt_ulong13rs_allocationPKmj _Z20rsSetElementAt_ulong13rs_allocationPKmjj _Z20rsSetElementAt_ulong13rs_allocationPKmjjj _Z20rsSetElementAt_ulong13rs_allocationPKyj _Z20rsSetElementAt_ulong13rs_allocationPKyjj _Z20rsSetElementAt_ulong13rs_allocationPKyjjj _Z20rsdLookupRuntimeStubPN7android12renderscript7ContextEPKc _Z21rsAllocationIoReceive13rs_allocation _Z21rsGetElementAt_double13rs_allocationPdj _Z21rsGetElementAt_double13rs_allocationPdjj _Z21rsGetElementAt_double13rs_allocationPdjjj _Z21rsGetElementAt_float213rs_allocationPDv2_fj _Z21rsGetElementAt_float213rs_allocationPDv2_fjj _Z21rsGetElementAt_float213rs_allocationPDv2_fjjj _Z21rsGetElementAt_float313rs_allocationPDv3_fj _Z21rsGetElementAt_float313rs_allocationPDv3_fjj _Z21rsGetElementAt_float313rs_allocationPDv3_fjjj _Z21rsGetElementAt_float413rs_allocationPDv4_fj _Z21rsGetElementAt_float413rs_allocationPDv4_fjj _Z21rsGetElementAt_float413rs_allocationPDv4_fjjj _Z21rsGetElementAt_short213rs_allocationPDv2_sj _Z21rsGetElementAt_short213rs_allocationPDv2_sjj _Z21rsGetElementAt_short213rs_allocationPDv2_sjjj _Z21rsGetElementAt_short313rs_allocationPDv3_sj _Z21rsGetElementAt_short313rs_allocationPDv3_sjj _Z21rsGetElementAt_short313rs_allocationPDv3_sjjj _Z21rsGetElementAt_short413rs_allocationPDv4_sj _Z21rsGetElementAt_short413rs_allocationPDv4_sjj _Z21rsGetElementAt_short413rs_allocationPDv4_sjjj _Z21rsGetElementAt_uchar213rs_allocationPDv2_hj _Z21rsGetElementAt_uchar213rs_allocationPDv2_hjj _Z21rsGetElementAt_uchar213rs_allocationPDv2_hjjj _Z21rsGetElementAt_uchar313rs_allocationPDv3_hj _Z21rsGetElementAt_uchar313rs_allocationPDv3_hjj _Z21rsGetElementAt_uchar313rs_allocationPDv3_hjjj _Z21rsGetElementAt_uchar413rs_allocationPDv4_hj _Z21rsGetElementAt_uchar413rs_allocationPDv4_hjj _Z21rsGetElementAt_uchar413rs_allocationPDv4_hjjj _Z21rsGetElementAt_ulong213rs_allocationPDv2_mj _Z21rsGetElementAt_ulong213rs_allocationPDv2_mjj _Z21rsGetElementAt_ulong213rs_allocationPDv2_mjjj _Z21rsGetElementAt_ulong213rs_allocationPDv2_yj _Z21rsGetElementAt_ulong213rs_allocationPDv2_yjj _Z21rsGetElementAt_ulong213rs_allocationPDv2_yjjj _Z21rsGetElementAt_ulong313rs_allocationPDv3_mj _Z21rsGetElementAt_ulong313rs_allocationPDv3_mjj _Z21rsGetElementAt_ulong313rs_allocationPDv3_mjjj _Z21rsGetElementAt_ulong313rs_allocationPDv3_yj _Z21rsGetElementAt_ulong313rs_allocationPDv3_yjj _Z21rsGetElementAt_ulong313rs_allocationPDv3_yjjj _Z21rsGetElementAt_ulong413rs_allocationPDv4_mj _Z21rsGetElementAt_ulong413rs_allocationPDv4_mjj _Z21rsGetElementAt_ulong413rs_allocationPDv4_mjjj _Z21rsGetElementAt_ulong413rs_allocationPDv4_yj _Z21rsGetElementAt_ulong413rs_allocationPDv4_yjj _Z21rsGetElementAt_ulong413rs_allocationPDv4_yjjj _Z21rsGetElementAt_ushort13rs_allocationPtj _Z21rsGetElementAt_ushort13rs_allocationPtjj _Z21rsGetElementAt_ushort13rs_allocationPtjjj _Z21rsSetElementAt_double13rs_allocationPKdj _Z21rsSetElementAt_double13rs_allocationPKdjj _Z21rsSetElementAt_double13rs_allocationPKdjjj _Z21rsSetElementAt_float213rs_allocationPKDv2_fj _Z21rsSetElementAt_float213rs_allocationPKDv2_fjj _Z21rsSetElementAt_float213rs_allocationPKDv2_fjjj _Z21rsSetElementAt_float313rs_allocationPKDv3_fj _Z21rsSetElementAt_float313rs_allocationPKDv3_fjj _Z21rsSetElementAt_float313rs_allocationPKDv3_fjjj _Z21rsSetElementAt_float413rs_allocationPKDv4_fj _Z21rsSetElementAt_float413rs_allocationPKDv4_fjj _Z21rsSetElementAt_float413rs_allocationPKDv4_fjjj _Z21rsSetElementAt_short213rs_allocationPKDv2_sj _Z21rsSetElementAt_short213rs_allocationPKDv2_sjj _Z21rsSetElementAt_short213rs_allocationPKDv2_sjjj _Z21rsSetElementAt_short313rs_allocationPKDv3_sj _Z21rsSetElementAt_short313rs_allocationPKDv3_sjj _Z21rsSetElementAt_short313rs_allocationPKDv3_sjjj _Z21rsSetElementAt_short413rs_allocationPKDv4_sj _Z21rsSetElementAt_short413rs_allocationPKDv4_sjj _Z21rsSetElementAt_short413rs_allocationPKDv4_sjjj _Z21rsSetElementAt_uchar213rs_allocationPKDv2_hj _Z21rsSetElementAt_uchar213rs_allocationPKDv2_hjj _Z21rsSetElementAt_uchar213rs_allocationPKDv2_hjjj _Z21rsSetElementAt_uchar313rs_allocationPKDv3_hj _Z21rsSetElementAt_uchar313rs_allocationPKDv3_hjj _Z21rsSetElementAt_uchar313rs_allocationPKDv3_hjjj _Z21rsSetElementAt_uchar413rs_allocationPKDv4_hj _Z21rsSetElementAt_uchar413rs_allocationPKDv4_hjj _Z21rsSetElementAt_uchar413rs_allocationPKDv4_hjjj _Z21rsSetElementAt_ulong213rs_allocationPKDv2_mj _Z21rsSetElementAt_ulong213rs_allocationPKDv2_mjj _Z21rsSetElementAt_ulong213rs_allocationPKDv2_mjjj _Z21rsSetElementAt_ulong213rs_allocationPKDv2_yj _Z21rsSetElementAt_ulong213rs_allocationPKDv2_yjj _Z21rsSetElementAt_ulong213rs_allocationPKDv2_yjjj _Z21rsSetElementAt_ulong313rs_allocationPKDv3_mj _Z21rsSetElementAt_ulong313rs_allocationPKDv3_mjj _Z21rsSetElementAt_ulong313rs_allocationPKDv3_mjjj _Z21rsSetElementAt_ulong313rs_allocationPKDv3_yj _Z21rsSetElementAt_ulong313rs_allocationPKDv3_yjj _Z21rsSetElementAt_ulong313rs_allocationPKDv3_yjjj _Z21rsSetElementAt_ulong413rs_allocationPKDv4_mj _Z21rsSetElementAt_ulong413rs_allocationPKDv4_mjj _Z21rsSetElementAt_ulong413rs_allocationPKDv4_mjjj _Z21rsSetElementAt_ulong413rs_allocationPKDv4_yj _Z21rsSetElementAt_ulong413rs_allocationPKDv4_yjj _Z21rsSetElementAt_ulong413rs_allocationPKDv4_yjjj _Z21rsSetElementAt_ushort13rs_allocationPKtj _Z21rsSetElementAt_ushort13rs_allocationPKtjj _Z21rsSetElementAt_ushort13rs_allocationPKtjjj _Z22rsGetElementAt_double213rs_allocationPDv2_dj _Z22rsGetElementAt_double213rs_allocationPDv2_djj _Z22rsGetElementAt_double213rs_allocationPDv2_djjj _Z22rsGetElementAt_double313rs_allocationPDv3_dj _Z22rsGetElementAt_double313rs_allocationPDv3_djj _Z22rsGetElementAt_double313rs_allocationPDv3_djjj _Z22rsGetElementAt_double413rs_allocationPDv4_dj _Z22rsGetElementAt_double413rs_allocationPDv4_djj _Z22rsGetElementAt_double413rs_allocationPDv4_djjj _Z22rsGetElementAt_ushort213rs_allocationPDv2_tj _Z22rsGetElementAt_ushort213rs_allocationPDv2_tjj _Z22rsGetElementAt_ushort213rs_allocationPDv2_tjjj _Z22rsGetElementAt_ushort313rs_allocationPDv3_tj _Z22rsGetElementAt_ushort313rs_allocationPDv3_tjj _Z22rsGetElementAt_ushort313rs_allocationPDv3_tjjj _Z22rsGetElementAt_ushort413rs_allocationPDv4_tj _Z22rsGetElementAt_ushort413rs_allocationPDv4_tjj _Z22rsGetElementAt_ushort413rs_allocationPDv4_tjjj _Z22rsSendToClientBlockingi _Z22rsSendToClientBlockingiPKvj _Z22rsSetElementAt_double213rs_allocationPKDv2_dj _Z22rsSetElementAt_double213rs_allocationPKDv2_djj _Z22rsSetElementAt_double213rs_allocationPKDv2_djjj _Z22rsSetElementAt_double313rs_allocationPKDv3_dj _Z22rsSetElementAt_double313rs_allocationPKDv3_djj _Z22rsSetElementAt_double313rs_allocationPKDv3_djjj _Z22rsSetElementAt_double413rs_allocationPKDv4_dj _Z22rsSetElementAt_double413rs_allocationPKDv4_djj _Z22rsSetElementAt_double413rs_allocationPKDv4_djjj _Z22rsSetElementAt_ushort213rs_allocationPKDv2_tj _Z22rsSetElementAt_ushort213rs_allocationPKDv2_tjj _Z22rsSetElementAt_ushort213rs_allocationPKDv2_tjjj _Z22rsSetElementAt_ushort313rs_allocationPKDv3_tj _Z22rsSetElementAt_ushort313rs_allocationPKDv3_tjj _Z22rsSetElementAt_ushort313rs_allocationPKDv3_tjjj _Z22rsSetElementAt_ushort413rs_allocationPKDv4_tj _Z22rsSetElementAt_ushort413rs_allocationPKDv4_tjj _Z22rsSetElementAt_ushort413rs_allocationPKDv4_tjjj _Z23rsAllocationCopy1DRange13rs_allocationjjjS_jj _Z23rsAllocationCopy2DRange13rs_allocationjjj26rs_allocation_cubemap_facejjS_jjjS0_ _Z6rsTimePi _Z7rsDebugPKcDv2_x _Z7rsDebugPKcDv2_y _Z7rsDebugPKcDv3_x _Z7rsDebugPKcDv3_y _Z7rsDebugPKcDv4_x _Z7rsDebugPKcDv4_y _Z7rsDebugPKcPK12rs_matrix2x2 _Z7rsDebugPKcPK12rs_matrix3x3 _Z7rsDebugPKcPK12rs_matrix4x4 _Z7rsDebugPKcPKDv2_c _Z7rsDebugPKcPKDv2_d _Z7rsDebugPKcPKDv2_f _Z7rsDebugPKcPKDv2_fPKDv2_t _Z7rsDebugPKcPKDv2_h _Z7rsDebugPKcPKDv2_i _Z7rsDebugPKcPKDv2_j _Z7rsDebugPKcPKDv2_l _Z7rsDebugPKcPKDv2_m _Z7rsDebugPKcPKDv2_s _Z7rsDebugPKcPKDv2_t _Z7rsDebugPKcPKDv2_x _Z7rsDebugPKcPKDv2_y _Z7rsDebugPKcPKDv3_c _Z7rsDebugPKcPKDv3_d _Z7rsDebugPKcPKDv3_f _Z7rsDebugPKcPKDv3_fPKDv3_t _Z7rsDebugPKcPKDv3_h _Z7rsDebugPKcPKDv3_i _Z7rsDebugPKcPKDv3_j _Z7rsDebugPKcPKDv3_l _Z7rsDebugPKcPKDv3_m _Z7rsDebugPKcPKDv3_s _Z7rsDebugPKcPKDv3_t _Z7rsDebugPKcPKDv3_x _Z7rsDebugPKcPKDv3_y _Z7rsDebugPKcPKDv4_c _Z7rsDebugPKcPKDv4_d _Z7rsDebugPKcPKDv4_f _Z7rsDebugPKcPKDv4_fPKDv4_t _Z7rsDebugPKcPKDv4_h _Z7rsDebugPKcPKDv4_i _Z7rsDebugPKcPKDv4_j _Z7rsDebugPKcPKDv4_l _Z7rsDebugPKcPKDv4_m _Z7rsDebugPKcPKDv4_s _Z7rsDebugPKcPKDv4_t _Z7rsDebugPKcPKDv4_x _Z7rsDebugPKcPKDv4_y _Z7rsDebugPKcPKv _Z7rsDebugPKcc _Z7rsDebugPKcd _Z7rsDebugPKcf _Z7rsDebugPKcff _Z7rsDebugPKcfff _Z7rsDebugPKcffff _Z7rsDebugPKcft _Z7rsDebugPKch _Z7rsDebugPKci _Z7rsDebugPKcj _Z7rsDebugPKcl _Z7rsDebugPKcm _Z7rsDebugPKcs _Z7rsDebugPKct _Z7rsDebugPKcx _Z7rsDebugPKcy _Z7rsGetDtv _Z9rsForEach9rs_script13rs_allocationS0_ _Z9rsForEach9rs_script13rs_allocationS0_PKv _Z9rsForEach9rs_script13rs_allocationS0_PKvPK14rs_script_call _Z9rsForEach9rs_script13rs_allocationS0_PKvj _Z9rsForEach9rs_script13rs_allocationS0_PKvjPK14rs_script_call _ZN7android12renderscript10rsrForEachEPNS0_7ContextEPNS0_6ScriptEjjPPNS0_10AllocationES6_PKvjPK12RsScriptCall _ZN7android12renderscript11rsrToClientEPNS0_7ContextEiPKvi _ZN7android12renderscript12rsrLocalTimeEPNS0_7ContextEP2tmPl _ZN7android12renderscript12rsrSetObjectEPKNS0_7ContextEPNS0_14rs_object_baseEPKNS0_10ObjectBaseE _ZN7android12renderscript13rsrTypeCreateEPNS0_7ContextEPvjjjbbj _ZN7android12renderscript14rsrClearObjectEPNS0_14rs_object_baseE _ZN7android12renderscript14rsrUptimeNanosEPNS0_7ContextE _ZN7android12renderscript15rsrUptimeMillisEPNS0_7ContextE _ZN7android12renderscript16rsrElementCreateEPNS0_7ContextE10RsDataType10RsDataKindbj _ZN7android12renderscript19rsrAllocationIoSendEPNS0_7ContextEPNS0_10AllocationE _ZN7android12renderscript19rsrToClientBlockingEPNS0_7ContextEiPKvi _ZN7android12renderscript22rsrAllocationIoReceiveEPNS0_7ContextEPNS0_10AllocationE _ZN7android12renderscript24rsrAllocationCopy1DRangeEPNS0_7ContextEPNS0_10AllocationEjjjS4_jj _ZN7android12renderscript24rsrAllocationCopy2DRangeEPNS0_7ContextEPNS0_10AllocationEjjjjjjS4_jjjj _ZN7android12renderscript24rsrAllocationCreateTypedEPNS0_7ContextEPv25RsAllocationMipmapControljj _ZN7android12renderscript7rsrTimeEPNS0_7ContextEPl _ZN7android12renderscript8rsrGetDtEPNS0_7ContextEPKNS0_6ScriptE _ZNK7android12renderscript10ObjectBase10decUserRefEv _Z32rsdScriptGroupUpdateCachedObjectPKN7android12renderscript7ContextEPKNS0_11ScriptGroupEPNS0_15rs_script_groupE _edata __bss_start _end libRS_internal.so libRSCpuRef.so liblog.so libnativewindow.so libc++.so
RenderScript rrsdGpuScriptIntrinsic::scaleworksize: Worksize and/or Workoffset not divisible by scaling factor Unable to read kernel execution status, error code: %d GPU kernel returned error code: %d Code -59 possibly due to out-of-bound memory access. rsdGpuScriptIntrinsicBuffer: failed to create buffer, error code: %d rsdGpuScriptIntrinsicBuffer: failed to map buffer, error code: %d rsdGpuScriptIntrinsicBuffer: failed to unmap buffer, error code: %d Intrinsic 3dLUT: cannot build program: %d Intrinsic 3dLUT: cannot create kernel %s: %d blend_clear blend_src blend_src_over blend_dst_over blend_src_in blend_dst_in blend_src_out blend_dst_out blend_src_atop blend_dst_atop blend_xor blend_multiply blend_add blend_subtract Intrinsic Blend: cannot build program: %d Intrinsic Blend: cannot create kernel %s: %d Intrinsic Blur: error %d creating input image X(%zu) Y(%zu) P(%zu) Intrinsic Blur: error %d creating temp image X(%zu) Y(%zu) P(%zu) Intrinsic Blur: error %d creating output image X(%zu) Y(%zu) P(%zu) gauss_blur_1d_vert gauss_blur_1d_horz Intrinsic Blur: cannot create kernel %s: %d Intrinsic Blur: error creating/mapping weights buffer %d gpuRunHelper: Can't SetKernelArg with mInStride: %d %d gpuRunHelper: Can't SetKernelArg with hp: %d gpuRunHelper: Can't SetKernelArg with mOutStride: %d %d colormatrix4x4_U8_4_to_U8_4 colormatrix4x4_U8_4_to_U8_4_1left colormatrix4x4_U8_4_to_U8_4_2left colormatrix4x4_U8_4_to_U8_4_3left Grey_U8_4_to_U8 Grey_U8_4_to_U8_int colormatrix4x4_U8_4_to_U8_4_simple colormatrix4x4_U8_4_to_U8_simple Intrinsic Colormatrix: cannot build program: %d Intrinsic ColorMatrix: cannot create kernel %s: %d conv33 conv33_block conv33_sobel_top conv33_sobel_bottom conv33_sobel_left conv33_sobel_right conv33_sharpen conv33_sharpen_block Intrinsic Convolve3x3: cannot build program: %d Intrinsic C33: failed to allocate kernel list Intrinsic C33: failed to allocate kernel info Intrinsic Convolve3x3: cannot create kernel %s: %d Intrinsic Convolve5x5: cannot build program: %d Intrinsic Convolve5x5: cannot create kernel %s: %d histogram_uchar_full histogram_uchar_n histogram_uchar4_full histogram_uchar4_n Intrinsic Histogram: cannot build program: %d Intrinsic Histogram: cannot create kernel %s: %d vendor/arm/mali/gpu/android/renderscript/android-8.0.0_r1/driver/rsdGpuIntrinsicHistogram.cpp Intrinsic LUT: cannot build program: %d Intrinsic LUT: cannot create kernel %s: %d Failed to load Mali accelerated driver: %s Cannot query platform name size, error code: %d Cannot allocate %zx bytes for platform name. getPlatformInfo, error code: %d ARM Platform Cannot query platform's vendor name size, error code: %d Cannot allocate %zx bytes for platform's vendor name. Cannot query platform info, error code: %d clReleaseContext clReleaseCommandQueue clFlush clFinish clCreateBuffer clEnqueueCopyBuffer clEnqueueMapBuffer clEnqueueUnmapMemObject clRetainMemObject clReleaseMemObject clGetMemObjectInfo clEnqueueReadBuffer clEnqueueWriteBuffer clCreateImage clGetExtensionFunctionAddress CL_GET_ARCH_POINTER_ARM clCreateProgramWithBinary clCreateProgramWithSource clBuildProgram clGetProgramBuildInfo clReleaseProgram clGetProgramInfo clCreateKernel clCreateKernelsInProgram clReleaseKernel clGetKernelInfo clSetKernelArg clEnqueueNDRangeKernel clCreateUserEvent clSetUserEventStatus clSetEventCallback clWaitForEvents clGetEventInfo clRetainEvent clReleaseEvent GPU buffer unmapping failed, reverting to CPU, error code: %d GPU buffer creation during resize failed, reverting to CPU, error code: %d rsdClSetAllocationArgument: failed to set argument %d, error code: %d rsdClSetPtrArgument: failed to set argument %d, error code: %d Failed to create argument buffer %d, error code: %d Failed mapping of buffer argument, error code: %d Failed unmapping of buffer argument, error code: %d debug.rs.dumpbin .arm.in.bc ########### writing %s - size(%u) ########### failed to open %s Error getting program binary sizes, error code: %d Error getting program binaries, error code: %d .arm.out.bc ########### writing %s - size(%zu) [RS-DIAG] Nested forEach not supported on GPU [RS-DIAG] Input allocation type is unsupported on the GPU [RS-DIAG] Output allocation type is unsupported on the GPU Failed to map buffer argument, error code: %d Failed to unmap buffer argument, error code: %d Failed to set inElementStride arg: %d Failed to set inElementSize arg: %d Failed to set outElementStride arg: %d Failed to set outElementSize arg: %d [RS-DIAG] A used allocation is not supported on the GPU [RS-DIAG] Launching GPU kernel : slot(%d) GPU launch failure, error code: %d Attempting CPU fallback Detected failure post kernel execution [RS-DIAG] Launching CPU script : slot(%d) rsAssert failed: %s, in %s at %i vendor/arm/mali/gpu/android/renderscript/android-8.0.0_r1/driver/rsdClBcc.cpp /vendor/lib64/libbccArm.so debug.rs.cpufb property is set to "on", GPU acceleration disabled /vendor/bin/bcc Can't use user-allocated buffers if usage is not USAGE_SCRIPT | USAGE_SHARED or USAGE_SCRIPT | USAGE_SHARED | USAGE_GRAPHICS_TEXTURE User-backed allocation failed stride requirement, falling back to separate allocation !"Size mismatch" vendor/arm/mali/gpu/android/renderscript/android-8.0.0_r1/driver/aosp/rsdAllocation.cpp Attempting to sync allocation from render target, for non-render target allocation src == RS_ALLOCATION_USAGE_SCRIPT || src == RS_ALLOCATION_USAGE_SHARED Non-script allocation copies not yet implemented. Error from %s void rsSetElementAt_char(::rs_allocation, const char *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_char(::rs_allocation, char *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_char2(::rs_allocation, const char2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_char2(::rs_allocation, char2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_char3(::rs_allocation, const char3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_char3(::rs_allocation, char3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_char4(::rs_allocation, const char4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_char4(::rs_allocation, char4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_uchar(::rs_allocation, const uchar *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_uchar(::rs_allocation, uchar *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_uchar2(::rs_allocation, const uchar2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_uchar2(::rs_allocation, uchar2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_uchar3(::rs_allocation, const uchar3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_uchar3(::rs_allocation, uchar3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_uchar4(::rs_allocation, const uchar4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_uchar4(::rs_allocation, uchar4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_short(::rs_allocation, const short *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_short(::rs_allocation, short *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_short2(::rs_allocation, const short2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_short2(::rs_allocation, short2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_short3(::rs_allocation, const short3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_short3(::rs_allocation, short3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_short4(::rs_allocation, const short4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_short4(::rs_allocation, short4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_ushort(::rs_allocation, const ushort *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_ushort(::rs_allocation, ushort *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_ushort2(::rs_allocation, const ushort2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_ushort2(::rs_allocation, ushort2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_ushort3(::rs_allocation, const ushort3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_ushort3(::rs_allocation, ushort3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_ushort4(::rs_allocation, const ushort4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_ushort4(::rs_allocation, ushort4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_int(::rs_allocation, const int *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_int(::rs_allocation, int *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_int2(::rs_allocation, const int2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_int2(::rs_allocation, int2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_int3(::rs_allocation, const int3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_int3(::rs_allocation, int3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_int4(::rs_allocation, const int4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_int4(::rs_allocation, int4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_uint(::rs_allocation, const uint *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_uint(::rs_allocation, uint *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_uint2(::rs_allocation, const uint2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_uint2(::rs_allocation, uint2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_uint3(::rs_allocation, const uint3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_uint3(::rs_allocation, uint3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_uint4(::rs_allocation, const uint4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_uint4(::rs_allocation, uint4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_long(::rs_allocation, const long *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_long(::rs_allocation, long *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_long2(::rs_allocation, const long2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_long2(::rs_allocation, long2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_long3(::rs_allocation, const long3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_long3(::rs_allocation, long3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_long4(::rs_allocation, const long4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_long4(::rs_allocation, long4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_ulong(::rs_allocation, const ulong *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_ulong(::rs_allocation, ulong *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_ulong2(::rs_allocation, const ulong2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_ulong2(::rs_allocation, ulong2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_ulong3(::rs_allocation, const ulong3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_ulong3(::rs_allocation, ulong3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_ulong4(::rs_allocation, const ulong4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_ulong4(::rs_allocation, ulong4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_half(::rs_allocation, const half *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_half(::rs_allocation, half *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_half2(::rs_allocation, const half2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_half2(::rs_allocation, half2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_half3(::rs_allocation, const half3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_half3(::rs_allocation, half3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_half4(::rs_allocation, const half4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_half4(::rs_allocation, half4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_float(::rs_allocation, const float *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_float(::rs_allocation, float *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_float2(::rs_allocation, const float2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_float2(::rs_allocation, float2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_float3(::rs_allocation, const float3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_float3(::rs_allocation, float3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_float4(::rs_allocation, const float4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_float4(::rs_allocation, float4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_double(::rs_allocation, const double *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_double(::rs_allocation, double *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_double2(::rs_allocation, const double2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_double2(::rs_allocation, double2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_double3(::rs_allocation, const double3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_double3(::rs_allocation, double3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_double4(::rs_allocation, const double4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_double4(::rs_allocation, double4 *, uint32_t, uint32_t, uint32_t) %s {%f, %f} %s {%f, %f, %f} %s {%f, %f, %f, %f} %s %lld 0x%llx %s {%lld, %lld} 0x%llx 0x%llx %s {%lld, %lld, %lld} 0x%llx 0x%llx 0x%llx %s {%lld, %lld, %lld, %lld} 0x%llx 0x%llx 0x%llx 0x%llx %s %llu 0x%llx %s {%llu, %llu} 0x%llx 0x%llx %s {%llu, %llu, %llu} 0x%llx 0x%llx 0x%llx %s {%llu, %llu, %llu, %llu} 0x%llx 0x%llx 0x%llx 0x%llx rs_type creation error: Invalid element rs_type creation error: Invalid yuv_format %d rs_type creation error: Both X and Y dimension required when Z is present. rs_type creation error: X dimension required when Y is present. rs_type creation error: mipmap control require 2D Types. rs_type creation error: Cube maps require 2D Types. rs_type creation error: YUV only supports basic 2D. Out range ElementAt Y %i of %i Out range ElementAt Z %i of %i Vector size mismatch for ElementAt %i of %i Data type mismatch for ElementAt %i of %i libunwind: %s %s:%d - %s external/libunwind_llvm/src/Unwind-EHABI.cpp unsupported register class during phase1 personality function said it would stop here, but now in phase2 it did not stop here external/libunwind_llvm/src/Registers.hpp unsupported arm register Unknown ARM float register index inlined table detected but pr function requires extra words /* Copyright: * ---------------------------------------------------------------------------- * This confidential and proprietary software may be used only as authorized * by a licensing agreement from ARM Limited. * (C) COPYRIGHT 2013 ARM Limited, ALL RIGHTS RESERVED * The entire notice above must be reproduced on all authorized copies and * copies may only be made to the extent permitted by a licensing agreement * from ARM Limited. * ---------------------------------------------------------------------------- */ __kernel void lut3d( __global const uchar4 * restrict input, /* Pointer to the inputbuffer */ __global uchar4 * restrict output, /* Pointer to the output buffer */ __global const uchar * restrict lut, const int4 dims, /* Dimensions of lut */ const int4 coordMul, const int3 stride ) { const int2 pos = (int2)(get_global_id(0), get_global_id(1)); const int offset = pos.y * stride.x + pos.x; /* bring input position within bounds of LUT */ const uchar4 in = input[offset]; int4 baseCoord = convert_int4(in) * coordMul; int4 coord1 = baseCoord >> (int4) 15; /* get XdYdZd (difference between x and point above and below) */ int4 weight2 = baseCoord & 0x7fff; /* first 15 bits */ int4 weight1 = (int4) 0x8000 - weight2; /* 1 - original */ /* point below needed point in all axis */ const int lut_offset = (coord1.x * 4) + (coord1.y * stride.y) + (coord1.z * stride.z); /* v000 = v000.lo v100 = v000.hi */ uint8 v000 = convert_uint8(vload8(0, lut + lut_offset)); /* v010 = v010.lo v110 = v010.hi */ uint8 v010 = convert_uint8(vload8(0, lut + lut_offset + stride.y)); /* v001 = v001.lo v101 = v001.hi */ uint8 v001 = convert_uint8(vload8(0, lut + lut_offset + stride.z)); /* v011 = v011.lo v111 = v011.hi */ uint8 v011 = convert_uint8(vload8(0, lut + lut_offset + stride.y + stride.z)); /* Interpolate along x */ uint4 yz00 = ((v000.lo * weight1.x) + (v000.hi * weight2.x)) >> (int4)7; uint4 yz10 = ((v010.lo * weight1.x) + (v010.hi * weight2.x)) >> (int4)7; uint4 yz01 = ((v001.lo * weight1.x) + (v001.hi * weight2.x)) >> (int4)7; uint4 yz11 = ((v011.lo * weight1.x) + (v011.hi * weight2.x)) >> (int4)7; /* Interpolate along y */ uint4 z0 = ((yz00 * weight1.y) + (yz10 * weight2.y)) >> (int4)15; uint4 z1 = ((yz01 * weight1.y) + (yz11 * weight2.y)) >> (int4)15; /* Interpolate along z */ uint4 v = ((z0 * weight1.z) + (z1 * weight2.z)) >> (int4)15; uint4 v2 = (v + 0x7f) >> (int4)8; /* part of integer math */ uchar4 ret = convert_uchar4(v2); ret.w = in.w; output[offset] = ret; } /* * Copyright: * ---------------------------------------------------------------------------- * This confidential and proprietary software may be used only as authorized * by a licensing agreement from ARM Limited. * (C) COPYRIGHT 2013 ARM Limited, ALL RIGHTS RESERVED * The entire notice above must be reproduced on all authorized copies and * copies may only be made to the extent permitted by a licensing agreement * from ARM Limited. * ---------------------------------------------------------------------------- */ __kernel void blend_clear( uint width, /* Width of the in/out allocation in uchar4. */ __global const uchar * restrict input_data, /* Pointer to the input-buffer */ __global uchar * restrict output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = row * width * 4 + col; const uchar16 load = 0; vstore16(load, 0, output_data + offset); } __kernel void blend_src( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = row * width * 4 + col; const uchar16 load = vload16(0, input_data + offset); vstore16(load, 0, output_data + offset); } __kernel void blend_src_over( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = (row * width * 4 + col ); /* BLEND_SRC_OVER */ short16 in_s = convert_short16( vload16(0, input_data + offset)); const short16 out_s = convert_short16( vload16(0, output_data + offset)); short16 alpha = (short16)255 - in_s.s33337777BBBBFFFF ; in_s = in_s + ((out_s * alpha) >> (short16)8); vstore16(convert_uchar16(in_s), 0, output_data + offset); } __kernel void blend_dst_over( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = (row * width * 4 + col ); /* BLEND_DST_OVER */ const short16 in_s = convert_short16( vload16(0, input_data + offset)); short16 out_s = convert_short16( vload16(0, output_data + offset)); short16 alpha = (short16) 255 - out_s.s33337777BBBBFFFF; out_s = out_s + ((in_s * alpha )>> (short16)8); vstore16(convert_uchar16(out_s), 0, output_data + offset); } __kernel void blend_src_in( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = (row * width * 4 + col ); /* BLEND_SRC_IN */ short16 in_s = convert_short16(vload16(0, input_data + offset)); const short16 out_s = convert_short16(vload16(0,output_data + offset)); in_s = (in_s * out_s.s33337777BBBBFFFF) >> (short16)8; vstore16(convert_uchar16(in_s), 0, output_data + offset); } __kernel void blend_dst_in( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = (row * width * 4 + col ); /* BLEND_DST_IN */ short16 out_s = convert_short16(vload16(0, output_data + offset)); const short16 in_s = convert_short16(vload16(0, input_data + offset)); out_s = (out_s * in_s.s33337777BBBBFFFF) >> (short16)8; vstore16(convert_uchar16(out_s), 0, output_data + offset); } __kernel void blend_src_out( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = (row * width * 4 + col ); /* BLEND_SRC_OUT */ short16 in_s = convert_short16(vload16(0, input_data + offset)); const short16 out_s = convert_short16(vload16(0, output_data + offset)); in_s = (in_s * ((short16)255 - out_s.s33337777BBBBFFFF)) >> (short16)8; vstore16(convert_uchar16(in_s), 0, output_data + offset); } __kernel void blend_dst_out( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = (row * width * 4 + col ); /* BLEND_SRC_OUT */ const short16 in_s = convert_short16( vload16( 0, input_data + offset)); short16 out_s = convert_short16(vload16(0, output_data + offset)); out_s = (out_s * ((short16)255 - in_s.s33337777BBBBFFFF)) >> (short16)8; vstore16(convert_uchar16(out_s), 0, output_data + offset); } __kernel void blend_src_atop( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = row * width * 4 + col ; /* BLEND_SRC_ATOP */ const short16 in_s = convert_short16(vload16(0, input_data + offset)); short16 out_s = convert_short16(vload16(0, output_data + offset)); const short4 alphas = out_s.s37BF; const short16 dsta = out_s.s33337777BBBBFFFF; const short16 srca = in_s.s33337777BBBBFFFF; out_s = ((in_s * dsta) + (out_s * ((short16)255 - srca))) >> (short16)8; out_s.s37BF = alphas; vstore16(convert_uchar16(out_s), 0, output_data + offset); } __kernel void blend_dst_atop( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = row * width * 4 + col ; /* BLEND_SRC_ATOP */ const short16 in_s = convert_short16(vload16(0, input_data + offset)); short16 out_s = convert_short16(vload16(0, output_data + offset)); const short4 alphas = in_s.s37BF; const short16 dsta = out_s.s33337777BBBBFFFF; const short16 srca = in_s.s33337777BBBBFFFF; out_s = ((out_s * srca) + (in_s * ((short16)255 - dsta))) >> (short16)8; out_s.s37BF = alphas; vstore16(convert_uchar16(out_s), 0, output_data + offset); } __kernel void blend_xor( uint width, /* Width of the in/out allocation in pixels. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = row * width * 4 + col; const uchar16 loadin = vload16(0, input_data + offset); uchar16 loadout = vload16(0, output_data + offset); loadout = loadout ^ loadin; vstore16(loadout, 0, output_data + offset); } __kernel void blend_multiply( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = row * width * 4 + col; /* BLEND_MULTIPLY */ const short16 loadin = convert_short16(vload16( 0, input_data + offset)); const short16 loadout = convert_short16(vload16( 0, output_data + offset)); const uchar16 res = convert_uchar16( (loadin * loadout) >> (short16)8); vstore16(res, 0, output_data + offset); } __kernel void blend_add( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = row * width * 4 + col; /* BLEND_ADD */ const short16 loadin = convert_short16(vload16(0, input_data + offset)); short16 loadout = convert_short16(vload16(0, output_data + offset)); loadout = loadout + loadin; const uchar16 res = convert_uchar16(min( (short16) 255, loadout)); vstore16(res, 0, output_data + offset); } __kernel void blend_subtract( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = row * width * 4 + col; /* BLEND_SUBTRACT */ const short16 loadin = convert_short16(vload16(0, input_data + offset)); short16 loadout = convert_short16(vload16(0, output_data + offset)); loadout = loadout - loadin; const uchar16 res = convert_uchar16(max( (short16) 0, loadout)); vstore16(res, 0, output_data + offset); } /* * Copyright: * ---------------------------------------------------------------------------- * This confidential and proprietary software may be used only as authorized * by a licensing agreement from ARM Limited. * (C) COPYRIGHT 2016 ARM Limited, ALL RIGHTS RESERVED * The entire notice above must be reproduced on all authorized copies and * copies may only be made to the extent permitted by a licensing agreement * from ARM Limited. * ---------------------------------------------------------------------------- */ #pragma OPENCL EXTENSION cl_khr_fp16 : enable __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; __kernel void gauss_blur_1d_vert(__read_only image2d_t src, __write_only image2d_t dst, const int num_coeffs, __global float* weights ) { int2 pos = {get_global_id(0), get_global_id(1)}; int i; half4 rgb_out = 0.0f; const float2 fpos = convert_float2(pos); float2 off_weight; off_weight = vload2(0, weights); for( i=0; i<num_coeffs; i++) { rgb_out += read_imageh(src, sampler, (float2) (fpos.x+0.5f, fpos.y + off_weight.x)) * (half)off_weight.y; off_weight = vload2(i+1, weights); } rgb_out += read_imageh(src, sampler, (float2) (fpos.x+0.5f, fpos.y + off_weight.x)) * (half)off_weight.y; write_imageh(dst, pos, rgb_out); } __kernel void gauss_blur_1d_horz( __read_only image2d_t src, __write_only image2d_t dst, const int num_coeffs, __global float* weights ) { const int2 pos = {get_global_id(0), get_global_id(1)}; int i; half4 rgb_out = 0.0f; const float2 fpos = convert_float2(pos); float2 off_weight; off_weight = vload2(0, weights); for( i=0; i<num_coeffs; i++) { rgb_out += read_imageh(src, sampler, (float2) (fpos.x + off_weight.x, fpos.y+0.5f)) * (half)off_weight.y; off_weight = vload2(i+1, weights); } rgb_out += read_imageh(src, sampler, (float2) (fpos.x + off_weight.x, fpos.y+0.5f)) * (half)off_weight.y; write_imageh(dst, pos, rgb_out); } /* * Copyright: * ---------------------------------------------------------------------------- * This confidential and proprietary software may be used only as authorized * by a licensing agreement from ARM Limited. * (C) COPYRIGHT 2016 ARM Limited, ALL RIGHTS RESERVED * The entire notice above must be reproduced on all authorized copies and * copies may only be made to the extent permitted by a licensing agreement * from ARM Limited. * ---------------------------------------------------------------------------- */ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define PIXEL_SIZE sizeof(uchar4) #define COMPUTE_PIXEL(out, in, colormat) \ do { \ out = in.s0 * colormat.s0123; \ out += in.s1 * colormat.s4567; \ out += in.s2 * colormat.s89ab; \ out += in.s3 * colormat.scdef; \ } while(0) /* No leftover pixels */ __kernel void colormatrix4x4_U8_4_to_U8_4( __global uchar* in, __global uchar* out, const int src_offset, const int stride, const half16 colormat, const half4 add ) { int x = get_global_id(0); int y = get_global_id(1); __global uchar * src = in + src_offset + x*(4*PIXEL_SIZE) + y*stride; __global uchar * dst = out + src_offset + x*(4*PIXEL_SIZE) + y*stride; /* do 4 pixels at a time */ uchar16 p = vload16(0,src); half4 p0 = convert_half4(p.s0123); half4 p1 = convert_half4(p.s4567); half4 p2 = convert_half4(p.s89ab); half4 p3 = convert_half4(p.scdef); half16 d; COMPUTE_PIXEL(d.s0123, p0, colormat); COMPUTE_PIXEL(d.s4567, p1, colormat); COMPUTE_PIXEL(d.s89ab, p2, colormat); COMPUTE_PIXEL(d.scdef, p3, colormat); half16 fsum = d + ((half16)(add, add, add, add)); uchar16 res = convert_uchar16(fsum); vstore16(res, 0, dst); } /* One leftover pixel */ __kernel void colormatrix4x4_U8_4_to_U8_4_1left( __global uchar* in, __global uchar* out, const int src_offset, const int stride, const half16 colormat, const half4 add ) { int x = get_global_id(0); int y = get_global_id(1); __global uchar * src = in + src_offset + x*(4*PIXEL_SIZE) + y*stride; __global uchar * dst = out + src_offset + x*(4*PIXEL_SIZE) + y*stride; /* do 4 pixels at a time */ uchar16 p = vload16(0,src); half4 p0 = convert_half4(p.s0123); half4 p1 = convert_half4(p.s4567); half4 p2 = convert_half4(p.s89ab); half4 p3 = convert_half4(p.scdef); half16 d; COMPUTE_PIXEL(d.s0123, p0, colormat); COMPUTE_PIXEL(d.s4567, p1, colormat); COMPUTE_PIXEL(d.s89ab, p2, colormat); COMPUTE_PIXEL(d.scdef, p3, colormat); half16 fsum = d + ((half16)(add, add, add, add)); uchar16 res = convert_uchar16(fsum); vstore16(res, 0, dst); /* Do last one pixel */ if ( x == (get_global_size(0)-1)) { src = in + src_offset + (x+1)*(4*PIXEL_SIZE) + y*stride; dst = out + src_offset + (x+1)*(4*PIXEL_SIZE) + y*stride; p0 = convert_half4(vload4(0,src)); COMPUTE_PIXEL(d.s0123, p0, colormat); half4 fsum4 = (half4)(d.s0123) + ((half4)(add)); uchar4 res4 = convert_uchar4(fsum4); vstore4(res4, 0, dst); } } /* Two leftover pixels */ __kernel void colormatrix4x4_U8_4_to_U8_4_2left( __global uchar* in, __global uchar* out, const int src_offset, const int stride, const half16 colormat, const half4 add ) { int x = get_global_id(0); int y = get_global_id(1); __global uchar * src = in + src_offset + x*(4*PIXEL_SIZE) + y*stride; __global uchar * dst = out + src_offset + x*(4*PIXEL_SIZE) + y*stride; /* do 4 pixels at a time */ uchar16 p = vload16(0,src); half4 p0 = convert_half4(p.s0123); half4 p1 = convert_half4(p.s4567); half4 p2 = convert_half4(p.s89ab); half4 p3 = convert_half4(p.scdef); half16 d; COMPUTE_PIXEL(d.s0123, p0, colormat); COMPUTE_PIXEL(d.s4567, p1, colormat); COMPUTE_PIXEL(d.s89ab, p2, colormat); COMPUTE_PIXEL(d.scdef, p3, colormat); half16 fsum = d + ((half16)(add, add, add, add)); uchar16 res = convert_uchar16(fsum); vstore16(res, 0, dst); /* Compute left over two pixels */ if ( x == (get_global_size(0)-1)) { src = in + src_offset + (x+1)*(4*PIXEL_SIZE) + y*stride; dst = out + src_offset + (x+1)*(4*PIXEL_SIZE) + y*stride; half8 p01 = convert_half8(vload8(0,src)); COMPUTE_PIXEL(d.s0123, p01.s0123, colormat); COMPUTE_PIXEL(d.s4567, p01.s4567, colormat); half8 fsum8 = (half8)(d.s0123,d.s4567) + ((half8)(add, add)); uchar8 res8 = convert_uchar8(fsum8); vstore8(res8, 0, dst); } } /* Three leftover pixels */ __kernel void colormatrix4x4_U8_4_to_U8_4_3left( __global uchar* in, __global uchar* out, const int src_offset, const int stride, const half16 colormat, const half4 add ) { int x = get_global_id(0); int y = get_global_id(1); __global uchar * src = in + src_offset + x*(4*PIXEL_SIZE) + y*stride; __global uchar * dst = out + src_offset + x*(4*PIXEL_SIZE) + y*stride; /* do 4 pixels at a time */ uchar16 p = vload16(0,src); half4 p0 = convert_half4(p.s0123); half4 p1 = convert_half4(p.s4567); half4 p2 = convert_half4(p.s89ab); half4 p3 = convert_half4(p.scdef); half16 d; COMPUTE_PIXEL(d.s0123, p0, colormat); COMPUTE_PIXEL(d.s4567, p1, colormat); COMPUTE_PIXEL(d.s89ab, p2, colormat); COMPUTE_PIXEL(d.scdef, p3, colormat); half16 fsum = d + ((half16)(add, add, add, add)); uchar16 res = convert_uchar16(fsum); vstore16(res, 0, dst); /* Compute left over three pixels */ if ( x == (get_global_size(0)-1)) { src = in + src_offset + (x+1)*(4*PIXEL_SIZE) + y*stride; dst = out + src_offset + (x+1)*(4*PIXEL_SIZE) + y*stride; p = vload16(0,src); p0 = convert_half4(p.s0123); p1 = convert_half4(p.s4567); p2 = convert_half4(p.s89ab); COMPUTE_PIXEL(d.s0123, p0, colormat); COMPUTE_PIXEL(d.s4567, p1, colormat); COMPUTE_PIXEL(d.s89ab, p2, colormat); half8 fsum8 = (half8)(d.s0123,d.s4567) + ((half8)(add, add)); uchar8 res8 = convert_uchar8(fsum8); vstore8(res8, 0, dst); half4 fsum4 = (half4)(d.s89ab) + ((half4)(add)); uchar4 res4 = convert_uchar4(fsum4); vstore4(res4, 0, dst+8); } } __kernel void colormatrix4x4_U8_4_to_U8( __global uchar* in, __global uchar* out, const int src_offset, const int in_stride, const int dst_offset, const int out_stride, const half16 colormat, const half4 add ) { int x = get_global_id(0); int y = get_global_id(1); __global uchar * src = in + src_offset + x*(4*PIXEL_SIZE) + y*in_stride; /* U8_4 */ __global uchar * dst = out + dst_offset + x*(4) + y*out_stride; /* U8 */ /* do 4 pixels at a time */ uchar16 p = vload16(0,src); half4 p0 = convert_half4(p.s0123); half4 p1 = convert_half4(p.s4567); half4 p2 = convert_half4(p.s89ab); half4 p3 = convert_half4(p.scdef); half16 d; COMPUTE_PIXEL(d.s0123, p0, colormat); COMPUTE_PIXEL(d.s4567, p1, colormat); COMPUTE_PIXEL(d.s89ab, p2, colormat); COMPUTE_PIXEL(d.scdef, p3, colormat); half16 fsum = d + ((half16)(add, add, add, add)); uchar4 res = convert_uchar4(fsum.s048c); vstore4(res, 0, dst); } __kernel void Grey_U8_4_to_U8( __global uchar* in, __global uchar* out, const int in_stride, const int out_stride, const half4 colormat ) { int x = get_global_id(0); int y = get_global_id(1); __global uchar * src = in + x*(4*PIXEL_SIZE) + y*in_stride; /* U8_4 */ __global uchar * dst = out + x*(4) + y*out_stride; /* U8 */ /* do 4 pixels at a time */ uchar16 p = vload16(0,src); half16 p0123 = convert_half16(p); half4 fsum = colormat.s0 * p0123.s048c; fsum += colormat.s1 * p0123.s159d; fsum += colormat.s2 * p0123.s26ae; uchar4 res = convert_uchar4(fsum); vstore4(res, 0, dst); } /* No add and no clipping */ __kernel void colormatrix4x4_U8_4_to_U8_4_simple( __global uchar* in, __global uchar* out, const int in_stride, const int out_stride, const half16 colormat ) { int x = get_global_id(0); int y = get_global_id(1); __global uchar * src = in + x*(4*PIXEL_SIZE) + y*in_stride; __global uchar * dst = out + x*(4*PIXEL_SIZE) + y*out_stride; /* do 4 pixels at a time */ uchar16 p = vload16(0,src); half4 p0 = convert_half4(p.s0123); half4 p1 = convert_half4(p.s4567); half4 p2 = convert_half4(p.s89ab); half4 p3 = convert_half4(p.scdef); half16 d; COMPUTE_PIXEL(d.s0123, p0, colormat); COMPUTE_PIXEL(d.s4567, p1, colormat); COMPUTE_PIXEL(d.s89ab, p2, colormat); COMPUTE_PIXEL(d.scdef, p3, colormat); uchar16 res = convert_uchar16(d); vstore16(res, 0, dst); } /* No add and no clipping */ __kernel void colormatrix4x4_U8_4_to_U8_simple( __global uchar* in, __global uchar* out, const int in_stride, const int out_stride, const half16 colormat ) { int x = get_global_id(0); int y = get_global_id(1); __global uchar * src = in + x*(4*PIXEL_SIZE) + y*in_stride; /* U8_4 */ __global uchar * dst = out + x*(4) + y*out_stride; /* U8 */ /* do 4 pixels at a time */ uchar16 p = vload16(0,src); half4 p0 = convert_half4(p.s0123); half4 p1 = convert_half4(p.s4567); half4 p2 = convert_half4(p.s89ab); half4 p3 = convert_half4(p.scdef); half16 d; COMPUTE_PIXEL(d.s0123, p0, colormat); COMPUTE_PIXEL(d.s4567, p1, colormat); COMPUTE_PIXEL(d.s89ab, p2, colormat); COMPUTE_PIXEL(d.scdef, p3, colormat); uchar4 res = convert_uchar4(d.s048c); vstore4(res, 0, dst); } /* * Using integer coefficients * https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.601-7-201103-I!!PDF-E.pdf page 7 */ __kernel void Grey_U8_4_to_U8_int( __global uchar* in, __global uchar* out, const int in_stride, const int out_stride ) { int x = get_global_id(0); int y = get_global_id(1); __global uchar * src = in + x*(4*PIXEL_SIZE) + y*in_stride; /* U8_4 */ __global uchar * dst = out + x*(4) + y*out_stride; /* U8 */ /* do 4 pixels at a time */ uchar16 p = vload16(0,src); int16 p0123 = convert_int16(p); int4 fsum = (int) 77 * p0123.s048c; fsum += (int) 150 * p0123.s159d; fsum += (int) 29 * p0123.s26ae; fsum = fsum >> 8; uchar4 res = convert_uchar4(fsum); vstore4(res, 0, dst); } ? ? ? ?/* Copyright: * ---------------------------------------------------------------------------- * This confidential and proprietary software may be used only as authorized * by a licensing agreement from ARM Limited. * (C) COPYRIGHT 2013 ARM Limited, ALL RIGHTS RESERVED * The entire notice above must be reproduced on all authorized copies and * copies may only be made to the extent permitted by a licensing agreement * from ARM Limited. * ---------------------------------------------------------------------------- */ /* Kernel to handle 4 horizontally adjacent pixels - clips to y access only, assumes x-1 & xmax+1 are accessible */ __kernel void conv33_block( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const float16 weights, const int stride ) { const size_t pixel_size = 4; float16 sum; float16 l, m, r; const int2 pos = (int2)( get_global_offset(0) + 4 * ( get_global_id(0) - get_global_offset(0) ), get_global_id(1) ); const int col2_offset = pos.x * pixel_size; const int col1_offset = col2_offset - pixel_size; /* Safe - we're never called with pos.x < 1 */ /* No need for col3_offset - it's within the block we're computing */ /* Row offsets incorporate offset to col1_offset (one pixel left of target block) */ const int row2_offset = col1_offset + pos.y * stride; const int row1_offset = max( row2_offset - stride, col1_offset ); const int row3_offset = col1_offset + ( min( (uint)(pos.y + 1), (uint)(height - 1) ) * stride ); l = convert_float16( vload16( 0, input_data + row1_offset ) ); r = convert_float16( vload16( 0, input_data + row1_offset + 2 * pixel_size ) ); m = (float16)(l.s456789ab, r.s456789ab); sum = weights.s0 * l; sum += weights.s1 * m; sum += weights.s2 * r; l = convert_float16( vload16( 0, input_data + row2_offset ) ); r = convert_float16( vload16( 0, input_data + row2_offset + 2 * pixel_size ) ); m = (float16)( l.s456789ab, r.s456789ab ); sum += weights.s3 * l; sum += weights.s4 * m; sum += weights.s5 * r; l = convert_float16( vload16( 0, input_data + row3_offset ) ); r = convert_float16( vload16( 0, input_data + row3_offset + 2 * pixel_size ) ); m = (float16)( l.s456789ab, r.s456789ab ); sum += weights.s6 * l; sum += weights.s7 * m; sum += weights.s8 * r; /* Must add pixel_size to row2_offset to point to col2 - first target pixel of block */ vstore16( convert_uchar16( clamp( sum, 0.f, 255.f ) ), 0, output_data + row2_offset + pixel_size ); } /* Kernel to handle left and right sections either side of middle section handled by conv33. Clips in x & y. */ __kernel void conv33( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const float16 weights, /* convolution kernel */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_id(0), get_global_id(1) ); const size_t col2_offset = pos.x * pixel_size; const size_t col1_offset = max( (int)col2_offset - (int)pixel_size, (int)0 ); const size_t col3_offset = min( (uint)(pos.x + 1), (uint)(width - 1) ) * pixel_size; const size_t row2_offset = pos.y * stride; const size_t row1_offset = max( (int)row2_offset - stride, (int)0 ); const size_t row3_offset = min( (uint)pos.y + 1, (uint)height - 1 ) * stride; float4 sum = weights.s0 * convert_float4( vload4( 0, input_data + row1_offset + col1_offset ) ) + weights.s1 * convert_float4( vload4( 0, input_data + row1_offset + col2_offset ) ) + weights.s2 * convert_float4( vload4( 0, input_data + row1_offset + col3_offset ) ) + weights.s3 * convert_float4( vload4( 0, input_data + row2_offset + col1_offset ) ) + weights.s4 * convert_float4( vload4( 0, input_data + row2_offset + col2_offset ) ) + weights.s5 * convert_float4( vload4( 0, input_data + row2_offset + col3_offset ) ) + weights.s6 * convert_float4( vload4( 0, input_data + row3_offset + col1_offset ) ) + weights.s7 * convert_float4( vload4( 0, input_data + row3_offset + col2_offset ) ) + weights.s8 * convert_float4( vload4( 0, input_data + row3_offset + col3_offset ) ); vstore4( convert_uchar4( clamp( sum, 0.f, 255.f ) ), 0, output_data + row2_offset + col2_offset ); } __kernel void conv33_sharpen_block( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_offset(0) + 4 * ( get_global_id(0) - get_global_offset(0) ), get_global_id(1) ); const int col2_offset = pos.x * pixel_size; const int col1_offset = col2_offset - pixel_size; /* Safe - we're never called with pos.x < 1 */ /* No need for col3_offset - it's within the block we're computing */ /* Row offsets incorporate offset to col1 (one pixel left of target block) */ const int row2_offset = col1_offset + pos.y * stride; const int row1_offset = max( row2_offset - stride, col1_offset ); const int row3_offset = col1_offset + ( min( (uint)(pos.y + 1), (uint)(height - 1) ) * stride ); /* Read data for centre row at left, middle and right offsets, synthesizing data already read */ short16 l = convert_short16( vload16( 0, input_data + row2_offset ) ); short16 r = (short16)(l.hi, convert_short8( vload8( 0, input_data + row2_offset + 4 * pixel_size ) ) ); short16 m = (short16)(l.s456789ab, r.s456789ab); short16 sum = (short)5 * m - l - r - convert_short16( vload16( 0, input_data + row1_offset + pixel_size ) ) - convert_short16( vload16( 0, input_data + row3_offset + pixel_size ) ); vstore16( convert_uchar16( clamp( sum, (short)0, (short)255 ) ), 0, output_data + row2_offset + pixel_size ); } /* Kernel for Sharpen: 0 |-1 | 0 -1| 5 |-1 0 |-1 | 0 */ __kernel void conv33_sharpen( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_id(0), get_global_id(1) ); const size_t col2_offset = pos.x * pixel_size; const size_t col1_offset = max( (int)col2_offset - (int)pixel_size, (int)0 ); const size_t col3_offset = min( (uint)(pos.x + 1), (uint)(width - 1) ) * pixel_size; const size_t row2_offset = pos.y * stride; const size_t row1_offset = max( (int)row2_offset - stride, (int)0 ); const size_t row3_offset = min( (uint)pos.y + 1, (uint)height - 1 ) * stride; short4 sum = ((short)5) * convert_short4(vload4( 0, input_data + row2_offset + col2_offset )) - convert_short4(vload4( 0, input_data + row1_offset + col2_offset )) - convert_short4(vload4( 0, input_data + row2_offset + col1_offset )) - convert_short4(vload4( 0, input_data + row2_offset + col3_offset )) - convert_short4(vload4( 0, input_data + row3_offset + col2_offset )) ; vstore4( convert_uchar4( clamp( sum, (short)0, (short)255 ) ), 0, output_data + row2_offset + col2_offset ); } /* Kernel for Sobel top: 1 | 2 | 1 0 | 0 | 0 -1|-2 |-1 */ __kernel void conv33_sobel_top( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_id(0), get_global_id(1) ); const size_t col2_offset = pos.x * pixel_size; const size_t col1_offset = max( (int)col2_offset - (int)pixel_size, (int)0 ); const size_t col3_offset = min( (uint)(pos.x + 1), (uint)(width - 1) ) * pixel_size; const size_t row2_offset = pos.y * stride; const size_t row1_offset = max( (int)row2_offset - stride, (int)0 ); const size_t row3_offset = min( (uint)pos.y + 1, (uint)height - 1 ) * stride; short4 sum = convert_short4( vload4( 0, input_data + row1_offset + col1_offset ) ) + (short)2 * convert_short4( vload4( 0, input_data + row1_offset + col2_offset ) ) + convert_short4( vload4( 0, input_data + row1_offset + col3_offset ) ) - convert_short4( vload4( 0, input_data + row3_offset + col1_offset ) ) - (short)2 * convert_short4( vload4( 0, input_data + row3_offset + col2_offset ) ) - convert_short4( vload4( 0, input_data + row3_offset + col3_offset ) ); vstore4( convert_uchar4( clamp( sum, (short)0, (short)255 ) ), 0, output_data + row2_offset + col2_offset ); } /* Kernel for Sobel bottom: -1|-2 |-1 0 | 0 | 0 1 | 2 | 1 */ __kernel void conv33_sobel_bottom( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_id(0), get_global_id(1) ); const size_t col2_offset = pos.x * pixel_size; const size_t col1_offset = max( (int)col2_offset - (int)pixel_size, (int)0 ); const size_t col3_offset = min( (uint)(pos.x + 1), (uint)(width - 1) ) * pixel_size; const size_t row2_offset = pos.y * stride; const size_t row1_offset = max( (int)row2_offset - stride, (int)0 ); const size_t row3_offset = min( (uint)pos.y + 1, (uint)height - 1 ) * stride; short4 sum = convert_short4( vload4( 0, input_data + row3_offset + col1_offset ) ) + (short)2 * convert_short4( vload4( 0, input_data + row3_offset + col2_offset ) ) + convert_short4( vload4( 0, input_data + row3_offset + col3_offset ) ) - convert_short4( vload4( 0, input_data + row1_offset + col1_offset ) ) - (short)2 * convert_short4( vload4( 0, input_data + row1_offset + col2_offset ) ) - convert_short4( vload4( 0, input_data + row1_offset + col3_offset ) ); vstore4( convert_uchar4( clamp( sum, (short)0, (short)255 ) ), 0, output_data + row2_offset + col2_offset ); } /* Kernel for Sobel left: 1 | 0 |-1 2 | 0 |-2 1 | 0 |-1 */ __kernel void conv33_sobel_left( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_id(0), get_global_id(1) ); const size_t col2_offset = pos.x * pixel_size; const size_t col1_offset = max( (int)col2_offset - (int)pixel_size, (int)0 ); const size_t col3_offset = min( (uint)(pos.x + 1), (uint)(width - 1) ) * pixel_size; const size_t row2_offset = pos.y * stride; const size_t row1_offset = max( (int)row2_offset - stride, (int)0 ); const size_t row3_offset = min( (uint)pos.y + 1, (uint)height - 1 ) * stride; short4 sum = convert_short4( vload4( 0, input_data + row1_offset + col1_offset ) ) - convert_short4( vload4( 0, input_data + row1_offset + col3_offset ) ) + (short)2 * convert_short4( vload4( 0, input_data + row2_offset + col1_offset ) ) - (short)2 * convert_short4( vload4( 0, input_data + row2_offset + col3_offset ) ) + convert_short4( vload4( 0, input_data + row3_offset + col1_offset ) ) - convert_short4( vload4( 0, input_data + row3_offset + col3_offset ) ); vstore4( convert_uchar4( clamp( sum, (short)0, (short)255 ) ), 0, output_data + row2_offset + col2_offset ); } /* Kernel for Sobel right: -1| 0 | 1 -2| 0 | 2 -1| 0 | 1 */ __kernel void conv33_sobel_right( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_id(0), get_global_id(1) ); const size_t col2_offset = pos.x * pixel_size; const size_t col1_offset = max( (int)col2_offset - (int)pixel_size, (int)0 ); const size_t col3_offset = min( (uint)(pos.x + 1), (uint)(width - 1) ) * pixel_size; const size_t row2_offset = pos.y * stride; const size_t row1_offset = max( (int)row2_offset - stride, (int)0 ); const size_t row3_offset = min( (uint)pos.y + 1, (uint)height - 1 ) * stride; short4 sum = convert_short4( vload4( 0, input_data + row1_offset + col3_offset ) ) - convert_short4( vload4( 0, input_data + row1_offset + col1_offset ) ) + (short)2 * convert_short4( vload4( 0, input_data + row2_offset + col3_offset ) ) - (short)2 * convert_short4( vload4( 0, input_data + row2_offset + col1_offset ) ) + convert_short4( vload4( 0, input_data + row3_offset + col3_offset ) ) - convert_short4( vload4( 0, input_data + row3_offset + col1_offset ) ); vstore4( convert_uchar4( clamp( sum, (short)0, (short)255 ) ), 0, output_data + row2_offset + col2_offset ); } /* Kernel for Identity: 0 | 0 | 0 0 | 1 | 0 0 | 0 | 0 */ __kernel void conv33_identity( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_id(0), get_global_id(1) ); const size_t col2_offset = pos.x * pixel_size; const size_t row2_offset = pos.y * stride; vstore4( vload4( 0, input_data + row2_offset + col2_offset ) , 0, output_data + row2_offset + col2_offset ); } /* Kernel for Emboss: -2|-1 | 0 -1| 1 | 1 0| 1 | 2 */ __kernel void conv33_emboss( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_id(0), get_global_id(1) ); const size_t col2_offset = pos.x * pixel_size; const size_t col1_offset = max( (int)col2_offset - (int)pixel_size, (int)0 ); const size_t col3_offset = min( (uint)(pos.x + 1), (uint)(width - 1) ) * pixel_size; const size_t row2_offset = pos.y * stride; const size_t row1_offset = max( (int)row2_offset - stride, (int)0 ); const size_t row3_offset = min( (uint)pos.y + 1, (uint)height - 1 ) * stride; short4 sum = convert_short4( vload4( 0, input_data + row2_offset + col2_offset ) ) + convert_short4( vload4( 0, input_data + row2_offset + col3_offset ) ) + convert_short4( vload4( 0, input_data + row3_offset + col2_offset ) ) + (short)2 * convert_short4( vload4( 0, input_data + row3_offset + col3_offset ) ) - (short)2 * convert_short4( vload4( 0, input_data + row1_offset + col1_offset ) ) - convert_short4( vload4( 0, input_data + row1_offset + col2_offset ) ) - convert_short4( vload4( 0, input_data + row2_offset + col1_offset ) ); vstore4( convert_uchar4( clamp( sum, (short)0, (short)255 ) ), 0, output_data + row2_offset + col2_offset ); } /* Kernel for Outline: -1 |-1 |-1 -1 | 8 |-1 -1 |-1 |-1 */ __kernel void conv33_outline( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const float16 weights, /* convolution kernel */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_id(0), get_global_id(1) ); const size_t col2_offset = pos.x * pixel_size; const size_t col1_offset = max( (int)col2_offset - (int)pixel_size, (int)0 ); const size_t col3_offset = min( (uint)(pos.x + 1), (uint)(width - 1) ) * pixel_size; const size_t row2_offset = pos.y * stride; const size_t row1_offset = max( (int)row2_offset - stride, (int)0 ); const size_t row3_offset = min( (uint)pos.y + 1, (uint)height - 1 ) * stride; short4 sum = (short)8 * convert_short4( vload4( 0, input_data + row2_offset + col2_offset ) ) - convert_short4( vload4( 0, input_data + row1_offset + col1_offset ) ) - convert_short4( vload4( 0, input_data + row1_offset + col2_offset ) ) - convert_short4( vload4( 0, input_data + row1_offset + col3_offset ) ) - convert_short4( vload4( 0, input_data + row2_offset + col1_offset ) ) - convert_short4( vload4( 0, input_data + row2_offset + col3_offset ) ) - convert_short4( vload4( 0, input_data + row3_offset + col1_offset ) ) - convert_short4( vload4( 0, input_data + row3_offset + col2_offset ) ) - convert_short4( vload4( 0, input_data + row3_offset + col3_offset ) ); vstore4( convert_uchar4( clamp( sum, (short)0, (short)255 ) ), 0, output_data + row2_offset + col2_offset ); } /* Copyright: * ---------------------------------------------------------------------------- * This confidential and proprietary software may be used only as authorized * by a licensing agreement from ARM Limited. * (C) COPYRIGHT 2013 ARM Limited, ALL RIGHTS RESERVED * The entire notice above must be reproduced on all authorized copies and * copies may only be made to the extent permitted by a licensing agreement * from ARM Limited. * ---------------------------------------------------------------------------- */ __kernel void conv_55( __global const uchar * restrict input_data, /* Pointer to the inputbuffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ __global const float * weights, const int stride ) { float16 sum = 0; float16 l0, l1, l2, l3, l4; const int2 pos = (int2)(get_global_id(0)*16, get_global_id(1)); int offset = (pos.y - 2 )*stride + pos.x - 8; l0 = convert_float16(vload16( 0, input_data + offset)); l4 = convert_float16(vload16( 0, input_data + offset + 16)); l2 = (float16)(l0.hi, l4.lo); l1 = (float16)(l0.s456789ab, l2.s456789ab); l3 = (float16)(l2.s456789ab, l4.s456789ab); float16 we = vload16(0, weights); sum += we.s0 * l0 ; sum += we.s1 * l1 ; sum += we.s2 * l2 ; sum += we.s3 * l3 ; sum += we.s4 * l4 ; offset += stride; l0 = convert_float16(vload16( 0, input_data + offset)); l4 = convert_float16(vload16( 0, input_data + offset + 16)); l2 = (float16)(l0.hi, l4.lo); l1 = (float16)(l0.s456789ab, l2.s456789ab); l3 = (float16)(l2.s456789ab, l4.s456789ab); sum += we.s5 * l0 ; sum += we.s6 * l1 ; sum += we.s7 * l2 ; sum += we.s8 * l3 ; sum += we.s9 * l4 ; offset += stride; l0 = convert_float16(vload16( 0, input_data + offset)); l4 = convert_float16(vload16( 0, input_data + offset + 16)); l2 = (float16)(l0.hi, l4.lo); l1 = (float16)(l0.s456789ab, l2.s456789ab); l3 = (float16)(l2.s456789ab, l4.s456789ab); sum += we.sa * l0 ; sum += we.sb * l1 ; sum += we.sc * l2 ; sum += we.sd * l3 ; sum += we.se * l4 ; offset += stride; l0 = convert_float16(vload16( 0, input_data + offset)); l4 = convert_float16(vload16( 0, input_data + offset + 16)); l2 = (float16)(l0.hi, l4.lo); l1 = (float16)(l0.s456789ab, l2.s456789ab); l3 = (float16)(l2.s456789ab, l4.s456789ab); we = vload16(0, weights + 15); sum += we.s0 * l0 ; sum += we.s1 * l1 ; sum += we.s2 * l2 ; sum += we.s3 * l3 ; sum += we.s4 * l4 ; offset += stride; l0 = convert_float16(vload16( 0, input_data + offset)); l4 = convert_float16(vload16( 0, input_data + offset + 16)); l2 = (float16)(l0.hi, l4.lo); l1 = (float16)(l0.s456789ab, l2.s456789ab); l3 = (float16)(l2.s456789ab, l4.s456789ab); sum += we.s5 * l0 ; sum += we.s6 * l1 ; sum += we.s7 * l2 ; sum += we.s8 * l3 ; sum += we.s9 * l4 ; vstore16(convert_uchar16(clamp(sum, 0.f, 255.f)), 0, output_data + pos.y * stride + pos.x); } /* * Copyright: * ---------------------------------------------------------------------------- * This confidential and proprietary software may be used only as authorized * by a licensing agreement from ARM Limited. * (C) COPYRIGHT 2017 ARM Limited, ALL RIGHTS RESERVED * The entire notice above must be reproduced on all authorized copies and * copies may only be made to the extent permitted by a licensing agreement * from ARM Limited. * ---------------------------------------------------------------------------- */ #pragma OPENCL EXTENSION cl_arm_core_id : enable #define MAX_NUM_HISTOGRAM_BUFS 16 /* Ideally this should be a power of 2. This must match kMaxNumHistogramBufs */ #define BYTES_PER_FULL_KERNEL 64 /* 32, 64 or 128 : This must match kBytesPerFullKernel */ #define VATOMIC_INC4(histogram, vals4) do { \ atomic_inc(histogram + (vals4.s0)); \ atomic_inc(histogram + (vals4.s1)); \ atomic_inc(histogram + (vals4.s2)); \ atomic_inc(histogram + (vals4.s3)); \ } while (0) #define VATOMIC_INC8(histogram, vals8) do { \ uint4 vals8_lo = convert_uint4((vals8).lo); \ VATOMIC_INC4((histogram), (vals8_lo)); \ uint4 vals8_hi = convert_uint4((vals8).hi); \ VATOMIC_INC4((histogram), (vals8_hi)); \ } while (0) #define VATOMIC_INC16(histogram, vals16) do { \ ushort8 vals16_lo = convert_ushort8((vals16).lo); \ VATOMIC_INC8((histogram), (vals16_lo)); \ ushort8 vals16_hi = convert_ushort8((vals16).hi); \ VATOMIC_INC8((histogram), (vals16_hi)); \ } while (0) __kernel void histogram_uchar_full( __global const uchar * restrict input_image, /* Pointer to the input-buffer */ __global uint * restrict histogram, /* Pointer to the histogram buffer */ uint stride /* stride/16 */ ) { histogram += ((arm_get_core_id() % MAX_NUM_HISTOGRAM_BUFS) * 256); // The offset is scaled by 16 #if BYTES_PER_FULL_KERNEL == 32 const uint2 offset = (uint2)((get_global_id(1) * stride) + (get_global_id(0) * BYTES_PER_FULL_KERNEL/16)) + (uint2)(0,1); #elif BYTES_PER_FULL_KERNEL == 64 const uint4 offset = (uint4)((get_global_id(1) * stride) + (get_global_id(0) * BYTES_PER_FULL_KERNEL/16)) + (uint4)(0,1,2,3); #elif BYTES_PER_FULL_KERNEL == 128 const uint8 offset = (uint8)((get_global_id(1) * stride) + (get_global_id(0) * BYTES_PER_FULL_KERNEL/16)) + (uint8)(0,1,2,3,4,5,6,7); #else #error "Invalid BYTES_PER_FULL_KERNEL value" #endif const uchar16 vals0 = vload16(offset.s0, input_image); const uchar16 vals1 = vload16(offset.s1, input_image); #if BYTES_PER_FULL_KERNEL >= 64 const uchar16 vals2 = vload16(offset.s2, input_image); const uchar16 vals3 = vload16(offset.s3, input_image); #endif #if BYTES_PER_FULL_KERNEL >= 128 const uchar16 vals4 = vload16(offset.s4, input_image); const uchar16 vals5 = vload16(offset.s5, input_image); const uchar16 vals6 = vload16(offset.s6, input_image); const uchar16 vals7 = vload16(offset.s7, input_image); #endif VATOMIC_INC16(histogram, vals0); VATOMIC_INC16(histogram, vals1); #if BYTES_PER_FULL_KERNEL >= 64 VATOMIC_INC16(histogram, vals2); VATOMIC_INC16(histogram, vals3); #endif #if BYTES_PER_FULL_KERNEL >= 128 VATOMIC_INC16(histogram, vals4); VATOMIC_INC16(histogram, vals5); VATOMIC_INC16(histogram, vals6); VATOMIC_INC16(histogram, vals7); #endif } __kernel void histogram_uchar_n( __global const uchar * restrict input_image, /* Pointer to the input buffer */ __global uint * restrict histogram, /* Pointer to the histogram buffer */ const uint stride, /* stride in bytes */ const uint start_offset, /* starting offset in bytes */ const uchar4 n /* number of uchar's to process in multiples - s3(16):s2(8):s1(4):s0(1) */ ) { histogram += ((arm_get_core_id() % MAX_NUM_HISTOGRAM_BUFS) * 256); input_image += (get_global_id(1) * stride) + start_offset; uchar i = n.s3; while (i-- > 0) { const uchar16 vals = vload16(0, input_image); VATOMIC_INC16(histogram, vals); input_image+=16; } i = n.s2; while (i-- > 0) { const uchar8 vals = vload8(0, input_image); VATOMIC_INC8(histogram, vals); input_image+=8; } i = n.s1; while (i-- > 0) { const uchar4 vals = vload4(0, input_image); VATOMIC_INC4(histogram, vals); input_image+=4; } i = n.s0; while (i-- > 0) { atomic_inc(histogram + *input_image++); } } #undef VATOMIC_INC4 #define VATOMIC_INC4(histogram, vals4) do { \ atomic_inc(histogram + ((vals4.s0) * 4)); \ atomic_inc(histogram + ((vals4.s1) * 4) + 1); \ atomic_inc(histogram + ((vals4.s2) * 4) + 2); \ atomic_inc(histogram + ((vals4.s3) * 4) + 3); \ } while (0) __kernel void histogram_uchar4_full( __global const uchar * restrict input_image, /* Pointer to the input buffer */ __global uint * restrict histogram, /* Pointer to the histogram buffer */ uint stride /* stride/16 */ ) { histogram += ((arm_get_core_id() % MAX_NUM_HISTOGRAM_BUFS) * 256 * 4); // The offset is scaled by 16 #if BYTES_PER_FULL_KERNEL == 32 const uint2 offset = (uint2)((get_global_id(1) * stride) + (get_global_id(0) * BYTES_PER_FULL_KERNEL/16)) + (uint2)(0,1); #elif BYTES_PER_FULL_KERNEL == 64 const uint4 offset = (uint4)((get_global_id(1) * stride) + (get_global_id(0) * BYTES_PER_FULL_KERNEL/16)) + (uint4)(0,1,2,3); #elif BYTES_PER_FULL_KERNEL == 128 const uint8 offset = (uint8)((get_global_id(1) * stride) + (get_global_id(0) * BYTES_PER_FULL_KERNEL/16)) + (uint8)(0,1,2,3,4,5,6,7); #else #error "Invalid BYTES_PER_FULL_KERNEL value" #endif const uchar16 vals0 = vload16(offset.s0, input_image); const uchar16 vals1 = vload16(offset.s1, input_image); #if BYTES_PER_FULL_KERNEL >= 64 const uchar16 vals2 = vload16(offset.s2, input_image); const uchar16 vals3 = vload16(offset.s3, input_image); #endif #if BYTES_PER_FULL_KERNEL >= 128 const uchar16 vals4 = vload16(offset.s4, input_image); const uchar16 vals5 = vload16(offset.s5, input_image); const uchar16 vals6 = vload16(offset.s6, input_image); const uchar16 vals7 = vload16(offset.s7, input_image); #endif VATOMIC_INC16(histogram, vals0); VATOMIC_INC16(histogram, vals1); #if BYTES_PER_FULL_KERNEL >= 64 VATOMIC_INC16(histogram, vals2); VATOMIC_INC16(histogram, vals3); #endif #if BYTES_PER_FULL_KERNEL >= 128 VATOMIC_INC16(histogram, vals4); VATOMIC_INC16(histogram, vals5); VATOMIC_INC16(histogram, vals6); VATOMIC_INC16(histogram, vals7); #endif } __kernel void histogram_uchar4_n( __global const uchar * restrict input_image, /* Pointer to the input buffer */ __global uint * restrict histogram, /* Pointer to the histogram buffer */ const uint stride, /* stride in bytes */ const uint start_offset, /* starting offset in bytes */ const uchar4 n /* number of uchar4's to process in multiples - s3(4):s2(2):s1(1) */ ) { histogram += ((arm_get_core_id() % MAX_NUM_HISTOGRAM_BUFS) * 256 * 4); input_image += (get_global_id(1) * stride) + start_offset; uchar i = n.s3; while (i-- > 0) { const uchar16 vals = vload16(0, input_image); VATOMIC_INC16(histogram, vals); input_image+=16; } i = n.s2; while (i-- > 0) { const uchar8 vals = vload8(0, input_image); VATOMIC_INC8(histogram, vals); input_image+=8; } i = n.s1; while (i-- > 0) { const uchar4 vals = vload4(0, input_image); VATOMIC_INC4(histogram, vals); input_image+=4; } } /* Copyright: * ---------------------------------------------------------------------------- * This confidential and proprietary software may be used only as authorized * by a licensing agreement from ARM Limited. * (C) COPYRIGHT 2013 ARM Limited, ALL RIGHTS RESERVED * The entire notice above must be reproduced on all authorized copies and * copies may only be made to the extent permitted by a licensing agreement * from ARM Limited. * ---------------------------------------------------------------------------- */ __kernel void lut( uint stride, /* Stride of the in/out allocation in uchar4. */ __global const uchar * restrict input_data, /* Pointer to the inputbuffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ __global const uchar * restrict luttables /*Look up table of values 256 for each in RGBA order */ ) { const int2 pos = (int2)(get_global_id(0)*8, get_global_id(1)); const int offset = pos.y * stride * 4 + pos.x; const uchar8 in = vload8(0, input_data + offset); uchar8 out; out.s0 = luttables[in.s0]; out.s1 = luttables[in.s1 + 256]; out.s2 = luttables[in.s2 + 512]; out.s3 = luttables[in.s3 + 768]; out.s4 = luttables[in.s4]; out.s5 = luttables[in.s5 + 256]; out.s6 = luttables[in.s6 + 512]; out.s7 = luttables[in.s7 + 768]; vstore8(out, 0, output_data + offset); } c 4 3
p e^ h Q 2% E
GCC: (GNU) 4.9.x 20150123 (prerelease) Android clang version 5.0.300080 (based on LLVM 5.0.300080)
.shstrtab .note.android.ident .note.gnu.build-id .dynsym .dynstr .gnu.hash .gnu.version .gnu.version_d .gnu.version_r .rel.dyn .rel.plt .text .ARM.exidx .rodata .ARM.extab .fini_array .data.rel.ro .dynamic .got .data .bss .comment .note.gnu.gold-version .ARM.attributes
RenderScript rrsdGpuScriptIntrinsic::scaleworksize: Worksize and/or Workoffset not divisible by scaling factor Unable to read kernel execution status, error code: %d GPU kernel returned error code: %d Code -59 possibly due to out-of-bound memory access. rsdGpuScriptIntrinsicBuffer: failed to create buffer, error code: %d rsdGpuScriptIntrinsicBuffer: failed to map buffer, error code: %d rsdGpuScriptIntrinsicBuffer: failed to unmap buffer, error code: %d Intrinsic 3dLUT: cannot build program: %d Intrinsic 3dLUT: cannot create kernel %s: %d blend_clear blend_src blend_src_over blend_dst_over blend_src_in blend_dst_in blend_src_out blend_dst_out blend_src_atop blend_dst_atop blend_xor blend_multiply blend_add blend_subtract Intrinsic Blend: cannot build program: %d Intrinsic Blend: cannot create kernel %s: %d Intrinsic Blur: error %d creating input image X(%zu) Y(%zu) P(%zu) Intrinsic Blur: error %d creating temp image X(%zu) Y(%zu) P(%zu) Intrinsic Blur: error %d creating output image X(%zu) Y(%zu) P(%zu) gauss_blur_1d_vert gauss_blur_1d_horz Intrinsic Blur: cannot create kernel %s: %d Intrinsic Blur: error creating/mapping weights buffer %d gpuRunHelper: Can't SetKernelArg with mInStride: %d %d gpuRunHelper: Can't SetKernelArg with hp: %d gpuRunHelper: Can't SetKernelArg with mOutStride: %d %d colormatrix4x4_U8_4_to_U8_4 colormatrix4x4_U8_4_to_U8_4_1left colormatrix4x4_U8_4_to_U8_4_2left colormatrix4x4_U8_4_to_U8_4_3left Grey_U8_4_to_U8 Grey_U8_4_to_U8_int colormatrix4x4_U8_4_to_U8_4_simple colormatrix4x4_U8_4_to_U8_simple Intrinsic Colormatrix: cannot build program: %d Intrinsic ColorMatrix: cannot create kernel %s: %d conv33 conv33_block conv33_sobel_top conv33_sobel_bottom conv33_sobel_left conv33_sobel_right conv33_sharpen conv33_sharpen_block Intrinsic Convolve3x3: cannot build program: %d Intrinsic C33: failed to allocate kernel list Intrinsic C33: failed to allocate kernel info Intrinsic Convolve3x3: cannot create kernel %s: %d Intrinsic Convolve5x5: cannot build program: %d Intrinsic Convolve5x5: cannot create kernel %s: %d histogram_uchar_full histogram_uchar_n histogram_uchar4_full histogram_uchar4_n Intrinsic Histogram: cannot build program: %d Intrinsic Histogram: cannot create kernel %s: %d vendor/arm/mali/gpu/android/renderscript/android-8.0.0_r1/driver/rsdGpuIntrinsicHistogram.cpp Intrinsic LUT: cannot build program: %d Intrinsic LUT: cannot create kernel %s: %d Failed to load Mali accelerated driver: %s Cannot query platform name size, error code: %d Cannot allocate %zx bytes for platform name. getPlatformInfo, error code: %d ARM Platform Cannot query platform's vendor name size, error code: %d Cannot allocate %zx bytes for platform's vendor name. Cannot query platform info, error code: %d clReleaseContext clReleaseCommandQueue clFlush clFinish clCreateBuffer clEnqueueCopyBuffer clEnqueueMapBuffer clEnqueueUnmapMemObject clRetainMemObject clReleaseMemObject clGetMemObjectInfo clEnqueueReadBuffer clEnqueueWriteBuffer clCreateImage clGetExtensionFunctionAddress CL_GET_ARCH_POINTER_ARM clCreateProgramWithBinary clCreateProgramWithSource clBuildProgram clGetProgramBuildInfo clReleaseProgram clGetProgramInfo clCreateKernel clCreateKernelsInProgram clReleaseKernel clGetKernelInfo clSetKernelArg clEnqueueNDRangeKernel clCreateUserEvent clSetUserEventStatus clSetEventCallback clWaitForEvents clGetEventInfo clRetainEvent clReleaseEvent GPU buffer unmapping failed, reverting to CPU, error code: %d GPU buffer creation during resize failed, reverting to CPU, error code: %d rsdClSetAllocationArgument: failed to set argument %d, error code: %d rsdClSetPtrArgument: failed to set argument %d, error code: %d Failed to create argument buffer %d, error code: %d Failed mapping of buffer argument, error code: %d Failed unmapping of buffer argument, error code: %d debug.rs.dumpbin .arm.in.bc ########### writing %s - size(%u) ########### failed to open %s Error getting program binary sizes, error code: %d Error getting program binaries, error code: %d .arm.out.bc ########### writing %s - size(%zu) [RS-DIAG] Nested forEach not supported on GPU [RS-DIAG] Input allocation type is unsupported on the GPU [RS-DIAG] Output allocation type is unsupported on the GPU Failed to map buffer argument, error code: %d Failed to unmap buffer argument, error code: %d Failed to set inElementStride arg: %d Failed to set inElementSize arg: %d Failed to set outElementStride arg: %d Failed to set outElementSize arg: %d [RS-DIAG] A used allocation is not supported on the GPU [RS-DIAG] Launching GPU kernel : slot(%d) GPU launch failure, error code: %d Attempting CPU fallback Detected failure post kernel execution [RS-DIAG] Launching CPU script : slot(%d) rsAssert failed: %s, in %s at %i vendor/arm/mali/gpu/android/renderscript/android-8.0.0_r1/driver/rsdClBcc.cpp /vendor/lib64/libbccArm.so debug.rs.cpufb property is set to "on", GPU acceleration disabled /vendor/bin/bcc Can't use user-allocated buffers if usage is not USAGE_SCRIPT | USAGE_SHARED or USAGE_SCRIPT | USAGE_SHARED | USAGE_GRAPHICS_TEXTURE User-backed allocation failed stride requirement, falling back to separate allocation !"Size mismatch" vendor/arm/mali/gpu/android/renderscript/android-8.0.0_r1/driver/aosp/rsdAllocation.cpp Attempting to sync allocation from render target, for non-render target allocation src == RS_ALLOCATION_USAGE_SCRIPT || src == RS_ALLOCATION_USAGE_SHARED Non-script allocation copies not yet implemented. Error from %s void rsSetElementAt_char(::rs_allocation, const char *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_char(::rs_allocation, char *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_char2(::rs_allocation, const char2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_char2(::rs_allocation, char2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_char3(::rs_allocation, const char3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_char3(::rs_allocation, char3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_char4(::rs_allocation, const char4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_char4(::rs_allocation, char4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_uchar(::rs_allocation, const uchar *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_uchar(::rs_allocation, uchar *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_uchar2(::rs_allocation, const uchar2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_uchar2(::rs_allocation, uchar2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_uchar3(::rs_allocation, const uchar3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_uchar3(::rs_allocation, uchar3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_uchar4(::rs_allocation, const uchar4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_uchar4(::rs_allocation, uchar4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_short(::rs_allocation, const short *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_short(::rs_allocation, short *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_short2(::rs_allocation, const short2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_short2(::rs_allocation, short2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_short3(::rs_allocation, const short3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_short3(::rs_allocation, short3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_short4(::rs_allocation, const short4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_short4(::rs_allocation, short4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_ushort(::rs_allocation, const ushort *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_ushort(::rs_allocation, ushort *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_ushort2(::rs_allocation, const ushort2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_ushort2(::rs_allocation, ushort2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_ushort3(::rs_allocation, const ushort3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_ushort3(::rs_allocation, ushort3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_ushort4(::rs_allocation, const ushort4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_ushort4(::rs_allocation, ushort4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_int(::rs_allocation, const int *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_int(::rs_allocation, int *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_int2(::rs_allocation, const int2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_int2(::rs_allocation, int2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_int3(::rs_allocation, const int3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_int3(::rs_allocation, int3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_int4(::rs_allocation, const int4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_int4(::rs_allocation, int4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_uint(::rs_allocation, const uint *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_uint(::rs_allocation, uint *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_uint2(::rs_allocation, const uint2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_uint2(::rs_allocation, uint2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_uint3(::rs_allocation, const uint3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_uint3(::rs_allocation, uint3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_uint4(::rs_allocation, const uint4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_uint4(::rs_allocation, uint4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_long(::rs_allocation, const long *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_long(::rs_allocation, long *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_long2(::rs_allocation, const long2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_long2(::rs_allocation, long2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_long3(::rs_allocation, const long3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_long3(::rs_allocation, long3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_long4(::rs_allocation, const long4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_long4(::rs_allocation, long4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_ulong(::rs_allocation, const ulong *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_ulong(::rs_allocation, ulong *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_ulong2(::rs_allocation, const ulong2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_ulong2(::rs_allocation, ulong2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_ulong3(::rs_allocation, const ulong3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_ulong3(::rs_allocation, ulong3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_ulong4(::rs_allocation, const ulong4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_ulong4(::rs_allocation, ulong4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_half(::rs_allocation, const half *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_half(::rs_allocation, half *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_half2(::rs_allocation, const half2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_half2(::rs_allocation, half2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_half3(::rs_allocation, const half3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_half3(::rs_allocation, half3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_half4(::rs_allocation, const half4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_half4(::rs_allocation, half4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_float(::rs_allocation, const float *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_float(::rs_allocation, float *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_float2(::rs_allocation, const float2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_float2(::rs_allocation, float2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_float3(::rs_allocation, const float3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_float3(::rs_allocation, float3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_float4(::rs_allocation, const float4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_float4(::rs_allocation, float4 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_double(::rs_allocation, const double *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_double(::rs_allocation, double *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_double2(::rs_allocation, const double2 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_double2(::rs_allocation, double2 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_double3(::rs_allocation, const double3 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_double3(::rs_allocation, double3 *, uint32_t, uint32_t, uint32_t) void rsSetElementAt_double4(::rs_allocation, const double4 *, uint32_t, uint32_t, uint32_t) void rsGetElementAt_double4(::rs_allocation, double4 *, uint32_t, uint32_t, uint32_t) %s {%f, %f} %s {%f, %f, %f} %s {%f, %f, %f, %f} %s %lld 0x%llx %s {%lld, %lld} 0x%llx 0x%llx %s {%lld, %lld, %lld} 0x%llx 0x%llx 0x%llx %s {%lld, %lld, %lld, %lld} 0x%llx 0x%llx 0x%llx 0x%llx %s %llu 0x%llx %s {%llu, %llu} 0x%llx 0x%llx %s {%llu, %llu, %llu} 0x%llx 0x%llx 0x%llx %s {%llu, %llu, %llu, %llu} 0x%llx 0x%llx 0x%llx 0x%llx rs_type creation error: Invalid element rs_type creation error: Invalid yuv_format %d rs_type creation error: Both X and Y dimension required when Z is present. rs_type creation error: X dimension required when Y is present. rs_type creation error: mipmap control require 2D Types. rs_type creation error: Cube maps require 2D Types. rs_type creation error: YUV only supports basic 2D. Out range ElementAt Y %i of %i Out range ElementAt Z %i of %i Vector size mismatch for ElementAt %i of %i Data type mismatch for ElementAt %i of %i libunwind: %s %s:%d - %s external/libunwind_llvm/src/Unwind-EHABI.cpp unsupported register class during phase1 personality function said it would stop here, but now in phase2 it did not stop here external/libunwind_llvm/src/Registers.hpp unsupported arm register Unknown ARM float register index inlined table detected but pr function requires extra words /* Copyright: * ---------------------------------------------------------------------------- * This confidential and proprietary software may be used only as authorized * by a licensing agreement from ARM Limited. * (C) COPYRIGHT 2013 ARM Limited, ALL RIGHTS RESERVED * The entire notice above must be reproduced on all authorized copies and * copies may only be made to the extent permitted by a licensing agreement * from ARM Limited. * ---------------------------------------------------------------------------- */ __kernel void lut3d( __global const uchar4 * restrict input, /* Pointer to the inputbuffer */ __global uchar4 * restrict output, /* Pointer to the output buffer */ __global const uchar * restrict lut, const int4 dims, /* Dimensions of lut */ const int4 coordMul, const int3 stride ) { const int2 pos = (int2)(get_global_id(0), get_global_id(1)); const int offset = pos.y * stride.x + pos.x; /* bring input position within bounds of LUT */ const uchar4 in = input[offset]; int4 baseCoord = convert_int4(in) * coordMul; int4 coord1 = baseCoord >> (int4) 15; /* get XdYdZd (difference between x and point above and below) */ int4 weight2 = baseCoord & 0x7fff; /* first 15 bits */ int4 weight1 = (int4) 0x8000 - weight2; /* 1 - original */ /* point below needed point in all axis */ const int lut_offset = (coord1.x * 4) + (coord1.y * stride.y) + (coord1.z * stride.z); /* v000 = v000.lo v100 = v000.hi */ uint8 v000 = convert_uint8(vload8(0, lut + lut_offset)); /* v010 = v010.lo v110 = v010.hi */ uint8 v010 = convert_uint8(vload8(0, lut + lut_offset + stride.y)); /* v001 = v001.lo v101 = v001.hi */ uint8 v001 = convert_uint8(vload8(0, lut + lut_offset + stride.z)); /* v011 = v011.lo v111 = v011.hi */ uint8 v011 = convert_uint8(vload8(0, lut + lut_offset + stride.y + stride.z)); /* Interpolate along x */ uint4 yz00 = ((v000.lo * weight1.x) + (v000.hi * weight2.x)) >> (int4)7; uint4 yz10 = ((v010.lo * weight1.x) + (v010.hi * weight2.x)) >> (int4)7; uint4 yz01 = ((v001.lo * weight1.x) + (v001.hi * weight2.x)) >> (int4)7; uint4 yz11 = ((v011.lo * weight1.x) + (v011.hi * weight2.x)) >> (int4)7; /* Interpolate along y */ uint4 z0 = ((yz00 * weight1.y) + (yz10 * weight2.y)) >> (int4)15; uint4 z1 = ((yz01 * weight1.y) + (yz11 * weight2.y)) >> (int4)15; /* Interpolate along z */ uint4 v = ((z0 * weight1.z) + (z1 * weight2.z)) >> (int4)15; uint4 v2 = (v + 0x7f) >> (int4)8; /* part of integer math */ uchar4 ret = convert_uchar4(v2); ret.w = in.w; output[offset] = ret; } /* * Copyright: * ---------------------------------------------------------------------------- * This confidential and proprietary software may be used only as authorized * by a licensing agreement from ARM Limited. * (C) COPYRIGHT 2013 ARM Limited, ALL RIGHTS RESERVED * The entire notice above must be reproduced on all authorized copies and * copies may only be made to the extent permitted by a licensing agreement * from ARM Limited. * ---------------------------------------------------------------------------- */ __kernel void blend_clear( uint width, /* Width of the in/out allocation in uchar4. */ __global const uchar * restrict input_data, /* Pointer to the input-buffer */ __global uchar * restrict output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = row * width * 4 + col; const uchar16 load = 0; vstore16(load, 0, output_data + offset); } __kernel void blend_src( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = row * width * 4 + col; const uchar16 load = vload16(0, input_data + offset); vstore16(load, 0, output_data + offset); } __kernel void blend_src_over( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = (row * width * 4 + col ); /* BLEND_SRC_OVER */ short16 in_s = convert_short16( vload16(0, input_data + offset)); const short16 out_s = convert_short16( vload16(0, output_data + offset)); short16 alpha = (short16)255 - in_s.s33337777BBBBFFFF ; in_s = in_s + ((out_s * alpha) >> (short16)8); vstore16(convert_uchar16(in_s), 0, output_data + offset); } __kernel void blend_dst_over( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = (row * width * 4 + col ); /* BLEND_DST_OVER */ const short16 in_s = convert_short16( vload16(0, input_data + offset)); short16 out_s = convert_short16( vload16(0, output_data + offset)); short16 alpha = (short16) 255 - out_s.s33337777BBBBFFFF; out_s = out_s + ((in_s * alpha )>> (short16)8); vstore16(convert_uchar16(out_s), 0, output_data + offset); } __kernel void blend_src_in( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = (row * width * 4 + col ); /* BLEND_SRC_IN */ short16 in_s = convert_short16(vload16(0, input_data + offset)); const short16 out_s = convert_short16(vload16(0,output_data + offset)); in_s = (in_s * out_s.s33337777BBBBFFFF) >> (short16)8; vstore16(convert_uchar16(in_s), 0, output_data + offset); } __kernel void blend_dst_in( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = (row * width * 4 + col ); /* BLEND_DST_IN */ short16 out_s = convert_short16(vload16(0, output_data + offset)); const short16 in_s = convert_short16(vload16(0, input_data + offset)); out_s = (out_s * in_s.s33337777BBBBFFFF) >> (short16)8; vstore16(convert_uchar16(out_s), 0, output_data + offset); } __kernel void blend_src_out( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = (row * width * 4 + col ); /* BLEND_SRC_OUT */ short16 in_s = convert_short16(vload16(0, input_data + offset)); const short16 out_s = convert_short16(vload16(0, output_data + offset)); in_s = (in_s * ((short16)255 - out_s.s33337777BBBBFFFF)) >> (short16)8; vstore16(convert_uchar16(in_s), 0, output_data + offset); } __kernel void blend_dst_out( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = (row * width * 4 + col ); /* BLEND_SRC_OUT */ const short16 in_s = convert_short16( vload16( 0, input_data + offset)); short16 out_s = convert_short16(vload16(0, output_data + offset)); out_s = (out_s * ((short16)255 - in_s.s33337777BBBBFFFF)) >> (short16)8; vstore16(convert_uchar16(out_s), 0, output_data + offset); } __kernel void blend_src_atop( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = row * width * 4 + col ; /* BLEND_SRC_ATOP */ const short16 in_s = convert_short16(vload16(0, input_data + offset)); short16 out_s = convert_short16(vload16(0, output_data + offset)); const short4 alphas = out_s.s37BF; const short16 dsta = out_s.s33337777BBBBFFFF; const short16 srca = in_s.s33337777BBBBFFFF; out_s = ((in_s * dsta) + (out_s * ((short16)255 - srca))) >> (short16)8; out_s.s37BF = alphas; vstore16(convert_uchar16(out_s), 0, output_data + offset); } __kernel void blend_dst_atop( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = row * width * 4 + col ; /* BLEND_SRC_ATOP */ const short16 in_s = convert_short16(vload16(0, input_data + offset)); short16 out_s = convert_short16(vload16(0, output_data + offset)); const short4 alphas = in_s.s37BF; const short16 dsta = out_s.s33337777BBBBFFFF; const short16 srca = in_s.s33337777BBBBFFFF; out_s = ((out_s * srca) + (in_s * ((short16)255 - dsta))) >> (short16)8; out_s.s37BF = alphas; vstore16(convert_uchar16(out_s), 0, output_data + offset); } __kernel void blend_xor( uint width, /* Width of the in/out allocation in pixels. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = row * width * 4 + col; const uchar16 loadin = vload16(0, input_data + offset); uchar16 loadout = vload16(0, output_data + offset); loadout = loadout ^ loadin; vstore16(loadout, 0, output_data + offset); } __kernel void blend_multiply( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = row * width * 4 + col; /* BLEND_MULTIPLY */ const short16 loadin = convert_short16(vload16( 0, input_data + offset)); const short16 loadout = convert_short16(vload16( 0, output_data + offset)); const uchar16 res = convert_uchar16( (loadin * loadout) >> (short16)8); vstore16(res, 0, output_data + offset); } __kernel void blend_add( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = row * width * 4 + col; /* BLEND_ADD */ const short16 loadin = convert_short16(vload16(0, input_data + offset)); short16 loadout = convert_short16(vload16(0, output_data + offset)); loadout = loadout + loadin; const uchar16 res = convert_uchar16(min( (short16) 255, loadout)); vstore16(res, 0, output_data + offset); } __kernel void blend_subtract( uint width, /* Width of the in/out allocation. */ __global const uchar * input_data, /* Pointer to the input-buffer */ __global uchar * output_data /* Pointer to the output buffer */ ) { const uint col = get_global_id(0) * 16; const uint row = get_global_id(1); const uint offset = row * width * 4 + col; /* BLEND_SUBTRACT */ const short16 loadin = convert_short16(vload16(0, input_data + offset)); short16 loadout = convert_short16(vload16(0, output_data + offset)); loadout = loadout - loadin; const uchar16 res = convert_uchar16(max( (short16) 0, loadout)); vstore16(res, 0, output_data + offset); } /* * Copyright: * ---------------------------------------------------------------------------- * This confidential and proprietary software may be used only as authorized * by a licensing agreement from ARM Limited. * (C) COPYRIGHT 2016 ARM Limited, ALL RIGHTS RESERVED * The entire notice above must be reproduced on all authorized copies and * copies may only be made to the extent permitted by a licensing agreement * from ARM Limited. * ---------------------------------------------------------------------------- */ #pragma OPENCL EXTENSION cl_khr_fp16 : enable __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; __kernel void gauss_blur_1d_vert(__read_only image2d_t src, __write_only image2d_t dst, const int num_coeffs, __global float* weights ) { int2 pos = {get_global_id(0), get_global_id(1)}; int i; half4 rgb_out = 0.0f; const float2 fpos = convert_float2(pos); float2 off_weight; off_weight = vload2(0, weights); for( i=0; i<num_coeffs; i++) { rgb_out += read_imageh(src, sampler, (float2) (fpos.x+0.5f, fpos.y + off_weight.x)) * (half)off_weight.y; off_weight = vload2(i+1, weights); } rgb_out += read_imageh(src, sampler, (float2) (fpos.x+0.5f, fpos.y + off_weight.x)) * (half)off_weight.y; write_imageh(dst, pos, rgb_out); } __kernel void gauss_blur_1d_horz( __read_only image2d_t src, __write_only image2d_t dst, const int num_coeffs, __global float* weights ) { const int2 pos = {get_global_id(0), get_global_id(1)}; int i; half4 rgb_out = 0.0f; const float2 fpos = convert_float2(pos); float2 off_weight; off_weight = vload2(0, weights); for( i=0; i<num_coeffs; i++) { rgb_out += read_imageh(src, sampler, (float2) (fpos.x + off_weight.x, fpos.y+0.5f)) * (half)off_weight.y; off_weight = vload2(i+1, weights); } rgb_out += read_imageh(src, sampler, (float2) (fpos.x + off_weight.x, fpos.y+0.5f)) * (half)off_weight.y; write_imageh(dst, pos, rgb_out); } /* * Copyright: * ---------------------------------------------------------------------------- * This confidential and proprietary software may be used only as authorized * by a licensing agreement from ARM Limited. * (C) COPYRIGHT 2016 ARM Limited, ALL RIGHTS RESERVED * The entire notice above must be reproduced on all authorized copies and * copies may only be made to the extent permitted by a licensing agreement * from ARM Limited. * ---------------------------------------------------------------------------- */ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define PIXEL_SIZE sizeof(uchar4) #define COMPUTE_PIXEL(out, in, colormat) \ do { \ out = in.s0 * colormat.s0123; \ out += in.s1 * colormat.s4567; \ out += in.s2 * colormat.s89ab; \ out += in.s3 * colormat.scdef; \ } while(0) /* No leftover pixels */ __kernel void colormatrix4x4_U8_4_to_U8_4( __global uchar* in, __global uchar* out, const int src_offset, const int stride, const half16 colormat, const half4 add ) { int x = get_global_id(0); int y = get_global_id(1); __global uchar * src = in + src_offset + x*(4*PIXEL_SIZE) + y*stride; __global uchar * dst = out + src_offset + x*(4*PIXEL_SIZE) + y*stride; /* do 4 pixels at a time */ uchar16 p = vload16(0,src); half4 p0 = convert_half4(p.s0123); half4 p1 = convert_half4(p.s4567); half4 p2 = convert_half4(p.s89ab); half4 p3 = convert_half4(p.scdef); half16 d; COMPUTE_PIXEL(d.s0123, p0, colormat); COMPUTE_PIXEL(d.s4567, p1, colormat); COMPUTE_PIXEL(d.s89ab, p2, colormat); COMPUTE_PIXEL(d.scdef, p3, colormat); half16 fsum = d + ((half16)(add, add, add, add)); uchar16 res = convert_uchar16(fsum); vstore16(res, 0, dst); } /* One leftover pixel */ __kernel void colormatrix4x4_U8_4_to_U8_4_1left( __global uchar* in, __global uchar* out, const int src_offset, const int stride, const half16 colormat, const half4 add ) { int x = get_global_id(0); int y = get_global_id(1); __global uchar * src = in + src_offset + x*(4*PIXEL_SIZE) + y*stride; __global uchar * dst = out + src_offset + x*(4*PIXEL_SIZE) + y*stride; /* do 4 pixels at a time */ uchar16 p = vload16(0,src); half4 p0 = convert_half4(p.s0123); half4 p1 = convert_half4(p.s4567); half4 p2 = convert_half4(p.s89ab); half4 p3 = convert_half4(p.scdef); half16 d; COMPUTE_PIXEL(d.s0123, p0, colormat); COMPUTE_PIXEL(d.s4567, p1, colormat); COMPUTE_PIXEL(d.s89ab, p2, colormat); COMPUTE_PIXEL(d.scdef, p3, colormat); half16 fsum = d + ((half16)(add, add, add, add)); uchar16 res = convert_uchar16(fsum); vstore16(res, 0, dst); /* Do last one pixel */ if ( x == (get_global_size(0)-1)) { src = in + src_offset + (x+1)*(4*PIXEL_SIZE) + y*stride; dst = out + src_offset + (x+1)*(4*PIXEL_SIZE) + y*stride; p0 = convert_half4(vload4(0,src)); COMPUTE_PIXEL(d.s0123, p0, colormat); half4 fsum4 = (half4)(d.s0123) + ((half4)(add)); uchar4 res4 = convert_uchar4(fsum4); vstore4(res4, 0, dst); } } /* Two leftover pixels */ __kernel void colormatrix4x4_U8_4_to_U8_4_2left( __global uchar* in, __global uchar* out, const int src_offset, const int stride, const half16 colormat, const half4 add ) { int x = get_global_id(0); int y = get_global_id(1); __global uchar * src = in + src_offset + x*(4*PIXEL_SIZE) + y*stride; __global uchar * dst = out + src_offset + x*(4*PIXEL_SIZE) + y*stride; /* do 4 pixels at a time */ uchar16 p = vload16(0,src); half4 p0 = convert_half4(p.s0123); half4 p1 = convert_half4(p.s4567); half4 p2 = convert_half4(p.s89ab); half4 p3 = convert_half4(p.scdef); half16 d; COMPUTE_PIXEL(d.s0123, p0, colormat); COMPUTE_PIXEL(d.s4567, p1, colormat); COMPUTE_PIXEL(d.s89ab, p2, colormat); COMPUTE_PIXEL(d.scdef, p3, colormat); half16 fsum = d + ((half16)(add, add, add, add)); uchar16 res = convert_uchar16(fsum); vstore16(res, 0, dst); /* Compute left over two pixels */ if ( x == (get_global_size(0)-1)) { src = in + src_offset + (x+1)*(4*PIXEL_SIZE) + y*stride; dst = out + src_offset + (x+1)*(4*PIXEL_SIZE) + y*stride; half8 p01 = convert_half8(vload8(0,src)); COMPUTE_PIXEL(d.s0123, p01.s0123, colormat); COMPUTE_PIXEL(d.s4567, p01.s4567, colormat); half8 fsum8 = (half8)(d.s0123,d.s4567) + ((half8)(add, add)); uchar8 res8 = convert_uchar8(fsum8); vstore8(res8, 0, dst); } } /* Three leftover pixels */ __kernel void colormatrix4x4_U8_4_to_U8_4_3left( __global uchar* in, __global uchar* out, const int src_offset, const int stride, const half16 colormat, const half4 add ) { int x = get_global_id(0); int y = get_global_id(1); __global uchar * src = in + src_offset + x*(4*PIXEL_SIZE) + y*stride; __global uchar * dst = out + src_offset + x*(4*PIXEL_SIZE) + y*stride; /* do 4 pixels at a time */ uchar16 p = vload16(0,src); half4 p0 = convert_half4(p.s0123); half4 p1 = convert_half4(p.s4567); half4 p2 = convert_half4(p.s89ab); half4 p3 = convert_half4(p.scdef); half16 d; COMPUTE_PIXEL(d.s0123, p0, colormat); COMPUTE_PIXEL(d.s4567, p1, colormat); COMPUTE_PIXEL(d.s89ab, p2, colormat); COMPUTE_PIXEL(d.scdef, p3, colormat); half16 fsum = d + ((half16)(add, add, add, add)); uchar16 res = convert_uchar16(fsum); vstore16(res, 0, dst); /* Compute left over three pixels */ if ( x == (get_global_size(0)-1)) { src = in + src_offset + (x+1)*(4*PIXEL_SIZE) + y*stride; dst = out + src_offset + (x+1)*(4*PIXEL_SIZE) + y*stride; p = vload16(0,src); p0 = convert_half4(p.s0123); p1 = convert_half4(p.s4567); p2 = convert_half4(p.s89ab); COMPUTE_PIXEL(d.s0123, p0, colormat); COMPUTE_PIXEL(d.s4567, p1, colormat); COMPUTE_PIXEL(d.s89ab, p2, colormat); half8 fsum8 = (half8)(d.s0123,d.s4567) + ((half8)(add, add)); uchar8 res8 = convert_uchar8(fsum8); vstore8(res8, 0, dst); half4 fsum4 = (half4)(d.s89ab) + ((half4)(add)); uchar4 res4 = convert_uchar4(fsum4); vstore4(res4, 0, dst+8); } } __kernel void colormatrix4x4_U8_4_to_U8( __global uchar* in, __global uchar* out, const int src_offset, const int in_stride, const int dst_offset, const int out_stride, const half16 colormat, const half4 add ) { int x = get_global_id(0); int y = get_global_id(1); __global uchar * src = in + src_offset + x*(4*PIXEL_SIZE) + y*in_stride; /* U8_4 */ __global uchar * dst = out + dst_offset + x*(4) + y*out_stride; /* U8 */ /* do 4 pixels at a time */ uchar16 p = vload16(0,src); half4 p0 = convert_half4(p.s0123); half4 p1 = convert_half4(p.s4567); half4 p2 = convert_half4(p.s89ab); half4 p3 = convert_half4(p.scdef); half16 d; COMPUTE_PIXEL(d.s0123, p0, colormat); COMPUTE_PIXEL(d.s4567, p1, colormat); COMPUTE_PIXEL(d.s89ab, p2, colormat); COMPUTE_PIXEL(d.scdef, p3, colormat); half16 fsum = d + ((half16)(add, add, add, add)); uchar4 res = convert_uchar4(fsum.s048c); vstore4(res, 0, dst); } __kernel void Grey_U8_4_to_U8( __global uchar* in, __global uchar* out, const int in_stride, const int out_stride, const half4 colormat ) { int x = get_global_id(0); int y = get_global_id(1); __global uchar * src = in + x*(4*PIXEL_SIZE) + y*in_stride; /* U8_4 */ __global uchar * dst = out + x*(4) + y*out_stride; /* U8 */ /* do 4 pixels at a time */ uchar16 p = vload16(0,src); half16 p0123 = convert_half16(p); half4 fsum = colormat.s0 * p0123.s048c; fsum += colormat.s1 * p0123.s159d; fsum += colormat.s2 * p0123.s26ae; uchar4 res = convert_uchar4(fsum); vstore4(res, 0, dst); } /* No add and no clipping */ __kernel void colormatrix4x4_U8_4_to_U8_4_simple( __global uchar* in, __global uchar* out, const int in_stride, const int out_stride, const half16 colormat ) { int x = get_global_id(0); int y = get_global_id(1); __global uchar * src = in + x*(4*PIXEL_SIZE) + y*in_stride; __global uchar * dst = out + x*(4*PIXEL_SIZE) + y*out_stride; /* do 4 pixels at a time */ uchar16 p = vload16(0,src); half4 p0 = convert_half4(p.s0123); half4 p1 = convert_half4(p.s4567); half4 p2 = convert_half4(p.s89ab); half4 p3 = convert_half4(p.scdef); half16 d; COMPUTE_PIXEL(d.s0123, p0, colormat); COMPUTE_PIXEL(d.s4567, p1, colormat); COMPUTE_PIXEL(d.s89ab, p2, colormat); COMPUTE_PIXEL(d.scdef, p3, colormat); uchar16 res = convert_uchar16(d); vstore16(res, 0, dst); } /* No add and no clipping */ __kernel void colormatrix4x4_U8_4_to_U8_simple( __global uchar* in, __global uchar* out, const int in_stride, const int out_stride, const half16 colormat ) { int x = get_global_id(0); int y = get_global_id(1); __global uchar * src = in + x*(4*PIXEL_SIZE) + y*in_stride; /* U8_4 */ __global uchar * dst = out + x*(4) + y*out_stride; /* U8 */ /* do 4 pixels at a time */ uchar16 p = vload16(0,src); half4 p0 = convert_half4(p.s0123); half4 p1 = convert_half4(p.s4567); half4 p2 = convert_half4(p.s89ab); half4 p3 = convert_half4(p.scdef); half16 d; COMPUTE_PIXEL(d.s0123, p0, colormat); COMPUTE_PIXEL(d.s4567, p1, colormat); COMPUTE_PIXEL(d.s89ab, p2, colormat); COMPUTE_PIXEL(d.scdef, p3, colormat); uchar4 res = convert_uchar4(d.s048c); vstore4(res, 0, dst); } /* * Using integer coefficients * https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.601-7-201103-I!!PDF-E.pdf page 7 */ __kernel void Grey_U8_4_to_U8_int( __global uchar* in, __global uchar* out, const int in_stride, const int out_stride ) { int x = get_global_id(0); int y = get_global_id(1); __global uchar * src = in + x*(4*PIXEL_SIZE) + y*in_stride; /* U8_4 */ __global uchar * dst = out + x*(4) + y*out_stride; /* U8 */ /* do 4 pixels at a time */ uchar16 p = vload16(0,src); int16 p0123 = convert_int16(p); int4 fsum = (int) 77 * p0123.s048c; fsum += (int) 150 * p0123.s159d; fsum += (int) 29 * p0123.s26ae; fsum = fsum >> 8; uchar4 res = convert_uchar4(fsum); vstore4(res, 0, dst); } ? ? ? ?/* Copyright: * ---------------------------------------------------------------------------- * This confidential and proprietary software may be used only as authorized * by a licensing agreement from ARM Limited. * (C) COPYRIGHT 2013 ARM Limited, ALL RIGHTS RESERVED * The entire notice above must be reproduced on all authorized copies and * copies may only be made to the extent permitted by a licensing agreement * from ARM Limited. * ---------------------------------------------------------------------------- */ /* Kernel to handle 4 horizontally adjacent pixels - clips to y access only, assumes x-1 & xmax+1 are accessible */ __kernel void conv33_block( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const float16 weights, const int stride ) { const size_t pixel_size = 4; float16 sum; float16 l, m, r; const int2 pos = (int2)( get_global_offset(0) + 4 * ( get_global_id(0) - get_global_offset(0) ), get_global_id(1) ); const int col2_offset = pos.x * pixel_size; const int col1_offset = col2_offset - pixel_size; /* Safe - we're never called with pos.x < 1 */ /* No need for col3_offset - it's within the block we're computing */ /* Row offsets incorporate offset to col1_offset (one pixel left of target block) */ const int row2_offset = col1_offset + pos.y * stride; const int row1_offset = max( row2_offset - stride, col1_offset ); const int row3_offset = col1_offset + ( min( (uint)(pos.y + 1), (uint)(height - 1) ) * stride ); l = convert_float16( vload16( 0, input_data + row1_offset ) ); r = convert_float16( vload16( 0, input_data + row1_offset + 2 * pixel_size ) ); m = (float16)(l.s456789ab, r.s456789ab); sum = weights.s0 * l; sum += weights.s1 * m; sum += weights.s2 * r; l = convert_float16( vload16( 0, input_data + row2_offset ) ); r = convert_float16( vload16( 0, input_data + row2_offset + 2 * pixel_size ) ); m = (float16)( l.s456789ab, r.s456789ab ); sum += weights.s3 * l; sum += weights.s4 * m; sum += weights.s5 * r; l = convert_float16( vload16( 0, input_data + row3_offset ) ); r = convert_float16( vload16( 0, input_data + row3_offset + 2 * pixel_size ) ); m = (float16)( l.s456789ab, r.s456789ab ); sum += weights.s6 * l; sum += weights.s7 * m; sum += weights.s8 * r; /* Must add pixel_size to row2_offset to point to col2 - first target pixel of block */ vstore16( convert_uchar16( clamp( sum, 0.f, 255.f ) ), 0, output_data + row2_offset + pixel_size ); } /* Kernel to handle left and right sections either side of middle section handled by conv33. Clips in x & y. */ __kernel void conv33( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const float16 weights, /* convolution kernel */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_id(0), get_global_id(1) ); const size_t col2_offset = pos.x * pixel_size; const size_t col1_offset = max( (int)col2_offset - (int)pixel_size, (int)0 ); const size_t col3_offset = min( (uint)(pos.x + 1), (uint)(width - 1) ) * pixel_size; const size_t row2_offset = pos.y * stride; const size_t row1_offset = max( (int)row2_offset - stride, (int)0 ); const size_t row3_offset = min( (uint)pos.y + 1, (uint)height - 1 ) * stride; float4 sum = weights.s0 * convert_float4( vload4( 0, input_data + row1_offset + col1_offset ) ) + weights.s1 * convert_float4( vload4( 0, input_data + row1_offset + col2_offset ) ) + weights.s2 * convert_float4( vload4( 0, input_data + row1_offset + col3_offset ) ) + weights.s3 * convert_float4( vload4( 0, input_data + row2_offset + col1_offset ) ) + weights.s4 * convert_float4( vload4( 0, input_data + row2_offset + col2_offset ) ) + weights.s5 * convert_float4( vload4( 0, input_data + row2_offset + col3_offset ) ) + weights.s6 * convert_float4( vload4( 0, input_data + row3_offset + col1_offset ) ) + weights.s7 * convert_float4( vload4( 0, input_data + row3_offset + col2_offset ) ) + weights.s8 * convert_float4( vload4( 0, input_data + row3_offset + col3_offset ) ); vstore4( convert_uchar4( clamp( sum, 0.f, 255.f ) ), 0, output_data + row2_offset + col2_offset ); } __kernel void conv33_sharpen_block( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_offset(0) + 4 * ( get_global_id(0) - get_global_offset(0) ), get_global_id(1) ); const int col2_offset = pos.x * pixel_size; const int col1_offset = col2_offset - pixel_size; /* Safe - we're never called with pos.x < 1 */ /* No need for col3_offset - it's within the block we're computing */ /* Row offsets incorporate offset to col1 (one pixel left of target block) */ const int row2_offset = col1_offset + pos.y * stride; const int row1_offset = max( row2_offset - stride, col1_offset ); const int row3_offset = col1_offset + ( min( (uint)(pos.y + 1), (uint)(height - 1) ) * stride ); /* Read data for centre row at left, middle and right offsets, synthesizing data already read */ short16 l = convert_short16( vload16( 0, input_data + row2_offset ) ); short16 r = (short16)(l.hi, convert_short8( vload8( 0, input_data + row2_offset + 4 * pixel_size ) ) ); short16 m = (short16)(l.s456789ab, r.s456789ab); short16 sum = (short)5 * m - l - r - convert_short16( vload16( 0, input_data + row1_offset + pixel_size ) ) - convert_short16( vload16( 0, input_data + row3_offset + pixel_size ) ); vstore16( convert_uchar16( clamp( sum, (short)0, (short)255 ) ), 0, output_data + row2_offset + pixel_size ); } /* Kernel for Sharpen: 0 |-1 | 0 -1| 5 |-1 0 |-1 | 0 */ __kernel void conv33_sharpen( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_id(0), get_global_id(1) ); const size_t col2_offset = pos.x * pixel_size; const size_t col1_offset = max( (int)col2_offset - (int)pixel_size, (int)0 ); const size_t col3_offset = min( (uint)(pos.x + 1), (uint)(width - 1) ) * pixel_size; const size_t row2_offset = pos.y * stride; const size_t row1_offset = max( (int)row2_offset - stride, (int)0 ); const size_t row3_offset = min( (uint)pos.y + 1, (uint)height - 1 ) * stride; short4 sum = ((short)5) * convert_short4(vload4( 0, input_data + row2_offset + col2_offset )) - convert_short4(vload4( 0, input_data + row1_offset + col2_offset )) - convert_short4(vload4( 0, input_data + row2_offset + col1_offset )) - convert_short4(vload4( 0, input_data + row2_offset + col3_offset )) - convert_short4(vload4( 0, input_data + row3_offset + col2_offset )) ; vstore4( convert_uchar4( clamp( sum, (short)0, (short)255 ) ), 0, output_data + row2_offset + col2_offset ); } /* Kernel for Sobel top: 1 | 2 | 1 0 | 0 | 0 -1|-2 |-1 */ __kernel void conv33_sobel_top( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_id(0), get_global_id(1) ); const size_t col2_offset = pos.x * pixel_size; const size_t col1_offset = max( (int)col2_offset - (int)pixel_size, (int)0 ); const size_t col3_offset = min( (uint)(pos.x + 1), (uint)(width - 1) ) * pixel_size; const size_t row2_offset = pos.y * stride; const size_t row1_offset = max( (int)row2_offset - stride, (int)0 ); const size_t row3_offset = min( (uint)pos.y + 1, (uint)height - 1 ) * stride; short4 sum = convert_short4( vload4( 0, input_data + row1_offset + col1_offset ) ) + (short)2 * convert_short4( vload4( 0, input_data + row1_offset + col2_offset ) ) + convert_short4( vload4( 0, input_data + row1_offset + col3_offset ) ) - convert_short4( vload4( 0, input_data + row3_offset + col1_offset ) ) - (short)2 * convert_short4( vload4( 0, input_data + row3_offset + col2_offset ) ) - convert_short4( vload4( 0, input_data + row3_offset + col3_offset ) ); vstore4( convert_uchar4( clamp( sum, (short)0, (short)255 ) ), 0, output_data + row2_offset + col2_offset ); } /* Kernel for Sobel bottom: -1|-2 |-1 0 | 0 | 0 1 | 2 | 1 */ __kernel void conv33_sobel_bottom( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_id(0), get_global_id(1) ); const size_t col2_offset = pos.x * pixel_size; const size_t col1_offset = max( (int)col2_offset - (int)pixel_size, (int)0 ); const size_t col3_offset = min( (uint)(pos.x + 1), (uint)(width - 1) ) * pixel_size; const size_t row2_offset = pos.y * stride; const size_t row1_offset = max( (int)row2_offset - stride, (int)0 ); const size_t row3_offset = min( (uint)pos.y + 1, (uint)height - 1 ) * stride; short4 sum = convert_short4( vload4( 0, input_data + row3_offset + col1_offset ) ) + (short)2 * convert_short4( vload4( 0, input_data + row3_offset + col2_offset ) ) + convert_short4( vload4( 0, input_data + row3_offset + col3_offset ) ) - convert_short4( vload4( 0, input_data + row1_offset + col1_offset ) ) - (short)2 * convert_short4( vload4( 0, input_data + row1_offset + col2_offset ) ) - convert_short4( vload4( 0, input_data + row1_offset + col3_offset ) ); vstore4( convert_uchar4( clamp( sum, (short)0, (short)255 ) ), 0, output_data + row2_offset + col2_offset ); } /* Kernel for Sobel left: 1 | 0 |-1 2 | 0 |-2 1 | 0 |-1 */ __kernel void conv33_sobel_left( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_id(0), get_global_id(1) ); const size_t col2_offset = pos.x * pixel_size; const size_t col1_offset = max( (int)col2_offset - (int)pixel_size, (int)0 ); const size_t col3_offset = min( (uint)(pos.x + 1), (uint)(width - 1) ) * pixel_size; const size_t row2_offset = pos.y * stride; const size_t row1_offset = max( (int)row2_offset - stride, (int)0 ); const size_t row3_offset = min( (uint)pos.y + 1, (uint)height - 1 ) * stride; short4 sum = convert_short4( vload4( 0, input_data + row1_offset + col1_offset ) ) - convert_short4( vload4( 0, input_data + row1_offset + col3_offset ) ) + (short)2 * convert_short4( vload4( 0, input_data + row2_offset + col1_offset ) ) - (short)2 * convert_short4( vload4( 0, input_data + row2_offset + col3_offset ) ) + convert_short4( vload4( 0, input_data + row3_offset + col1_offset ) ) - convert_short4( vload4( 0, input_data + row3_offset + col3_offset ) ); vstore4( convert_uchar4( clamp( sum, (short)0, (short)255 ) ), 0, output_data + row2_offset + col2_offset ); } /* Kernel for Sobel right: -1| 0 | 1 -2| 0 | 2 -1| 0 | 1 */ __kernel void conv33_sobel_right( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_id(0), get_global_id(1) ); const size_t col2_offset = pos.x * pixel_size; const size_t col1_offset = max( (int)col2_offset - (int)pixel_size, (int)0 ); const size_t col3_offset = min( (uint)(pos.x + 1), (uint)(width - 1) ) * pixel_size; const size_t row2_offset = pos.y * stride; const size_t row1_offset = max( (int)row2_offset - stride, (int)0 ); const size_t row3_offset = min( (uint)pos.y + 1, (uint)height - 1 ) * stride; short4 sum = convert_short4( vload4( 0, input_data + row1_offset + col3_offset ) ) - convert_short4( vload4( 0, input_data + row1_offset + col1_offset ) ) + (short)2 * convert_short4( vload4( 0, input_data + row2_offset + col3_offset ) ) - (short)2 * convert_short4( vload4( 0, input_data + row2_offset + col1_offset ) ) + convert_short4( vload4( 0, input_data + row3_offset + col3_offset ) ) - convert_short4( vload4( 0, input_data + row3_offset + col1_offset ) ); vstore4( convert_uchar4( clamp( sum, (short)0, (short)255 ) ), 0, output_data + row2_offset + col2_offset ); } /* Kernel for Identity: 0 | 0 | 0 0 | 1 | 0 0 | 0 | 0 */ __kernel void conv33_identity( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_id(0), get_global_id(1) ); const size_t col2_offset = pos.x * pixel_size; const size_t row2_offset = pos.y * stride; vstore4( vload4( 0, input_data + row2_offset + col2_offset ) , 0, output_data + row2_offset + col2_offset ); } /* Kernel for Emboss: -2|-1 | 0 -1| 1 | 1 0| 1 | 2 */ __kernel void conv33_emboss( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_id(0), get_global_id(1) ); const size_t col2_offset = pos.x * pixel_size; const size_t col1_offset = max( (int)col2_offset - (int)pixel_size, (int)0 ); const size_t col3_offset = min( (uint)(pos.x + 1), (uint)(width - 1) ) * pixel_size; const size_t row2_offset = pos.y * stride; const size_t row1_offset = max( (int)row2_offset - stride, (int)0 ); const size_t row3_offset = min( (uint)pos.y + 1, (uint)height - 1 ) * stride; short4 sum = convert_short4( vload4( 0, input_data + row2_offset + col2_offset ) ) + convert_short4( vload4( 0, input_data + row2_offset + col3_offset ) ) + convert_short4( vload4( 0, input_data + row3_offset + col2_offset ) ) + (short)2 * convert_short4( vload4( 0, input_data + row3_offset + col3_offset ) ) - (short)2 * convert_short4( vload4( 0, input_data + row1_offset + col1_offset ) ) - convert_short4( vload4( 0, input_data + row1_offset + col2_offset ) ) - convert_short4( vload4( 0, input_data + row2_offset + col1_offset ) ); vstore4( convert_uchar4( clamp( sum, (short)0, (short)255 ) ), 0, output_data + row2_offset + col2_offset ); } /* Kernel for Outline: -1 |-1 |-1 -1 | 8 |-1 -1 |-1 |-1 */ __kernel void conv33_outline( const uint width, /* Width of the in/out allocation in pixels (uchar4) */ const uint height, /* Height of the in/out allocation in pixels (uchar4) */ __global const uchar * restrict input_data, /* Pointer to the input buffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ const float16 weights, /* convolution kernel */ const int stride ) { const size_t pixel_size = 4; const int2 pos = (int2)( get_global_id(0), get_global_id(1) ); const size_t col2_offset = pos.x * pixel_size; const size_t col1_offset = max( (int)col2_offset - (int)pixel_size, (int)0 ); const size_t col3_offset = min( (uint)(pos.x + 1), (uint)(width - 1) ) * pixel_size; const size_t row2_offset = pos.y * stride; const size_t row1_offset = max( (int)row2_offset - stride, (int)0 ); const size_t row3_offset = min( (uint)pos.y + 1, (uint)height - 1 ) * stride; short4 sum = (short)8 * convert_short4( vload4( 0, input_data + row2_offset + col2_offset ) ) - convert_short4( vload4( 0, input_data + row1_offset + col1_offset ) ) - convert_short4( vload4( 0, input_data + row1_offset + col2_offset ) ) - convert_short4( vload4( 0, input_data + row1_offset + col3_offset ) ) - convert_short4( vload4( 0, input_data + row2_offset + col1_offset ) ) - convert_short4( vload4( 0, input_data + row2_offset + col3_offset ) ) - convert_short4( vload4( 0, input_data + row3_offset + col1_offset ) ) - convert_short4( vload4( 0, input_data + row3_offset + col2_offset ) ) - convert_short4( vload4( 0, input_data + row3_offset + col3_offset ) ); vstore4( convert_uchar4( clamp( sum, (short)0, (short)255 ) ), 0, output_data + row2_offset + col2_offset ); } /* Copyright: * ---------------------------------------------------------------------------- * This confidential and proprietary software may be used only as authorized * by a licensing agreement from ARM Limited. * (C) COPYRIGHT 2013 ARM Limited, ALL RIGHTS RESERVED * The entire notice above must be reproduced on all authorized copies and * copies may only be made to the extent permitted by a licensing agreement * from ARM Limited. * ---------------------------------------------------------------------------- */ __kernel void conv_55( __global const uchar * restrict input_data, /* Pointer to the inputbuffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ __global const float * weights, const int stride ) { float16 sum = 0; float16 l0, l1, l2, l3, l4; const int2 pos = (int2)(get_global_id(0)*16, get_global_id(1)); int offset = (pos.y - 2 )*stride + pos.x - 8; l0 = convert_float16(vload16( 0, input_data + offset)); l4 = convert_float16(vload16( 0, input_data + offset + 16)); l2 = (float16)(l0.hi, l4.lo); l1 = (float16)(l0.s456789ab, l2.s456789ab); l3 = (float16)(l2.s456789ab, l4.s456789ab); float16 we = vload16(0, weights); sum += we.s0 * l0 ; sum += we.s1 * l1 ; sum += we.s2 * l2 ; sum += we.s3 * l3 ; sum += we.s4 * l4 ; offset += stride; l0 = convert_float16(vload16( 0, input_data + offset)); l4 = convert_float16(vload16( 0, input_data + offset + 16)); l2 = (float16)(l0.hi, l4.lo); l1 = (float16)(l0.s456789ab, l2.s456789ab); l3 = (float16)(l2.s456789ab, l4.s456789ab); sum += we.s5 * l0 ; sum += we.s6 * l1 ; sum += we.s7 * l2 ; sum += we.s8 * l3 ; sum += we.s9 * l4 ; offset += stride; l0 = convert_float16(vload16( 0, input_data + offset)); l4 = convert_float16(vload16( 0, input_data + offset + 16)); l2 = (float16)(l0.hi, l4.lo); l1 = (float16)(l0.s456789ab, l2.s456789ab); l3 = (float16)(l2.s456789ab, l4.s456789ab); sum += we.sa * l0 ; sum += we.sb * l1 ; sum += we.sc * l2 ; sum += we.sd * l3 ; sum += we.se * l4 ; offset += stride; l0 = convert_float16(vload16( 0, input_data + offset)); l4 = convert_float16(vload16( 0, input_data + offset + 16)); l2 = (float16)(l0.hi, l4.lo); l1 = (float16)(l0.s456789ab, l2.s456789ab); l3 = (float16)(l2.s456789ab, l4.s456789ab); we = vload16(0, weights + 15); sum += we.s0 * l0 ; sum += we.s1 * l1 ; sum += we.s2 * l2 ; sum += we.s3 * l3 ; sum += we.s4 * l4 ; offset += stride; l0 = convert_float16(vload16( 0, input_data + offset)); l4 = convert_float16(vload16( 0, input_data + offset + 16)); l2 = (float16)(l0.hi, l4.lo); l1 = (float16)(l0.s456789ab, l2.s456789ab); l3 = (float16)(l2.s456789ab, l4.s456789ab); sum += we.s5 * l0 ; sum += we.s6 * l1 ; sum += we.s7 * l2 ; sum += we.s8 * l3 ; sum += we.s9 * l4 ; vstore16(convert_uchar16(clamp(sum, 0.f, 255.f)), 0, output_data + pos.y * stride + pos.x); } /* * Copyright: * ---------------------------------------------------------------------------- * This confidential and proprietary software may be used only as authorized * by a licensing agreement from ARM Limited. * (C) COPYRIGHT 2017 ARM Limited, ALL RIGHTS RESERVED * The entire notice above must be reproduced on all authorized copies and * copies may only be made to the extent permitted by a licensing agreement * from ARM Limited. * ---------------------------------------------------------------------------- */ #pragma OPENCL EXTENSION cl_arm_core_id : enable #define MAX_NUM_HISTOGRAM_BUFS 16 /* Ideally this should be a power of 2. This must match kMaxNumHistogramBufs */ #define BYTES_PER_FULL_KERNEL 64 /* 32, 64 or 128 : This must match kBytesPerFullKernel */ #define VATOMIC_INC4(histogram, vals4) do { \ atomic_inc(histogram + (vals4.s0)); \ atomic_inc(histogram + (vals4.s1)); \ atomic_inc(histogram + (vals4.s2)); \ atomic_inc(histogram + (vals4.s3)); \ } while (0) #define VATOMIC_INC8(histogram, vals8) do { \ uint4 vals8_lo = convert_uint4((vals8).lo); \ VATOMIC_INC4((histogram), (vals8_lo)); \ uint4 vals8_hi = convert_uint4((vals8).hi); \ VATOMIC_INC4((histogram), (vals8_hi)); \ } while (0) #define VATOMIC_INC16(histogram, vals16) do { \ ushort8 vals16_lo = convert_ushort8((vals16).lo); \ VATOMIC_INC8((histogram), (vals16_lo)); \ ushort8 vals16_hi = convert_ushort8((vals16).hi); \ VATOMIC_INC8((histogram), (vals16_hi)); \ } while (0) __kernel void histogram_uchar_full( __global const uchar * restrict input_image, /* Pointer to the input-buffer */ __global uint * restrict histogram, /* Pointer to the histogram buffer */ uint stride /* stride/16 */ ) { histogram += ((arm_get_core_id() % MAX_NUM_HISTOGRAM_BUFS) * 256); // The offset is scaled by 16 #if BYTES_PER_FULL_KERNEL == 32 const uint2 offset = (uint2)((get_global_id(1) * stride) + (get_global_id(0) * BYTES_PER_FULL_KERNEL/16)) + (uint2)(0,1); #elif BYTES_PER_FULL_KERNEL == 64 const uint4 offset = (uint4)((get_global_id(1) * stride) + (get_global_id(0) * BYTES_PER_FULL_KERNEL/16)) + (uint4)(0,1,2,3); #elif BYTES_PER_FULL_KERNEL == 128 const uint8 offset = (uint8)((get_global_id(1) * stride) + (get_global_id(0) * BYTES_PER_FULL_KERNEL/16)) + (uint8)(0,1,2,3,4,5,6,7); #else #error "Invalid BYTES_PER_FULL_KERNEL value" #endif const uchar16 vals0 = vload16(offset.s0, input_image); const uchar16 vals1 = vload16(offset.s1, input_image); #if BYTES_PER_FULL_KERNEL >= 64 const uchar16 vals2 = vload16(offset.s2, input_image); const uchar16 vals3 = vload16(offset.s3, input_image); #endif #if BYTES_PER_FULL_KERNEL >= 128 const uchar16 vals4 = vload16(offset.s4, input_image); const uchar16 vals5 = vload16(offset.s5, input_image); const uchar16 vals6 = vload16(offset.s6, input_image); const uchar16 vals7 = vload16(offset.s7, input_image); #endif VATOMIC_INC16(histogram, vals0); VATOMIC_INC16(histogram, vals1); #if BYTES_PER_FULL_KERNEL >= 64 VATOMIC_INC16(histogram, vals2); VATOMIC_INC16(histogram, vals3); #endif #if BYTES_PER_FULL_KERNEL >= 128 VATOMIC_INC16(histogram, vals4); VATOMIC_INC16(histogram, vals5); VATOMIC_INC16(histogram, vals6); VATOMIC_INC16(histogram, vals7); #endif } __kernel void histogram_uchar_n( __global const uchar * restrict input_image, /* Pointer to the input buffer */ __global uint * restrict histogram, /* Pointer to the histogram buffer */ const uint stride, /* stride in bytes */ const uint start_offset, /* starting offset in bytes */ const uchar4 n /* number of uchar's to process in multiples - s3(16):s2(8):s1(4):s0(1) */ ) { histogram += ((arm_get_core_id() % MAX_NUM_HISTOGRAM_BUFS) * 256); input_image += (get_global_id(1) * stride) + start_offset; uchar i = n.s3; while (i-- > 0) { const uchar16 vals = vload16(0, input_image); VATOMIC_INC16(histogram, vals); input_image+=16; } i = n.s2; while (i-- > 0) { const uchar8 vals = vload8(0, input_image); VATOMIC_INC8(histogram, vals); input_image+=8; } i = n.s1; while (i-- > 0) { const uchar4 vals = vload4(0, input_image); VATOMIC_INC4(histogram, vals); input_image+=4; } i = n.s0; while (i-- > 0) { atomic_inc(histogram + *input_image++); } } #undef VATOMIC_INC4 #define VATOMIC_INC4(histogram, vals4) do { \ atomic_inc(histogram + ((vals4.s0) * 4)); \ atomic_inc(histogram + ((vals4.s1) * 4) + 1); \ atomic_inc(histogram + ((vals4.s2) * 4) + 2); \ atomic_inc(histogram + ((vals4.s3) * 4) + 3); \ } while (0) __kernel void histogram_uchar4_full( __global const uchar * restrict input_image, /* Pointer to the input buffer */ __global uint * restrict histogram, /* Pointer to the histogram buffer */ uint stride /* stride/16 */ ) { histogram += ((arm_get_core_id() % MAX_NUM_HISTOGRAM_BUFS) * 256 * 4); // The offset is scaled by 16 #if BYTES_PER_FULL_KERNEL == 32 const uint2 offset = (uint2)((get_global_id(1) * stride) + (get_global_id(0) * BYTES_PER_FULL_KERNEL/16)) + (uint2)(0,1); #elif BYTES_PER_FULL_KERNEL == 64 const uint4 offset = (uint4)((get_global_id(1) * stride) + (get_global_id(0) * BYTES_PER_FULL_KERNEL/16)) + (uint4)(0,1,2,3); #elif BYTES_PER_FULL_KERNEL == 128 const uint8 offset = (uint8)((get_global_id(1) * stride) + (get_global_id(0) * BYTES_PER_FULL_KERNEL/16)) + (uint8)(0,1,2,3,4,5,6,7); #else #error "Invalid BYTES_PER_FULL_KERNEL value" #endif const uchar16 vals0 = vload16(offset.s0, input_image); const uchar16 vals1 = vload16(offset.s1, input_image); #if BYTES_PER_FULL_KERNEL >= 64 const uchar16 vals2 = vload16(offset.s2, input_image); const uchar16 vals3 = vload16(offset.s3, input_image); #endif #if BYTES_PER_FULL_KERNEL >= 128 const uchar16 vals4 = vload16(offset.s4, input_image); const uchar16 vals5 = vload16(offset.s5, input_image); const uchar16 vals6 = vload16(offset.s6, input_image); const uchar16 vals7 = vload16(offset.s7, input_image); #endif VATOMIC_INC16(histogram, vals0); VATOMIC_INC16(histogram, vals1); #if BYTES_PER_FULL_KERNEL >= 64 VATOMIC_INC16(histogram, vals2); VATOMIC_INC16(histogram, vals3); #endif #if BYTES_PER_FULL_KERNEL >= 128 VATOMIC_INC16(histogram, vals4); VATOMIC_INC16(histogram, vals5); VATOMIC_INC16(histogram, vals6); VATOMIC_INC16(histogram, vals7); #endif } __kernel void histogram_uchar4_n( __global const uchar * restrict input_image, /* Pointer to the input buffer */ __global uint * restrict histogram, /* Pointer to the histogram buffer */ const uint stride, /* stride in bytes */ const uint start_offset, /* starting offset in bytes */ const uchar4 n /* number of uchar4's to process in multiples - s3(4):s2(2):s1(1) */ ) { histogram += ((arm_get_core_id() % MAX_NUM_HISTOGRAM_BUFS) * 256 * 4); input_image += (get_global_id(1) * stride) + start_offset; uchar i = n.s3; while (i-- > 0) { const uchar16 vals = vload16(0, input_image); VATOMIC_INC16(histogram, vals); input_image+=16; } i = n.s2; while (i-- > 0) { const uchar8 vals = vload8(0, input_image); VATOMIC_INC8(histogram, vals); input_image+=8; } i = n.s1; while (i-- > 0) { const uchar4 vals = vload4(0, input_image); VATOMIC_INC4(histogram, vals); input_image+=4; } } /* Copyright: * ---------------------------------------------------------------------------- * This confidential and proprietary software may be used only as authorized * by a licensing agreement from ARM Limited. * (C) COPYRIGHT 2013 ARM Limited, ALL RIGHTS RESERVED * The entire notice above must be reproduced on all authorized copies and * copies may only be made to the extent permitted by a licensing agreement * from ARM Limited. * ---------------------------------------------------------------------------- */ __kernel void lut( uint stride, /* Stride of the in/out allocation in uchar4. */ __global const uchar * restrict input_data, /* Pointer to the inputbuffer */ __global uchar * restrict output_data, /* Pointer to the output buffer */ __global const uchar * restrict luttables /*Look up table of values 256 for each in RGBA order */ ) { const int2 pos = (int2)(get_global_id(0)*8, get_global_id(1)); const int offset = pos.y * stride * 4 + pos.x; const uchar8 in = vload8(0, input_data + offset); uchar8 out; out.s0 = luttables[in.s0]; out.s1 = luttables[in.s1 + 256]; out.s2 = luttables[in.s2 + 512]; out.s3 = luttables[in.s3 + 768]; out.s4 = luttables[in.s4]; out.s5 = luttables[in.s5 + 256]; out.s6 = luttables[in.s6 + 512]; out.s7 = luttables[in.s7 + 768]; vstore8(out, 0, output_data + offset); } c 4 3
p e^ h Q 2% E
GCC: (GNU) 4.9.x 20150123 (prerelease) Android clang version 5.0.300080 (based on LLVM 5.0.300080)
.shstrtab .note.android.ident .note.gnu.build-id .dynsym .dynstr .gnu.hash .gnu.version .gnu.version_d .gnu.version_r .rel.dyn .rel.plt .text .ARM.exidx .rodata .ARM.extab .fini_array .data.rel.ro .dynamic .got .data .bss .comment .note.gnu.gold-version .ARM.attributes