1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <android/log.h>
18 #include <math.h>
19 #include <stdlib.h>
20 #include <unistd.h>
21 
22 #include "Bench.h"
23 
24 
Bench()25 Bench::Bench()
26 {
27     mTimeBucket = NULL;
28     mTimeBuckets = 0;
29     mTimeBucketDivisor = 1;
30 
31     mMemLatencyLastSize = 0;
32     mMemDst = NULL;
33     mMemSrc = NULL;
34     mMemLoopCount = 0;
35 }
36 
37 
~Bench()38 Bench::~Bench()
39 {
40 }
41 
getTimeNanos() const42 uint64_t Bench::getTimeNanos() const
43 {
44     struct timespec t;
45     clock_gettime(CLOCK_MONOTONIC, &t);
46     return t.tv_nsec + ((uint64_t)t.tv_sec * 1000 * 1000 * 1000);
47 }
48 
getTimeMillis() const49 uint64_t Bench::getTimeMillis() const
50 {
51     return getTimeNanos() / 1000000;
52 }
53 
54 
testWork(void * usr,uint32_t idx)55 void Bench::testWork(void *usr, uint32_t idx)
56 {
57     Bench *b = (Bench *)usr;
58     //__android_log_print(ANDROID_LOG_INFO, "bench", "test %i   %p", idx, b);
59 
60     float f1 = 0.f;
61     float f2 = 0.f;
62     float f3 = 0.f;
63     float f4 = 0.f;
64 
65     float *ipk = b->mIpKernel[idx];
66     volatile float *src = b->mSrcBuf[idx];
67     volatile float *out = b->mOutBuf[idx];
68 
69     //__android_log_print(ANDROID_LOG_INFO, "bench", "test %p %p %p", ipk, src, out);
70 
71     do {
72 
73         for (int i = 0; i < 1024; i++) {
74             f1 += src[i * 4] * ipk[i];
75             f2 += src[i * 4 + 1] * ipk[i];
76             f3 += src[i * 4 + 2] * ipk[i];
77             f4 += sqrtf(f1 + f2 + f3);
78         }
79         out[0] = f1;
80         out[1] = f2;
81         out[2] = f3;
82         out[3] = f4;
83 
84     } while (b->incTimeBucket());
85 }
86 
initIP()87 bool Bench::initIP() {
88     int workers = mWorkers.getWorkerCount();
89 
90     mIpKernel = new float *[workers];
91     mSrcBuf = new float *[workers];
92     mOutBuf = new float *[workers];
93 
94     for (int i = 0; i < workers; i++) {
95         mIpKernel[i] = new float[1024];
96         mSrcBuf[i] = new float[4096];
97         mOutBuf[i] = new float[4];
98     }
99 
100     return true;
101 }
102 
runPowerManagementTest(uint64_t options)103 bool Bench::runPowerManagementTest(uint64_t options) {
104     //__android_log_print(ANDROID_LOG_INFO, "bench", "rpmt x %i", options);
105 
106     mTimeBucketDivisor = 1000 * 1000;  // use ms
107     allocateBuckets(2 * 1000);
108 
109     usleep(2 * 1000 * 1000);
110 
111     //__android_log_print(ANDROID_LOG_INFO, "bench", "rpmt 2  b %i", mTimeBuckets);
112 
113     mTimeStartNanos = getTimeNanos();
114     mTimeEndNanos = mTimeStartNanos + mTimeBuckets * mTimeBucketDivisor;
115     memset(mTimeBucket, 0, sizeof(uint32_t) * mTimeBuckets);
116 
117     bool useMT = false;
118 
119     //__android_log_print(ANDROID_LOG_INFO, "bench", "rpmt 2.1  b %i", mTimeBuckets);
120     mTimeEndGroupNanos = mTimeStartNanos;
121     do  {
122         // Advance 8ms
123         mTimeEndGroupNanos += 8 * 1000 * 1000;
124 
125         int threads = useMT ? 1 : 0;
126         useMT = !useMT;
127         if ((options & 0x1f) != 0) {
128             threads = options & 0x1f;
129         }
130 
131         //__android_log_print(ANDROID_LOG_INFO, "bench", "threads %i", threads);
132 
133         mWorkers.launchWork(testWork, this, threads);
134     } while (mTimeEndGroupNanos <= mTimeEndNanos);
135 
136     return true;
137 }
138 
allocateBuckets(size_t bucketCount)139 bool Bench::allocateBuckets(size_t bucketCount) {
140     if (bucketCount == mTimeBuckets) {
141         return true;
142     }
143 
144     if (mTimeBucket != NULL) {
145         delete[] mTimeBucket;
146         mTimeBucket = NULL;
147     }
148 
149     mTimeBuckets = bucketCount;
150     if (mTimeBuckets > 0) {
151         mTimeBucket = new uint32_t[mTimeBuckets];
152     }
153 
154     return true;
155 }
156 
init()157 bool Bench::init() {
158     mWorkers.init();
159 
160     initIP();
161     //ALOGV("%p Launching thread(s), CPUs %i", mRSC, mWorkers.mCount + 1);
162 
163     return true;
164 }
165 
incTimeBucket() const166 bool Bench::incTimeBucket() const {
167     uint64_t time = getTimeNanos();
168     uint64_t bucket = (time - mTimeStartNanos) / mTimeBucketDivisor;
169 
170     if (bucket >= mTimeBuckets) {
171         return false;
172     }
173 
174     __sync_fetch_and_add(&mTimeBucket[bucket], 1);
175 
176     return time < mTimeEndGroupNanos;
177 }
178 
getData(float * data,size_t count) const179 void Bench::getData(float *data, size_t count) const {
180     if (count > mTimeBuckets) {
181         count = mTimeBuckets;
182     }
183     for (size_t ct = 0; ct < count; ct++) {
184         data[ct] = (float)mTimeBucket[ct];
185     }
186 }
187 
runCPUHeatSoak(uint64_t)188 bool Bench::runCPUHeatSoak(uint64_t /* options */)
189 {
190     mTimeBucketDivisor = 1000 * 1000;  // use ms
191     allocateBuckets(1000);
192 
193     mTimeStartNanos = getTimeNanos();
194     mTimeEndNanos = mTimeStartNanos + mTimeBuckets * mTimeBucketDivisor;
195     memset(mTimeBucket, 0, sizeof(uint32_t) * mTimeBuckets);
196 
197     mTimeEndGroupNanos = mTimeEndNanos;
198     mWorkers.launchWork(testWork, this, 0);
199     return true;
200 }
201 
runMemoryBandwidthTest(uint64_t size)202 float Bench::runMemoryBandwidthTest(uint64_t size)
203 {
204     uint64_t t1 = getTimeMillis();
205     for (size_t ct = mMemLoopCount; ct > 0; ct--) {
206         memcpy(mMemDst, mMemSrc, size);
207     }
208     double dt = getTimeMillis() - t1;
209     dt /= 1000;
210 
211     double bw = ((double)size) * mMemLoopCount / dt;
212     bw /= 1024 * 1024 * 1024;
213 
214     float targetTime = 0.2f;
215     if (dt > targetTime) {
216         mMemLoopCount = (size_t)((double)mMemLoopCount / (dt / targetTime));
217     }
218 
219     return (float)bw;
220 }
221 
runMemoryLatencyTest(uint64_t size)222 float Bench::runMemoryLatencyTest(uint64_t size)
223 {
224     //__android_log_print(ANDROID_LOG_INFO, "bench", "latency %i", (int)size);
225     void ** sp = (void **)mMemSrc;
226     size_t maxIndex = size / sizeof(void *);
227     size_t loops = ((maxIndex / 2) & (~3));
228     //loops = 10;
229 
230     if (size != mMemLatencyLastSize) {
231         __android_log_print(ANDROID_LOG_INFO, "bench", "latency build %i %i", (int)maxIndex, loops);
232         mMemLatencyLastSize = size;
233         memset((void *)mMemSrc, 0, mMemLatencyLastSize);
234 
235         size_t lastIdx = 0;
236         for (size_t ct = 0; ct < loops; ct++) {
237             size_t ni = rand() * rand();
238             ni = ni % maxIndex;
239             while ((sp[ni] != NULL) || (ni == lastIdx)) {
240                 ni++;
241                 if (ni >= maxIndex) {
242                     ni = 1;
243                 }
244     //            __android_log_print(ANDROID_LOG_INFO, "bench", "gen ni loop %i %i", lastIdx, ni);
245             }
246       //      __android_log_print(ANDROID_LOG_INFO, "bench", "gen ct = %i  %i  %i  %p  %p", (int)ct, lastIdx, ni, &sp[lastIdx], &sp[ni]);
247             sp[lastIdx] = &sp[ni];
248             lastIdx = ni;
249         }
250         sp[lastIdx] = 0;
251     }
252     //__android_log_print(ANDROID_LOG_INFO, "bench", "latency testing");
253 
254     uint64_t t1 = getTimeNanos();
255     for (size_t ct = mMemLoopCount; ct > 0; ct--) {
256         size_t lc = 1;
257         volatile void *p = sp[0];
258         while (p != NULL) {
259             // Unroll once to minimize branching overhead.
260             void **pn = (void **)p;
261             p = pn[0];
262             pn = (void **)p;
263             p = pn[0];
264         }
265     }
266     //__android_log_print(ANDROID_LOG_INFO, "bench", "v %i %i", loops * mMemLoopCount, v);
267 
268     double dt = getTimeNanos() - t1;
269     double dts = dt / 1000000000;
270     double lat = dt / (loops * mMemLoopCount);
271     __android_log_print(ANDROID_LOG_INFO, "bench", "latency ret %f", lat);
272 
273     float targetTime = 0.2f;
274     if (dts > targetTime) {
275         mMemLoopCount = (size_t)((double)mMemLoopCount / (dts / targetTime));
276         if (mMemLoopCount < 1) {
277             mMemLoopCount = 1;
278         }
279     }
280 
281     return (float)lat;
282 }
283 
startMemTests()284 bool Bench::startMemTests()
285 {
286     mMemSrc = (uint8_t *)malloc(1024*1024*64);
287     mMemDst = (uint8_t *)malloc(1024*1024*64);
288 
289     memset(mMemSrc, 0, 1024*1024*16);
290     memset(mMemDst, 0, 1024*1024*16);
291 
292     mMemLoopCount = 1;
293     uint64_t start = getTimeMillis();
294     while((getTimeMillis() - start) < 500) {
295         memcpy(mMemDst, mMemSrc, 1024);
296         mMemLoopCount++;
297     }
298     mMemLatencyLastSize = 0;
299     return true;
300 }
301 
endMemTests()302 void Bench::endMemTests()
303 {
304     free(mMemSrc);
305     free(mMemDst);
306     mMemSrc = NULL;
307     mMemDst = NULL;
308     mMemLatencyLastSize = 0;
309 }
310 
GflopKernelC()311 void Bench::GflopKernelC() {
312     int halfKX = (mGFlop.kernelXSize / 2);
313     for (int x = halfKX; x < (mGFlop.imageXSize - halfKX - 1); x++) {
314         const float * krnPtr = mGFlop.kernelBuffer;
315         float sum = 0.f;
316 
317         int srcInc = mGFlop.imageXSize - mGFlop.kernelXSize;
318         const float * srcPtr = &mGFlop.srcBuffer[x - halfKX];
319 
320         for (int ix = 0; ix < mGFlop.kernelXSize; ix++) {
321             sum += srcPtr[0] * krnPtr[0];
322             krnPtr++;
323             srcPtr++;
324         }
325 
326         float * dstPtr = &mGFlop.dstBuffer[x];
327         dstPtr[0] = sum;
328 
329     }
330 
331 }
332 
GflopKernelC_y3()333 void Bench::GflopKernelC_y3() {
334 }
335 
runGFlopsTest(uint64_t)336 float Bench::runGFlopsTest(uint64_t /* options */)
337 {
338     mTimeBucketDivisor = 1000 * 1000;  // use ms
339     allocateBuckets(1000);
340 
341     mTimeStartNanos = getTimeNanos();
342     mTimeEndNanos = mTimeStartNanos + mTimeBuckets * mTimeBucketDivisor;
343     memset(mTimeBucket, 0, sizeof(uint32_t) * mTimeBuckets);
344 
345     mTimeEndGroupNanos = mTimeEndNanos;
346     mWorkers.launchWork(testWork, this, 0);
347 
348     // Simulate image convolve
349     mGFlop.kernelXSize = 27;
350     mGFlop.imageXSize = 1024 * 1024;
351 
352     mGFlop.srcBuffer = (float *)malloc(mGFlop.imageXSize * sizeof(float));
353     mGFlop.dstBuffer = (float *)malloc(mGFlop.imageXSize * sizeof(float));
354     mGFlop.kernelBuffer = (float *)malloc(mGFlop.kernelXSize * sizeof(float));
355 
356     double ops = mGFlop.kernelXSize;
357     ops = ops * 2.f - 1.f;
358     ops *= mGFlop.imageXSize;
359 
360     uint64_t t1 = getTimeNanos();
361     GflopKernelC();
362     double dt = getTimeNanos() - t1;
363 
364     dt /= 1000.f * 1000.f * 1000.f;
365 
366     double gflops = ops / dt / 1000000000.f;
367 
368     __android_log_print(ANDROID_LOG_INFO, "bench", "v %f %f %f", dt, ops, gflops);
369 
370     return (float)gflops;
371 }
372 
373 
374