/**
 * Copyright 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "run_tflite.h"

#include <android/log.h>
#include <dirent.h>
#include <dlfcn.h>
#include <fcntl.h>
#include <ftw.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <unistd.h>

#include <cassert>
#include <cstdio>
#include <cstring>
#include <fstream>

#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/nnapi/NeuralNetworksTypes.h"
#define LOG_TAG "NN_BENCHMARK"

#define FATAL(fmt, ...)                                                  \
  do {                                                                   \
    __android_log_print(ANDROID_LOG_FATAL, LOG_TAG, fmt, ##__VA_ARGS__); \
    assert(false);                                                       \
  } while (0)

namespace {

long long currentTimeInUsec() {
  timeval tv;
  gettimeofday(&tv, NULL);
  return ((tv.tv_sec * 1000000LL) + tv.tv_usec);
}

// Workaround for build systems that make it difficult to pick the correct NDK
// API level. NDK tracing methods are dynamically loaded from libandroid.so.
typedef void* (*fp_ATrace_beginSection)(const char* sectionName);
typedef void* (*fp_ATrace_endSection)();
struct TraceFunc {
  fp_ATrace_beginSection ATrace_beginSection;
  fp_ATrace_endSection ATrace_endSection;
};
TraceFunc setupTraceFunc() {
  void* lib = dlopen("libandroid.so", RTLD_NOW | RTLD_LOCAL);
  if (lib == nullptr) {
    FATAL("unable to open libandroid.so");
  }
  return {
      reinterpret_cast<fp_ATrace_beginSection>(
          dlsym(lib, "ATrace_beginSection")),
      reinterpret_cast<fp_ATrace_endSection>(dlsym(lib, "ATrace_endSection"))};
}
static TraceFunc kTraceFunc{setupTraceFunc()};

}  // namespace

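// Factory helper: constructs a BenchmarkModel and runs init() on it. On any
// failure the partially constructed object is deleted and nullptr is returned,
// so callers only need to null-check the result.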
BenchmarkModel* BenchmarkModel::create(const char* modelfile, bool use_nnapi,
                                       bool enable_intermediate_tensors_dump, int* nnapiErrno,
                                       const char* nnapi_device_name, bool mmapModel,
                                       const char* nnapi_cache_dir) {
  BenchmarkModel* model = new BenchmarkModel();
  if (!model->init(modelfile, use_nnapi, enable_intermediate_tensors_dump, nnapiErrno,
                   nnapi_device_name, mmapModel, nnapi_cache_dir)) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to init model %s", modelfile);
    delete model;
    return nullptr;
  }
  return model;
}

bool BenchmarkModel::init(const char* modelfile, bool use_nnapi,
                          bool enable_intermediate_tensors_dump, int* nnapiErrno,
                          const char* nnapi_device_name, bool mmapModel,
                          const char* nnapi_cache_dir) {
  mModelFile = modelfile;
  mUseNnApi = use_nnapi;
  if (nnapi_cache_dir) {
    mCacheDir = nnapi_cache_dir;
  }
  if (nnapi_device_name) {
    mNnApiDeviceName = nnapi_device_name;
  }

  if (mmapModel) {
    // Memory-map the model. NOTE: the mapping must live at least as long as
    // the interpreter context.
    mTfliteModel = tflite::FlatBufferModel::BuildFromFile(modelfile);
  } else {
    std::ifstream t(modelfile);
    mModelBuffer = std::string((std::istreambuf_iterator<char>(t)),
                               std::istreambuf_iterator<char>());
    mTfliteModel = tflite::FlatBufferModel::BuildFromBuffer(mModelBuffer.c_str(),
                                                            mModelBuffer.size());
  }
  if (!mTfliteModel) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to load model %s",
                        modelfile);
    return false;
  }

  tflite::ops::builtin::BuiltinOpResolver resolver;
  tflite::InterpreterBuilder(*mTfliteModel, resolver)(&mTfliteInterpreter);
  if (!mTfliteInterpreter) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to create TFlite interpreter");
    return false;
  }

  if (enable_intermediate_tensors_dump) {
    // Make the output of every op a model output. This way we will be able to
    // fetch each intermediate tensor when running with delegates.
    outputs.clear();
    for (size_t node = 0; node < mTfliteInterpreter->nodes_size(); ++node) {
      auto node_outputs =
          mTfliteInterpreter->node_and_registration(node)->first.outputs;
      outputs.insert(outputs.end(), node_outputs->data,
                     node_outputs->data + node_outputs->size);
    }
    mTfliteInterpreter->SetOutputs(outputs);
  }

  // Allow Fp16 precision for all models.
  mTfliteInterpreter->SetAllowFp16PrecisionForFp32(true);

  if (use_nnapi) {
    tflite::StatefulNnApiDelegate::Options nnapi_options;
    nnapi_options.accelerator_name = nnapi_device_name;
    mTfliteNnapiDelegate = std::make_unique<tflite::StatefulNnApiDelegate>(nnapi_options);
    int delegationStatus = mTfliteInterpreter->ModifyGraphWithDelegate(mTfliteNnapiDelegate.get());
    *nnapiErrno = mTfliteNnapiDelegate->GetNnApiErrno();
    if (delegationStatus != kTfLiteOk ||
        *nnapiErrno != ANEURALNETWORKS_NO_ERROR) {
      __android_log_print(
          ANDROID_LOG_ERROR, LOG_TAG,
          "Failed to initialize NNAPI Delegate for model %s, nnapi_errno is %d",
          modelfile, *nnapiErrno);
      return false;
    }
  }
  return true;
}

BenchmarkModel::BenchmarkModel() {}
BenchmarkModel::~BenchmarkModel() {}

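// Copies |length| bytes of raw input data into the interpreter's first (and
// only expected) input tensor. Only float32 and uint8 input tensors are
// supported; any other tensor type is rejected.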
bool BenchmarkModel::setInput(const uint8_t* dataPtr, size_t length) {
  int input = mTfliteInterpreter->inputs()[0];
  auto* input_tensor = mTfliteInterpreter->tensor(input);

  switch (input_tensor->type) {
    case kTfLiteFloat32:
    case kTfLiteUInt8: {
      void* raw = input_tensor->data.raw;
      memcpy(raw, dataPtr, length);
      break;
    }
    default:
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                          "Input tensor type not supported");
      return false;
  }
  return true;
}
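// Appends the raw bytes of the output tensor at |output_index| to the
// corresponding buffer in |result| so callers can inspect or persist them.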
void BenchmarkModel::saveInferenceOutput(InferenceResult* result,
                                         int output_index) {
  int output = mTfliteInterpreter->outputs()[output_index];
  auto* output_tensor = mTfliteInterpreter->tensor(output);
  auto& sink = result->inferenceOutputs[output_index];
  sink.insert(sink.end(), output_tensor->data.uint8,
              output_tensor->data.uint8 + output_tensor->bytes);
}

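// Compares the output tensor at |output_index| against |expected_data| and
// stores the mean square error and the largest single-element error in
// |result|. Aborts if the expected and actual byte sizes differ.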
void BenchmarkModel::getOutputError(const uint8_t* expected_data, size_t length,
                                    InferenceResult* result, int output_index) {
  int output = mTfliteInterpreter->outputs()[output_index];
  auto* output_tensor = mTfliteInterpreter->tensor(output);
  if (output_tensor->bytes != length) {
    FATAL("Wrong size of output tensor, expected %zu, is %zu", length,
          output_tensor->bytes);
  }

  size_t elements_count = 0;
  float err_sum = 0.0;
  float max_error = 0.0;
  switch (output_tensor->type) {
    case kTfLiteUInt8: {
      uint8_t* output_raw = mTfliteInterpreter->typed_tensor<uint8_t>(output);
      elements_count = output_tensor->bytes;
      for (size_t i = 0; i < output_tensor->bytes; ++i) {
        float err = ((float)output_raw[i]) - ((float)expected_data[i]);
        if (err > max_error) max_error = err;
        err_sum += err * err;
      }
      break;
    }
    case kTfLiteFloat32: {
      const float* expected = reinterpret_cast<const float*>(expected_data);
      float* output_raw = mTfliteInterpreter->typed_tensor<float>(output);
      elements_count = output_tensor->bytes / sizeof(float);
      for (size_t i = 0; i < output_tensor->bytes / sizeof(float); ++i) {
        float err = output_raw[i] - expected[i];
        if (err > max_error) max_error = err;
        err_sum += err * err;
      }
      break;
    }
    default:
      FATAL("Output tensor type %d not supported", output_tensor->type);
  }
  result->meanSquareErrors[output_index] = err_sum / elements_count;
  result->maxSingleErrors[output_index] = max_error;
}

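// Resizes the model's input tensor to |shape| and reallocates all tensors;
// returns false if allocation fails.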
bool BenchmarkModel::resizeInputTensors(std::vector<int> shape) {
  // The benchmark only expects a single input tensor, hardcoded as 0.
  int input = mTfliteInterpreter->inputs()[0];
  mTfliteInterpreter->ResizeInputTensor(input, shape);
  if (mTfliteInterpreter->AllocateTensors() != kTfLiteOk) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to allocate tensors!");
    return false;
  }
  return true;
}

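// Invokes the interpreter once. When the NNAPI delegate is active, its errno
// is checked as well so that delegate-side failures are reported.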
bool BenchmarkModel::runInference() {
  auto status = mTfliteInterpreter->Invoke();
  auto nnapi_errno = mTfliteNnapiDelegate
                         ? mTfliteNnapiDelegate->GetNnApiErrno()
                         : ANEURALNETWORKS_NO_ERROR;
  if (status != kTfLiteOk || nnapi_errno != ANEURALNETWORKS_NO_ERROR) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to invoke, tflite status: %d, nnapi errno: %d!",
                        (int)status, nnapi_errno);
    return false;
  }
  return true;
}

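// Resets all variable tensors (state carried between invocations, e.g. for
// recurrent models) so each inference sequence starts from a clean state.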
bool BenchmarkModel::resetStates() {
  auto status = mTfliteInterpreter->ResetVariableTensors();
  if (status != kTfLiteOk) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to reset variable tensors: %d!", (int)status);
    return false;
  }
  return true;
}

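// Runs up to |seqInferencesMaxCount| input/output sequences, cycling through
// |inOutData|, and appends one InferenceResult (latency plus, depending on
// |flags|, error metrics and captured outputs) per inference to |results|.
// |timeout| is a soft limit, in seconds, on the accumulated inference time.
//
// Typical call (sketch only; the real call sites live elsewhere in the
// benchmark harness):
//   std::vector<InferenceResult> results;
//   model->benchmark(data, /*seqInferencesMaxCount=*/100, /*timeout=*/10.0f,
//                    /*flags=*/0, &results);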
bool BenchmarkModel::benchmark(
    const std::vector<InferenceInOutSequence>& inOutData,
    int seqInferencesMaxCount, float timeout, int flags,
    std::vector<InferenceResult>* results) {
  if (inOutData.empty()) {
    __android_log_print(ANDROID_LOG_WARN, LOG_TAG,
                        "Input/output vector is empty");
    return true;
  }

  float inferenceTotal = 0.0;
  for (int seqInferenceIndex = 0; seqInferenceIndex < seqInferencesMaxCount;
       ++seqInferenceIndex) {
    resetStates();

    const int inputOutputSequenceIndex = seqInferenceIndex % inOutData.size();
    const InferenceInOutSequence& seq = inOutData[inputOutputSequenceIndex];
    for (int i = 0; i < seq.size(); ++i) {
      const InferenceInOut& data = seq[i];

      // For NNAPI systrace usage documentation, see
      // frameworks/ml/nn/common/include/Tracing.h.
      kTraceFunc.ATrace_beginSection("[NN_LA_PE]BenchmarkModel::benchmark");
      kTraceFunc.ATrace_beginSection("[NN_LA_PIO]BenchmarkModel::input");
      if (data.input) {
        setInput(data.input, data.input_size);
      } else {
        int input = mTfliteInterpreter->inputs()[0];
        auto* input_tensor = mTfliteInterpreter->tensor(input);
        if (!data.createInput((uint8_t*)input_tensor->data.raw,
                              input_tensor->bytes)) {
          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                              "Input creation %d failed", i);
          return false;
        }
      }
      kTraceFunc.ATrace_endSection();
      long long startTime = currentTimeInUsec();
      const bool success = runInference();
      kTraceFunc.ATrace_endSection();
      long long endTime = currentTimeInUsec();
      if (!success) {
        __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Inference %d failed",
                            i);
        return false;
      }

      float inferenceTime =
          static_cast<float>(endTime - startTime) / 1000000.0f;
      size_t outputsCount = mTfliteInterpreter->outputs().size();
      InferenceResult result{
          inferenceTime, {}, {}, {}, inputOutputSequenceIndex, i};
      result.meanSquareErrors.resize(outputsCount);
      result.maxSingleErrors.resize(outputsCount);
      result.inferenceOutputs.resize(outputsCount);

      if ((flags & FLAG_IGNORE_GOLDEN_OUTPUT) == 0) {
        if (outputsCount != data.outputs.size()) {
          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                              "Golden/actual outputs (%zu/%zu) count mismatch",
                              data.outputs.size(), outputsCount);
          return false;
        }
        for (int j = 0; j < outputsCount; ++j) {
          getOutputError(data.outputs[j].ptr, data.outputs[j].size, &result, j);
        }
      }

      if ((flags & FLAG_DISCARD_INFERENCE_OUTPUT) == 0) {
        for (int j = 0; j < outputsCount; ++j) {
          saveInferenceOutput(&result, j);
        }
      }
      results->push_back(result);
      inferenceTotal += inferenceTime;
    }

    // Timeout?
    if (timeout > 0.001 && inferenceTotal > timeout) {
      return true;
    }
  }
  return true;
}

// If cacheDir is not nullptr, compilation caching will be used with NNAPI.
bool BenchmarkModel::runCompilation(const char* cacheDir) {
  std::unique_ptr<tflite::Interpreter> interpreter;
  tflite::ops::builtin::BuiltinOpResolver resolver;
  tflite::InterpreterBuilder(*mTfliteModel, resolver)(&interpreter);
  if (!interpreter) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to create TFlite interpreter");
    return false;
  }

  // Allow Fp16 precision for all models.
  interpreter->SetAllowFp16PrecisionForFp32(true);

  if (mUseNnApi) {
    tflite::StatefulNnApiDelegate::Options nnapi_options;
    nnapi_options.accelerator_name = mNnApiDeviceName.empty() ? nullptr : mNnApiDeviceName.c_str();
    if (cacheDir) {
      nnapi_options.cache_dir = cacheDir;
      nnapi_options.model_token = mModelFile.c_str();
    }
    tflite::StatefulNnApiDelegate delegate(nnapi_options);
    int delegationStatus = interpreter->ModifyGraphWithDelegate(&delegate);
    auto nnapiErrno = delegate.GetNnApiErrno();
    if (delegationStatus != kTfLiteOk || nnapiErrno != ANEURALNETWORKS_NO_ERROR) {
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                          "Failed to initialize NNAPI Delegate for model %s, nnapi_errno is %d",
                          mModelFile.c_str(), nnapiErrno);
      return false;
    }
  }
  return true;
}

// A helper class to manage the lifetime of a temporary cache directory.
class ScopedTempDirectory {
 public:
  ScopedTempDirectory(std::string base) : mBase(std::move(base)) {}
  ~ScopedTempDirectory() { cleanup(); }

  // Create a new temp directory, remove the old one if needed.
  void recreate() {
    cleanup();
    mTempDir = mBase + "/XXXXXX";
    mkdtemp(&mTempDir[0]);
  }

  // Get the path to the temp directory.
  const char* get() const { return mTempDir.empty() ? nullptr : mTempDir.c_str(); }

 private:
  void cleanup() {
    if (mTempDir.empty()) {
      return;
    }
    auto callback = [](const char* entry, const struct stat*, int, struct FTW*) {
      return remove(entry);
    };
    nftw(mTempDir.c_str(), callback, 128, FTW_DEPTH | FTW_MOUNT | FTW_PHYS);
    mTempDir.clear();
  }

  std::string mBase;
  std::string mTempDir;
};

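// Compiles the model once with caching enabled into a fresh temporary
// directory, then reports the total size, in bytes, of the regular files the
// compilation wrote there.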
bool BenchmarkModel::getCompilationCacheSize(int* cacheSizeBytes) {
  if (cacheSizeBytes == nullptr) return false;

  // Create cache files.
  ScopedTempDirectory tempDir(mCacheDir.value());
  tempDir.recreate();
  const bool success = runCompilation(tempDir.get());
  if (!success) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Save to cache failed");
    return false;
  }

  // Compute total size of cache files.
  int totalSize = 0;
  DIR* dir = opendir(tempDir.get());
  if (dir == nullptr) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to open cache directory");
    return false;
  }
  struct dirent* dp = nullptr;
  while ((dp = readdir(dir)) != nullptr) {
    char fullPath[1024];
    snprintf(fullPath, 1024, "%s/%s", tempDir.get(), dp->d_name);
    struct stat st;
    int err = stat(fullPath, &st);
    if (err != 0) {
      closedir(dir);
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to stat %s", fullPath);
      return false;
    }
    // Only accumulate sizes of regular files. This will exclude '.' and '..'.
    if (S_ISREG(st.st_mode)) {
      totalSize += st.st_size;
    }
  }
  closedir(dir);
  *cacheSizeBytes = totalSize;
  return true;
}

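// Measures compilation time for one benchmark mode: plain compilation
// (WITHOUT_CACHE), compilation that writes cache files (SAVE_TO_CACHE), or
// compilation that reads previously written cache files (PREPARE_FROM_CACHE).
// Runs up to |maxNumIterations| iterations, or until |timeout| seconds of
// accumulated compilation time, and appends per-iteration times to |results|
// when it is non-null.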
bool BenchmarkModel::benchmarkSingleTypeOfCompilation(CompilationBenchmarkType type,
                                                      int maxNumIterations, float timeout,
                                                      std::vector<float>* results) {
  if (results != nullptr) {
    results->clear();
  }
  ScopedTempDirectory tempDir(mCacheDir.value());

  // Initialize cache files to benchmark cache hit.
  if (type == CompilationBenchmarkType::PREPARE_FROM_CACHE) {
    tempDir.recreate();
    const bool success = runCompilation(tempDir.get());
    if (!success) {
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Save to cache failed");
      return false;
    }
  }

  float compilationTotal = 0.0;
  for (int i = 0; i < maxNumIterations; i++) {
    const char* cacheDir = nullptr;
    switch (type) {
      case CompilationBenchmarkType::WITHOUT_CACHE:
        cacheDir = nullptr;
        break;
      case CompilationBenchmarkType::SAVE_TO_CACHE:
        // Remove the cache files from the last iteration to benchmark cache miss.
        tempDir.recreate();
        [[fallthrough]];
      case CompilationBenchmarkType::PREPARE_FROM_CACHE:
        cacheDir = tempDir.get();
        break;
      default:
        __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Unknown CompilationBenchmarkType: %d",
                            static_cast<int>(type));
        return false;
    }

    kTraceFunc.ATrace_beginSection("[NN_LA_PC]BenchmarkModel::benchmarkCompilation");
    const long long startTime = currentTimeInUsec();
    const bool success = runCompilation(cacheDir);
    const long long endTime = currentTimeInUsec();
    kTraceFunc.ATrace_endSection();
    if (!success) {
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Compilation %d failed", i);
      return false;
    }

    const float compilationTime = static_cast<float>(endTime - startTime) / 1000000.0f;
    if (results != nullptr) {
      results->push_back(compilationTime);
    }

    // Timeout?
    compilationTotal += compilationTime;
    if (timeout > 0.001 && compilationTotal > timeout) {
      return true;
    }
  }
  return true;
}

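// Same as benchmarkSingleTypeOfCompilation(), but first runs a warmup pass
// whose timings are discarded before the measured pass.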
bool BenchmarkModel::benchmarkSingleTypeOfCompilationWithWarmup(CompilationBenchmarkType type,
                                                                int maxNumIterations,
                                                                float warmupTimeout,
                                                                float runTimeout,
                                                                std::vector<float>* results) {
  kTraceFunc.ATrace_beginSection(
          "[NN_LA_PWM]BenchmarkModel::benchmarkSingleTypeOfCompilationWithWarmup");
  bool success = benchmarkSingleTypeOfCompilation(type, maxNumIterations, warmupTimeout, nullptr);
  kTraceFunc.ATrace_endSection();
  if (!success) return false;

  kTraceFunc.ATrace_beginSection(
          "[NN_LA_PBM]BenchmarkModel::benchmarkSingleTypeOfCompilationWithWarmup");
  success = benchmarkSingleTypeOfCompilation(type, maxNumIterations, runTimeout, results);
  kTraceFunc.ATrace_endSection();
  return success;
}

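// Top-level compilation benchmark: measures compilation without cache, then
// the compilation cache size, and, only when cache files are actually
// produced, the save-to-cache and prepare-from-cache times.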
bool BenchmarkModel::benchmarkCompilation(int maxNumIterations, float warmupTimeout,
                                          float runTimeout, CompilationBenchmarkResult* result) {
  if (result == nullptr) return false;

  // Benchmark compile without cache.
  bool success = benchmarkSingleTypeOfCompilationWithWarmup(
          CompilationBenchmarkType::WITHOUT_CACHE, maxNumIterations, warmupTimeout, runTimeout,
          &result->compileWithoutCacheTimeSec);
  if (!success) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to benchmark compilation without cache");
    return false;
  }

  // Get compilation cache size.
  success = getCompilationCacheSize(&result->cacheSizeBytes);
  if (!success) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to retrieve compilation cache size");
    return false;
  }

  // Benchmark saving to cache and preparing from cache only if supported.
  if (result->cacheSizeBytes > 0) {
    // Benchmark saving to cache.
    auto& saveToCacheTimeSec = result->saveToCacheTimeSec.emplace();
    success = benchmarkSingleTypeOfCompilationWithWarmup(
            CompilationBenchmarkType::SAVE_TO_CACHE, maxNumIterations, warmupTimeout, runTimeout,
            &saveToCacheTimeSec);
    if (!success) {
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to benchmark saving to cache");
      return false;
    }

    // Benchmark preparing from cache.
    auto& prepareFromCacheTimeSec = result->prepareFromCacheTimeSec.emplace();
    success = benchmarkSingleTypeOfCompilationWithWarmup(
            CompilationBenchmarkType::PREPARE_FROM_CACHE, maxNumIterations, warmupTimeout,
            runTimeout, &prepareFromCacheTimeSec);
    if (!success) {
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to benchmark preparing from cache");
      return false;
    }
  }
  return true;
}

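// Runs every sequence in |inOutData| once and writes the raw contents of each
// dumped tensor to a file under |path|. The set of tensors to dump is the
// |outputs| list populated by init() when enable_intermediate_tensors_dump is
// true (i.e. the output of every op).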
bool BenchmarkModel::dumpAllLayers(
    const char* path, const std::vector<InferenceInOutSequence>& inOutData) {
  if (inOutData.empty()) {
    FATAL("Input/output vector is empty");
  }

  for (int seqInferenceIndex = 0; seqInferenceIndex < inOutData.size();
       ++seqInferenceIndex) {
    resetStates();

    const InferenceInOutSequence& seq = inOutData[seqInferenceIndex];
    for (int i = 0; i < seq.size(); ++i) {
      const InferenceInOut& data = seq[i];
      setInput(data.input, data.input_size);
      const bool success = runInference();
      if (!success) {
        __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Inference %d failed",
                            i);
        return false;
      }

      // Tensors are dumped in the order they were collected in init(), which
      // is not sorted by tensor index.
      for (int tensor_order = 0; tensor_order < outputs.size(); ++tensor_order) {
        int tensor_index = outputs[tensor_order];
        auto* output_tensor = mTfliteInterpreter->tensor(tensor_index);
        if (output_tensor->data.raw == nullptr) {
          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                      "output_tensor->data.raw == nullptr at index %d ", tensor_index);
          continue;
        }
        char fullpath[1024];
        snprintf(fullpath, 1024, "%s/dump_%.3d_seq_%.3d_order_%.3d_tensor_%.3d", path,
                 seqInferenceIndex, i, tensor_order, tensor_index);
        FILE* f = fopen(fullpath, "wb");
        if (f == nullptr) {
          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to open %s for writing",
                              fullpath);
          return false;
        }
        fwrite(output_tensor->data.raw, output_tensor->bytes, 1, f);
        fclose(f);
      }
    }
  }
  return true;
}