如果需要运行 MaskTextSpotter,最少需要 4GB 显存;低于这个容量将无法运行。
安装最新版本的 cuda-10.1,低版本的编译会出问题:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
# 卸载之前已经安装的cuda $ sudo apt-get remove nvidia-cuda-toolkit $ wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin $ sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600 $ wget http://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda-repo-ubuntu1804-10-1-local-10.1.243-418.87.00_1.0-1_amd64.deb $ sudo dpkg -i cuda-repo-ubuntu1804-10-1-local-10.1.243-418.87.00_1.0-1_amd64.deb $ sudo apt-key add /var/cuda-repo-10-1-local-10.1.243-418.87.00/7fa2af80.pub $ sudo apt-get update $ sudo apt-get -y install cuda # 部分驱动可能会更新,需要执行更新,否则可能依旧不正常 $ sudo apt-get dist-upgrade $ sudo apt-get autoremove # 可能需要删除一下XWindow的配置文件,否则驱动可能不能正常加载 $ sudo rm -rf ~/.Xauthority # 如果出现如下错误 # ubuntu 18.04 "nvidia-340 导致 /usr/lib/x86_64-linux-gnu/libGL.so.1 # 转移到 /usr/lib/x86_64-linux-gnu/libGL.so.1.distrib" # 参考 http://www.mobibrw.com/?p=21739 # 删除安装源,可以节约几个GB的磁盘,安装完成后这部分已经用不上了 $ sudo apt-get remove --purge cuda-repo-ubuntu1804-10-1-local-10.1.243-418.87.00 $ sudo apt-get update # 部分驱动可能会更新,需要执行更新,否则可能依旧不正常 $ sudo apt-get dist-upgrade $ sudo apt-get autoremove |
配置独立环境
1 2 3 4 5 6 7 |
# first, make sure that your conda is setup properly with the right environment # for that, check that `which conda`, `which pip` and `which python` points to the # right path. From a clean conda env, this is what you need to do # conda remove -n MaskTextSpotter --all $ conda create -n MaskTextSpotter -y python=3.6.8 pip |
编译安装 PyTorch
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
$ sudo apt-get install git # 进入运行环境 $ source activate MaskTextSpotter $ conda install numpy pyyaml mkl=2019.1 mkl-include=2019.1 setuptools cmake cffi typing pybind11 $ conda install ninja # magma-cuda90 magma-cuda91 magma-cuda92 会编译失败 $ conda install -c pytorch magma-cuda101 $ git clone https://github.com/pytorch/pytorch # 也可直接本站下载一份同步好的代码 wget https://www.mobibrw.com/wp-content/uploads/2019/11/pytorch.zip $ cd pytorch # pytorch 1.0.1 版本支持“Compute Capability” 低于3.0版本的硬件,pytorch 1.2.0需要至少3.5版本的硬件才可以正常运行 # https://github.com/pytorch/pytorch/blob/v1.3.0/torch/utils/cpp_extension.py $ git checkout v1.0.1 -b v1.0.1 $ git submodule sync $ git submodule update --init --recursive $ export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} $ python setup.py clean # 卸载以前安装的pytorch $ conda uninstall pytorch $ pip uninstall pytorch # 从Nvidia开发网站查询到自己硬件对应的“Compute Capability” # 比如 “GeForce GTX 760” 对应 “3.0” 计算能力,能力不正确会导致运行异常 # RuntimeError: cuda runtime error (48) : no kernel image is available for execution on the device $ TORCH_CUDA_ARCH_LIST="3.0" python setup.py install # 一定要退出 pytorch 的编译目录,在pytorch代码目录下执行命令会出现异常 $ cd .. # 退出环境 $ conda deactivate |
如果出现如下错误:
1 2 3 4 5 |
[ 68%] Building NVCC (Device) object caffe2/CMakeFiles/caffe2_gpu.dir/__/aten/src/ATen/native/sparse/cuda/caffe2_gpu_generated_SparseCUDABlas.cu.o ~/pytorch/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu(58): error: more than one instance of function "at::native::sparse::cuda::cusparseGetErrorString" matches the argument list: function "cusparseGetErrorString(cusparseStatus_t)" function "at::native::sparse::cuda::cusparseGetErrorString(cusparseStatus_t)" argument types are: (cusparseStatus_t) |
则需要调整代码 `aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu`,用 `#if (!((CUSPARSE_VER_MAJOR >= 10) && (CUSPARSE_VER_MINOR >= 2)))` 和 `#endif` 把其中的 `cusparseGetErrorString` 函数包起来,如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
// Compile this local helper only when the version test below fails, i.e. when
// the cuSPARSE headers are not reported as (major >= 10 and minor >= 2).
// Newer cuSPARSE already declares a global cusparseGetErrorString with the
// same signature, and defining a second one makes calls ambiguous
// ("more than one instance of function ... matches the argument list").
// NOTE(review): the minor check is independent of the major check, so a
// version such as 11.0 would also satisfy this #if and re-enable the local
// definition -- confirm against the cuSPARSE versions you intend to build with.
#if (!((CUSPARSE_VER_MAJOR >= 10) && (CUSPARSE_VER_MINOR >= 2)))
// Map a cuSPARSE status code to a short human-readable description.
// Any status not listed explicitly yields "unknown error".
const char* cusparseGetErrorString(cusparseStatus_t status) {
  switch(status)
  {
    case CUSPARSE_STATUS_SUCCESS:
      return "success";

    case CUSPARSE_STATUS_NOT_INITIALIZED:
      return "library not initialized";

    case CUSPARSE_STATUS_ALLOC_FAILED:
      return "resource allocation failed";

    case CUSPARSE_STATUS_INVALID_VALUE:
      return "an invalid numeric value was used as an argument";

    case CUSPARSE_STATUS_ARCH_MISMATCH:
      return "an absent device architectural feature is required";

    case CUSPARSE_STATUS_MAPPING_ERROR:
      return "an access to GPU memory space failed";

    case CUSPARSE_STATUS_EXECUTION_FAILED:
      return "the GPU program failed to execute";

    case CUSPARSE_STATUS_INTERNAL_ERROR:
      return "an internal operation failed";

    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
      return "the matrix type is not supported by this function";

    case CUSPARSE_STATUS_ZERO_PIVOT:
      return "an entry of the matrix is either structural zero or numerical zero (singular block)";

    default:
      return "unknown error";
  }
}
#endif
这样解决跟 CUDA-10.1 自带函数的冲突问题。
具体参考: https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu
编译安装 TorchVision
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 |
$ sudo apt-get install git # 进入运行环境 $ source activate MaskTextSpotter $ git clone https://github.com/pytorch/vision.git # 也可本站下载一份拷贝 wget https://www.mobibrw.com/wp-content/uploads/2019/11/vision.zip $ cd vision $ git checkout v0.2.1 -b v0.2.1 $ python setup.py install # 退出环境 $ conda deactivate |
源代码编译
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
$ source activate MaskTextSpotter # this installs the right pip and dependencies for the fresh python $ conda install ipython pip # python dependencies $ pip install ninja yacs cython matplotlib tqdm opencv-python shapely scipy tensorboardX $ export INSTALL_DIR=$PWD # install pycocotools $ cd $INSTALL_DIR $ git clone https://github.com/cocodataset/cocoapi.git $ cd cocoapi/PythonAPI $ python setup.py build_ext install # 本站下载 https://www.mobibrw.com/wp-content/uploads/2019/11/cocoapi.zip # install apex (optional) $ cd $INSTALL_DIR $ git clone https://github.com/NVIDIA/apex.git $ cd apex $ python setup.py install --cuda_ext --cpp_ext # 本站下载 wget https://www.mobibrw.com/wp-content/uploads/2019/11/apex.zip # clone repo $ cd $INSTALL_DIR $ git clone https://github.com/MhLiao/MaskTextSpotter.git $ cd MaskTextSpotter # 本站下载 wget https://www.mobibrw.com/wp-content/uploads/2019/11/MaskTextSpotter.zip # build $ python setup.py build develop $ unset INSTALL_DIR |
准备测试数据
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
# 创建目录(源代码根目录) $ mkdir outputs $ cd outputs $ mkdir finetune $ cd finetune # 下载已经训练好的模型 https://drive.google.com/open?id=1pPRS7qS_K1keXjSye0kksqhvoyD0SARz # 本站下载 $ wget https://www.mobibrw.com/wp-content/uploads/2019/11/model_finetune.zip $ unzip model_finetune.zip $ cd ../../ $ mkdir datasets $ cd datasets # 下载 icdar2013 数据集 $ wget https://www.mobibrw.com/wp-content/uploads/2019/11/icdar2013.zip $ unzip icdar2013.zip $ cd icdar2013 # 下载测试集文件 $ git clone https://github.com/zazaliu/ICDAR2PASCAL_VOC.git # 本站下载 wget https://www.mobibrw.com/wp-content/uploads/2019/11/ICDAR2PASCAL_VOC.zip $ cp -r ICDAR2PASCAL_VOC/ICDAR2015/ch4_training_localization_transcription_gt/ test_gts # 执行测试 $ cd ../../ # 预先删除生成的文件,否则可能会启动之后就崩溃退出 $ rm -rf outputs/finetune/inference/ $ bash test.sh |
执行测试的时候,如果出现如下错误信息:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
File "tools/test_net.py", line 95, in <module> main() File "tools/test_net.py", line 89, in main cfg=cfg, File "~/MaskTextSpotter/maskrcnn_benchmark/engine/text_inference.py", line 380, in inference predictions = compute_on_dataset(model, data_loader, device) File "~/MaskTextSpotter/maskrcnn_benchmark/engine/text_inference.py", line 55, in compute_on_dataset for i, batch in tqdm(enumerate(data_loader)): File "~.conda/envs/MaskTextSpotter/lib/python3.6/site-packages/tqdm/std.py", line 1091, in __iter__ for obj in iterable: File "~.conda/envs/MaskTextSpotter/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 637, in __next__ return self._process_next_batch(batch) File "~.conda/envs/MaskTextSpotter/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 658, in _process_next_batch raise batch.exc_type(batch.exc_msg) ValueError: Traceback (most recent call last): File "~.conda/envs/MaskTextSpotter/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 138, in _worker_loop samples = collate_fn([dataset[i] for i in batch_indices]) File "~.conda/envs/MaskTextSpotter/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 138, in <listcomp> samples = collate_fn([dataset[i] for i in batch_indices]) File "~/MaskTextSpotter/maskrcnn_benchmark/data/datasets/icdar.py", line 32, in __getitem__ words,boxes,charsbbs,segmentations=self.load_gt_from_txt(gt_path,height,width) File "~/MaskTextSpotter/maskrcnn_benchmark/data/datasets/icdar.py", line 94, in load_gt_from_txt strs, loc = self.line2boxes(line) File "~/MaskTextSpotter/maskrcnn_benchmark/data/datasets/icdar.py", line 153, in line2boxes loc = np.vstack(v).transpose() File "<__array_function__ internals>", line 6, in vstack File "~.conda/envs/MaskTextSpotter/lib/python3.6/site-packages/numpy/core/shape_base.py", line 282, in vstack return _nx.concatenate(arrs, 0) File "<__array_function__ internals>", line 6, in concatenate ValueError: all the input array dimensions for the 
concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 2 and the array at index 1 has size 1 |
那么问题出现的原因是 `maskrcnn_benchmark/data/datasets/icdar.py` 解析文件的时候,遇到了 `478,239,511,241,511,255,478,253,$5,000` 这样的数据:文本内容 `$5,000` 本身包含逗号,按逗号切分后得到 10 个字段而不是 9 个,导致各坐标数组长度不一致。测试代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
# Minimal reproduction of the parsing failure: the transcription "$5,000"
# at the end of the sample line contains a comma of its own.
import numpy as np

line = '478,239,511,241,511,255,478,253,$5,000'


def line2boxes(line):
    """Parse one comma-separated ground-truth line into texts and box corners.

    Fields are read with a stride of 9: eight corner coordinates
    (x1, y1, x2, y2, x3, y3, x4, y4) followed by one transcription field.

    Returns:
        A tuple ``(strs, loc)``: the list of transcription fields, and an
        array holding one row of eight integer coordinates per box.

    Raises:
        ValueError: from ``np.vstack`` when the coordinate arrays differ in
            length, which is exactly what the sample line above triggers.
    """
    # Unlimited split: "$5,000" becomes two fields ("$5" and "000"), so the
    # sample line yields 10 fields instead of 9.
    parts = line.strip().split(',')
    # Drop the first three characters when the first field contains the UTF-8
    # BOM bytes as three separate characters.
    if '\xef\xbb\xbf' in parts[0]:
        parts[0] = parts[0][3:]
    # Remove the BOM when it is present as the single code point U+FEFF.
    if '\ufeff' in parts[0]:
        parts[0] = parts[0].replace('\ufeff', '')
    # With 10 fields, parts[::9] picks indices 0 and 9 (two values) while every
    # other stride-9 slice below picks a single value.
    x1 = np.array([int(float(x)) for x in parts[::9]])
    y1 = np.array([int(float(x)) for x in parts[1::9]])
    x2 = np.array([int(float(x)) for x in parts[2::9]])
    y2 = np.array([int(float(x)) for x in parts[3::9]])
    x3 = np.array([int(float(x)) for x in parts[4::9]])
    y3 = np.array([int(float(x)) for x in parts[5::9]])
    x4 = np.array([int(float(x)) for x in parts[6::9]])
    y4 = np.array([int(float(x)) for x in parts[7::9]])
    strs = parts[8::9]
    # Debug output: for the sample line x1 has two entries.
    print(x1)
    # Fails here for the sample line: x1 has length 2, the other arrays length 1.
    loc = np.vstack((x1, y1, x2, y2, x3, y3, x4, y4)).transpose()
    print(loc)
    return strs, loc


# Run the reproduction on the sample line.
line2boxes(line)
修正后的代码如下(将 `split(',')` 改为 `split(',', 8)`,最多切分 8 次,使文本内容中的逗号保留在最后一个字段中):
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
# Fixed version of the parser, run against the same sample line whose
# transcription "$5,000" contains a comma of its own.
import numpy as np

line = '478,239,511,241,511,255,478,253,$5,000'


def line2boxes(line):
    """Parse one comma-separated ground-truth line into texts and box corners.

    The line is split at most 8 times, so it yields at most 9 fields: eight
    corner coordinates (x1, y1, x2, y2, x3, y3, x4, y4) and one transcription
    field that keeps any commas it contains.

    Returns:
        A tuple ``(strs, loc)``: the list of transcription fields, and an
        array holding one row of eight integer coordinates.
    """
    # maxsplit=8 stops splitting after the eighth comma; everything after it
    # (including further commas) stays in the last field.
    parts = line.strip().split(',', 8)
    # Drop the first three characters when the first field contains the UTF-8
    # BOM bytes as three separate characters.
    if '\xef\xbb\xbf' in parts[0]:
        parts[0] = parts[0][3:]
    # Remove the BOM when it is present as the single code point U+FEFF.
    if '\ufeff' in parts[0]:
        parts[0] = parts[0].replace('\ufeff', '')
    # With at most 9 fields, each stride-9 slice selects at most one element.
    x1 = np.array([int(float(x)) for x in parts[::9]])
    y1 = np.array([int(float(x)) for x in parts[1::9]])
    x2 = np.array([int(float(x)) for x in parts[2::9]])
    y2 = np.array([int(float(x)) for x in parts[3::9]])
    x3 = np.array([int(float(x)) for x in parts[4::9]])
    y3 = np.array([int(float(x)) for x in parts[5::9]])
    x4 = np.array([int(float(x)) for x in parts[6::9]])
    y4 = np.array([int(float(x)) for x in parts[7::9]])
    strs = parts[8::9]
    # Debug output: for the sample line x1 now has a single entry.
    print(x1)
    # Stack the eight coordinate arrays as rows, then transpose so that each
    # row of loc holds the eight coordinates of one box.
    loc = np.vstack((x1, y1, x2, y2, x3, y3, x4, y4)).transpose()
    print(loc)
    return strs, loc


# Run the fixed parser on the sample line.
line2boxes(line)
如果出现其他错误,可能是中途安装、卸载软件造成的版本冲突,此时可以直接删除环境,重新创建一个干净的环境后再次构建。