• NVIDIA DALI学习:数据加载


    DALI的工作流, 如下图:

    • 读取数据
    • 图像解码和变换,可以放到GPU上进行,也是加速的关键
    • 生成处理好的数据, 导出给计算引擎
      (原文此处为DALI工作流示意图,图片未能保留)

    测试用例

    import ctypes
    
    import numpy as np
    import nvidia.dali.fn as fn
    import nvidia.dali.types as types
    # import pycuda.driver as cuda
    from nvidia.dali.backend import TensorGPU, TensorListGPU
    from nvidia.dali.pipeline import Pipeline
    import cv2
    
    class ExternalInputIterator(object):
        """Iterator that turns queued file names into batches of raw file bytes.

        File names are queued with :meth:`feed`; each call to ``next()`` removes
        ``batch_size`` names from the end of the queue, reads the files and
        returns their contents as 1-D ``uint8`` NumPy arrays.

        Args:
            batch_size: Number of files consumed per batch.
            image_dir: Prefix joined to every file name by plain string
                concatenation, so it must already end with a path separator
                when it names a directory.
        """

        def __init__(self, batch_size, image_dir=''):
            self.batch_size = batch_size
            self.files = []
            self.image_dir = image_dir

        def __iter__(self):
            """Reset the bookkeeping counters and return the iterator itself."""
            self.i = 0
            self.n = self.batch_size
            return self

        def feed(self, inputs):
            """Append the file names in ``inputs`` to the pending queue."""
            self.files.extend(inputs)

        def __next__(self):
            """Read and return the next batch.

            Returns:
                A list of ``batch_size`` ``uint8`` arrays, one per file, holding
                the undecoded file contents.

            Raises:
                StopIteration: Fewer than ``batch_size`` file names are queued.
            """
            if len(self.files) < self.batch_size:
                raise StopIteration()

            batch = []
            for _ in range(self.batch_size):
                # Names are taken from the end of the queue (last fed, first read).
                jpeg_filename = self.files.pop()
                # Context manager closes the handle; the previous bare open()
                # leaked one file descriptor per image.
                with open(self.image_dir + jpeg_filename, 'rb') as f:
                    batch.append(np.frombuffer(f.read(), dtype=np.uint8))
            return batch
    
    
    class DaliPipeline(object):
        """Lazily built DALI pipeline fed through an ExternalInputIterator.

        The graph is created on the first :meth:`load` call and reused afterwards.
        It reads encoded images from the external source on the CPU, decodes
        them with the ``"mixed"`` image decoder to BGR, and resizes them on the
        GPU so the shorter side is 224.

        Args:
            batch_size: Batch size used for both the pipeline and the iterator.
            device_id: Device id handed to the DALI ``Pipeline``.
            stream: Stored on the instance; the current graph does not use it.
        """

        def __init__(self, batch_size, device_id, stream=None):
            self.pipe = None
            self.stream = stream
            self.device_id = device_id
            self.batch_size = batch_size
            self.eii = ExternalInputIterator(batch_size)

        def _build(self):
            """Define and build the decode + resize graph, returning the pipeline."""
            graph = Pipeline(
                batch_size=self.batch_size,
                num_threads=1,
                device_id=self.device_id,
                exec_pipelined=False,
                exec_async=False,
            )
            with graph:
                encoded = fn.external_source(source=self.eii, device="cpu")
                decoded = fn.decoders.image(
                    encoded, device="mixed", output_type=types.BGR
                )
                resized = fn.resize(
                    decoded,
                    device="gpu",
                    resize_shorter=224,
                    interp_type=types.INTERP_TRIANGULAR,
                )
                graph.set_outputs(resized)
            graph.build()
            return graph

        def load(self, inputs):
            """Queue ``inputs`` (file names) and run the pipeline once.

            Returns:
                Whatever ``Pipeline.run()`` returns for this iteration.
            """
            if self.pipe is None:
                # Only remember the pipeline once it has been built successfully.
                self.pipe = self._build()
            self.eii.feed(inputs)
            return self.pipe.run()
    
    def test_pipeline(pipe, imgpathlist, count=1):
        """Benchmark ``pipe.load`` and report throughput in images per second.

        Args:
            pipe: Object exposing ``load(inputs)`` that returns exactly one output.
            imgpathlist: File names passed to every ``load`` call; its length is
                used as the batch size in the throughput figure.
            count: Number of ``load`` calls to time.

        Returns:
            Images per second (``count * len(imgpathlist) / elapsed``), or
            ``inf`` when the elapsed time measures as zero.
        """
        # Imported here because the file only imports ``time`` inside its
        # ``__main__`` guard; without this, calling the function after importing
        # the file as a module raises NameError.
        import time

        batchsize = len(imgpathlist)
        # perf_counter is monotonic and high resolution, suited to short timings.
        start = time.perf_counter()
        for _ in range(count):
            # Single-target unpacking doubles as a check for exactly one output.
            output_tensor, = pipe.load(imgpathlist)
        elapsed = time.perf_counter() - start

        # Guard against a zero reading instead of raising ZeroDivisionError.
        v = count * batchsize / elapsed if elapsed > 0 else float('inf')
        print('Time: %.3f Speed=%.3f img/sec' % (elapsed, v))
        return v


    # This is a benchmark helper, not a pytest test: its ``test_`` prefix would
    # otherwise make pytest collect it and fail on the missing fixtures.
    test_pipeline.__test__ = False
    
    
    if __name__ == '__main__':
        # ``time`` is imported here at module scope, so test_pipeline can see it
        # when the file is run as a script.
        import time
        import os

        batchsize = 8
        device_id = 0
        img_paths = ['img/building.jpg', 'img/lena.jpg', 'img/dog.png']

        for img_path in img_paths:
            # Each benchmark batch is the same image repeated ``batchsize`` times.
            imgpathlist = [img_path] * batchsize

            # A fresh pipeline is created and timed for every image.
            pipe = DaliPipeline(batchsize, device_id)
            v1 = test_pipeline(pipe, imgpathlist)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69
    • 70
    • 71
    • 72
    • 73
    • 74
    • 75
    • 76
    • 77
    • 78
    • 79
    • 80
    • 81
    • 82
    • 83
    • 84
    • 85
    • 86
    • 87
    • 88
    • 89
    • 90
    • 91
    • 92
    • 93
    • 94
    • 95
    • 96
    • 97
    • 98
    • 99
    • 100
    • 101
    • 102
  • 相关阅读:
    Curator实现Zookeeper分布式锁
    i.MX6ULL配置GPIO一共需要几步?
    Pytest----如何通过filterwarnings配置不显示告警或将告警报错
    vue2和vue3 的双向绑定原理
    屏幕录像推荐:Apeaksoft Screen Recorder 中文 for mac
    flutter EventBus
    VMware Fusion 13在M2芯片的Mac上安装 Windows 11
    git指令之git rebase的用法
    Lua系列文章(1)---Lua5.4参考手册学习总结
    使用 PHP 和 MySQL 的投票和投票系统
  • 原文地址:https://blog.csdn.net/zfjBIT/article/details/132941382