본문 바로가기

프로그래밍/iOS,macOS

[Metal] 이미지 렌더링~ 가우시안 블러~ Kernel 쉐이더

이전 가우시안 블러 샘플을 Kernel(compute) 쉐이더를 사용해 구현해 본다.
원본 이미지를 리사이징해 작은 텍스처를 만들고, 블러링은 이전 샘플과 동일하게 5회 적용하되 이번에는 kernel 쉐이더로 수행했다.
속도는 fragment 쉐이더와 큰 차이는 없다.

렌더타겟은 사이즈를 줄여 렌더링하는 용도로만 사용한다.
가로, 세로 작업을 위한 MTLTexture, MTLComputePipelineState 와 MTLFunction을 선언했다.

// Holds the Metal objects used by this sample: the device and command queue,
// the textures for the resize and blur passes, and the render/compute
// pipeline states and shader functions. Part of the class body is elided in
// the article (the lone "." lines near the end).
class Renderer:NSObject {
    var device:MTLDevice!
    
    var commandQueue: MTLCommandQueue!
    // Not referenced by any of the code shown in this article.
    var sharedDataPtr: UnsafeMutablePointer<SharedData>?
    
    // Vertex data for the image quad.
    var imageVertexBuffer: MTLBuffer!
    // Buffer holding the SharedData struct (tap count and Gaussian weights) read by the kernels.
    var sharedDataBuffer: MTLBuffer!
    // Pipeline used by render(view:) for the final pass into the view's drawable.
    var imagePipelineState: MTLRenderPipelineState!
    
    // Two work textures: the horizontal kernel writes workHTargetTexture,
    // the vertical kernel reads it and writes workVTargetTexture.
    var workHTargetTexture: MTLTexture?
    var workVTargetTexture: MTLTexture?

    
    // Render pass and pipeline used to draw the source image into imageResizeTexture.
    var renderPassDescriptor: MTLRenderPassDescriptor!
    var renderPipelineState: MTLRenderPipelineState!
    
    
    // Input image
    var imageTexture: MTLTexture?
    // Downscaled copy of the input image; input of the first blur pass.
    var imageResizeTexture: MTLTexture?
    var imageDepthState:MTLDepthStencilState!
    
    // Rendering shaders
    var imageVertexFunction: MTLFunction!
    var renderScreenFragmentFunction: MTLFunction!
    var renderTextureFragmentFunction: MTLFunction!
    
    // Compute pipeline states and kernel functions for the horizontal (H)
    // and vertical (V) blur passes.
    var computeHPipelineState: MTLComputePipelineState!
    var computeVPipelineState: MTLComputePipelineState!
    var computeHFunction: MTLFunction!
    var computeVFunction: MTLFunction!
    
    // Grabs the system default Metal device, then runs initMetal().
    override init() {
        super.init()
        
        self.device = MTLCreateSystemDefaultDevice()
        initMetal()
    }
    
    .
    .
    .
 }

 

초기화

kernel 함수 두 개를 추가한 부분 외에는 이전 샘플과 동일하다.

/// Creates the long-lived Metal objects: shader functions, command queue,
/// the quad vertex buffer and the shared-data buffer, then runs the
/// remaining init steps and loads the source image.
///
/// Prints "[Renderer.initMetal] init error" and returns early when no Metal
/// device is available or the default library cannot be loaded.
func initMetal() {
  // `device` is implicitly unwrapped and is nil when
  // MTLCreateSystemDefaultDevice() found no device, so bind it before use
  // instead of crashing on the first access.
  guard let device = self.device,
        let defaultLibrary = try? device.makeDefaultLibrary(bundle: Bundle(for: Renderer.self)) else {
    print("[Renderer.initMetal] init error")
    return
  }

  // Look up the shader entry points by name.
  imageVertexFunction = defaultLibrary.makeFunction(name: "imageVertexFunction")
  renderScreenFragmentFunction = defaultLibrary.makeFunction(name: "swapFragmentFunction")
  renderTextureFragmentFunction = defaultLibrary.makeFunction(name: "imageResizeFragmentFunction")
  computeHFunction = defaultLibrary.makeFunction(name: "gaussianBlurHFunction")
  computeVFunction = defaultLibrary.makeFunction(name: "gaussianBlurVFunction")

  self.commandQueue = device.makeCommandQueue()

  // Vertex buffer for the image quad.
  let size = kImagePlaneVertexData.count * MemoryLayout<Float>.size
  imageVertexBuffer = device.makeBuffer(bytes: kImagePlaneVertexData, length: size)
  imageVertexBuffer.label = "ImageVertexBuffer"

  // Shared-data buffer. The length is the struct size rounded down to a
  // multiple of 256 plus 256, i.e. a multiple of 256 that is strictly
  // larger than the struct.
  let sharedBufferSize = (MemoryLayout<SharedData>.size & ~0xFF) + 0x100
  sharedDataBuffer = device.makeBuffer(length: sharedBufferSize, options: .storageModeShared)
  sharedDataBuffer.label = "SharedBuffer"

  // initGaussianFilter() writes into sharedDataBuffer, so it has to run
  // after the buffer has been created above.
  initRederTarget()
  initSwapRender()
  initKernelTarget()
  initGaussianFilter()

  self.imageTexture = loadTexture(name:"sample", ext:"png")
}

 

렌더타겟 설정

기존 샘플과 큰 차이는 없으며, 작업용 텍스처를 별도로 2개 생성했다.

/// Sets up everything for the off-screen resize pass: the render pipeline,
/// the 256x256 resize texture, the two 256x256 blur work textures and the
/// render pass descriptor that targets the resize texture.
func initRederTarget() {
  // Vertex layout: two float2 attributes interleaved in buffer 0,
  // at byte offsets 0 and 8, with a 16-byte stride per vertex.
  let vertexLayout = MTLVertexDescriptor()
  vertexLayout.layouts[0].stride = 16
  vertexLayout.layouts[0].stepRate = 1
  vertexLayout.layouts[0].stepFunction = .perVertex
  vertexLayout.attributes[0].format = .float2
  vertexLayout.attributes[0].offset = 0
  vertexLayout.attributes[0].bufferIndex = 0
  vertexLayout.attributes[1].format = .float2
  vertexLayout.attributes[1].offset = 8
  vertexLayout.attributes[1].bufferIndex = 0

  // Pipeline that draws the source image into the resize texture.
  let pipelineDescriptor = MTLRenderPipelineDescriptor()
  pipelineDescriptor.label = "ImageResizeRenderPipeline"
  pipelineDescriptor.sampleCount = 1
  pipelineDescriptor.vertexDescriptor = vertexLayout
  pipelineDescriptor.vertexFunction = imageVertexFunction
  pipelineDescriptor.fragmentFunction = renderTextureFragmentFunction
  pipelineDescriptor.colorAttachments[0].pixelFormat = .bgra8Unorm
  pipelineDescriptor.depthAttachmentPixelFormat = .invalid

  do {
    let state = try self.device.makeRenderPipelineState(descriptor: pipelineDescriptor)
    self.renderPipelineState = state
  } catch {
    print("error=\(error.localizedDescription)")
  }

  // Both descriptors describe a private 256x256 BGRA8 2D texture and
  // differ only in their usage flags.
  func makeDescriptor(usage: MTLTextureUsage) -> MTLTextureDescriptor {
    let descriptor = MTLTextureDescriptor()
    descriptor.textureType = .type2D
    descriptor.width = 256
    descriptor.height = 256
    descriptor.pixelFormat = .bgra8Unorm
    descriptor.storageMode = .private
    descriptor.usage = usage
    return descriptor
  }

  // Resize target: rendered into, then read by the first blur pass.
  let resizeDescriptor = makeDescriptor(usage: [.renderTarget, .shaderRead])
  self.imageResizeTexture = self.device.makeTexture(descriptor: resizeDescriptor)

  // Work textures: read and written by the compute kernels.
  let workDescriptor = makeDescriptor(usage: [.shaderRead, .shaderWrite])
  self.workHTargetTexture = self.device.makeTexture(descriptor: workDescriptor)
  self.workVTargetTexture = self.device.makeTexture(descriptor: workDescriptor)

  // Render pass that clears to opaque black and stores into the resize texture.
  let pass = MTLRenderPassDescriptor()
  pass.colorAttachments[0].texture = self.imageResizeTexture
  pass.colorAttachments[0].loadAction = .clear
  pass.colorAttachments[0].clearColor = MTLClearColorMake(0, 0, 0, 1)
  pass.colorAttachments[0].storeAction = .store
  self.renderPassDescriptor = pass
}

 

커널 파이프라인 설정

/// Builds the compute pipeline states for the horizontal and vertical blur
/// kernels. Each one is created independently; a failure is printed and the
/// corresponding state is left unset.
func initKernelTarget() {
  // Horizontal pass
  do {
    let horizontalState = try self.device.makeComputePipelineState(function: self.computeHFunction)
    self.computeHPipelineState = horizontalState
  } catch {
    print("error=\(error.localizedDescription)")
  }

  // Vertical pass
  do {
    let verticalState = try self.device.makeComputePipelineState(function: self.computeVFunction)
    self.computeVPipelineState = verticalState
  } catch {
    print("error=\(error.localizedDescription)")
  }
}

 

가우시안 가중치 계산

/// Computes a 7-tap Gaussian kernel and stores it, normalized so the weights
/// sum to 1, in the `SharedData` struct at the start of `sharedDataBuffer`.
///
/// Must be called after `sharedDataBuffer` has been created.
func initGaussianFilter() {
  // Data passed to both blur kernels.
  let variance = 4.0        // sigma^2 (i.e. sigma = 2)
  let tapCount = 7

  let data = self.sharedDataBuffer.contents().assumingMemoryBound(to: SharedData.self)
  data.pointee.tapCount = Float(tapCount)

  // Sample the Gaussian at integer offsets centred on the middle tap.
  // `sqrt`/`exp` are the Double overloads; the `long double` variants
  // (`sqrtl`/`expl`) have an architecture-dependent type in Swift.
  let center = (tapCount - 1) / 2
  let scale = 1 / sqrt(2 * Double.pi * variance)
  var total:Double = 0
  var result = [Double](repeating: 0.0, count: tapCount)
  for i in 0..<tapCount {
    let x = Double(i - center)
    result[i] = scale * exp(-(x * x) / (2 * variance))
    print("\(x)=\(result[i])")
    total += result[i]
  }
  print("total=\(total)")

  // Store in the buffer, dividing by the sum so the weights add up to 1.
  // NOTE(review): assumes `SharedData.gaussian` is contiguous storage of at
  // least `tapCount` Floats; confirm against the struct declaration.
  withUnsafeMutablePointer(to: &data.pointee.gaussian) { pointer in
    pointer.withMemoryRebound(to: Float.self, capacity: tapCount) { weights in
      for (index, value) in result.enumerated() {
        weights[index] = Float(value) / Float(total)
      }
    }
  }

  print("\(data.pointee.gaussian)")
}

 

렌더링

원본이미지를 리사이징해서 텍스처를 생성하고, 5회 블러링 적용 후 화면에 렌더링을 진행한다.
kernel 쉐이더는 16x16 스레드로 이루어진 스레드 그룹 단위로 실행한다. 첫 번째 패스에서만 리사이징된 텍스처를 입력으로 사용하고, 이후에는 H, V 텍스처를 번갈아 입력/출력으로 사용하며 블러링을 진행한다.

/// Renders one frame: draws the source image into the small resize texture,
/// applies the separable Gaussian blur (horizontal then vertical) five
/// times with the compute kernels, and draws the result to the view.
///
/// Returns without drawing when a texture is missing or when the view has
/// no render pass descriptor / drawable for this frame. Blocks until the
/// GPU has finished so the elapsed time can be printed.
///
/// - Parameter view: The MTKView whose current drawable is rendered into.
func render(view:MTKView) {
  print("render")
  let startTime = Int64((Date().timeIntervalSince1970 * 1000.0).rounded())

  // All four textures are optional (image loading or texture allocation
  // may have failed); bail out instead of force-unwrapping them.
  guard let sourceTexture = self.imageTexture,
        let resizedTexture = self.imageResizeTexture,
        let horizontalTexture = self.workHTargetTexture,
        let verticalTexture = self.workVTargetTexture else {
    print("[Renderer.render] textures are not ready")
    return
  }
  guard let renderPass = view.currentRenderPassDescriptor else { return }
  guard let drawable = view.currentDrawable else { return }
  guard let commandBuffer = self.commandQueue.makeCommandBuffer() else { return }
  commandBuffer.label = "RenderCommand"

  // Pass 1: draw the source image into the resize texture, which is the
  // color attachment of `renderPassDescriptor`.
  if let encoder = commandBuffer.makeRenderCommandEncoder(descriptor: self.renderPassDescriptor)  {
    encoder.label = "RenderResizeEncoder"
    encoder.setCullMode(.front)
    encoder.setRenderPipelineState(self.renderPipelineState)
    encoder.setVertexBuffer(self.imageVertexBuffer, offset: 0, index: 0)
    encoder.setFragmentTexture(sourceTexture, index: 0)
    encoder.drawPrimitives(type: .triangleStrip, vertexStart: 0, vertexCount: 4)
    encoder.endEncoding()
  }

  // Blur: 16x16 threads per threadgroup, one thread per texel.
  // The integer division truncates, so the texture size is expected to be
  // a multiple of 16 (the work textures are 256x256).
  let threadsPerThreadgroup = MTLSizeMake(16, 16, 1)
  let threadgroupsPerGrid = MTLSizeMake(
    horizontalTexture.width / threadsPerThreadgroup.width,
    horizontalTexture.height / threadsPerThreadgroup.height,
    1)

  // Encodes one kernel dispatch reading `input` and writing `output`.
  func encodeBlur(_ pipelineState: MTLComputePipelineState, from input: MTLTexture, to output: MTLTexture) {
    guard let encoder = commandBuffer.makeComputeCommandEncoder() else { return }
    encoder.setComputePipelineState(pipelineState)
    encoder.setTexture(input, index: 0)
    encoder.setTexture(output, index: 1)
    encoder.setBuffer(self.sharedDataBuffer, offset: 0, index: 0)
    encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
    encoder.endEncoding()
  }

  // Five blur iterations (the previous `0...5` range ran six). Only the
  // first iteration reads the resized image; later ones read the previous
  // vertical result.
  let blurPassCount = 5
  for pass in 0..<blurPassCount {
    let input = pass == 0 ? resizedTexture : verticalTexture
    encodeBlur(self.computeHPipelineState, from: input, to: horizontalTexture)
    encodeBlur(self.computeVPipelineState, from: horizontalTexture, to: verticalTexture)
  }

  // Final pass: draw the blurred texture into the view's drawable.
  if let encoder = commandBuffer.makeRenderCommandEncoder(descriptor: renderPass) {
    encoder.label = "SwapEncoder"
    encoder.setCullMode(.front)
    encoder.setRenderPipelineState(self.imagePipelineState)
    encoder.setDepthStencilState(self.imageDepthState)
    encoder.setVertexBuffer(self.imageVertexBuffer, offset: 0, index: 0)
    encoder.setFragmentTexture(verticalTexture, index: 0)
    encoder.drawPrimitives(type: .triangleStrip, vertexStart: 0, vertexCount: 4)
    encoder.endEncoding()
  }

  commandBuffer.present(drawable)
  commandBuffer.commit()
  // Wait for the GPU so the measurement below covers the whole frame.
  commandBuffer.waitUntilCompleted()

  let endTime = Int64((Date().timeIntervalSince1970 * 1000.0).rounded())
  print("complete: \(endTime - startTime)ms)")
}

 

쉐이더

역시 큰 차이는 없으며, 전달된 그리드의 x, y 좌표에서 인접한 픽셀값을 가져와 가중치 적용 후 더해주면 된다.

// Horizontal pass of the separable Gaussian blur.
// Each thread handles the texel at `gid`: it reads `tapCount` texels along
// x around that position, weights them with `sharedData.gaussian` and
// writes the sum (alpha forced to 1) to `output`.
kernel void gaussianBlurHFunction(
                                 texture2d<float, access::read> input [[texture(0)]],
                                 texture2d<float, access::write> output [[texture(1)]],
                                 constant SharedData &sharedData [[buffer(0)]],
                                 uint2 gid[[thread_position_in_grid]]) {

    // Ignore threads that fall outside the output texture.
    if (gid.x >= output.get_width() || gid.y >= output.get_height()) {
        return;
    }

    // Last valid column of the input, used to clamp the sample position.
    const int lastColumn = int(input.get_width()) - 1;

    float3 sum = float3(0.0, 0.0, 0.0);
    for (int i=0;i<sharedData.tapCount;i++) {
        int index = i - (sharedData.tapCount - 1) / 2;
        // Work in signed coordinates and clamp to the edge: adding a
        // negative offset to the unsigned gid.x wraps around near x = 0
        // and addresses a texel far outside the texture.
        int x = clamp(int(gid.x) + index, 0, lastColumn);
        uint2 id = uint2(uint(x), gid.y);
        sum += input.read(id).rgb * sharedData.gaussian[i];
    }

    float4 color = float4(sum, 1.0);
    output.write( color, gid);

}

// Vertical pass of the separable Gaussian blur.
// Each thread handles the texel at `gid`: it reads `tapCount` texels along
// y around that position, weights them with `sharedData.gaussian` and
// writes the sum (alpha forced to 1) to `output`.
kernel void gaussianBlurVFunction(
                                 texture2d<float, access::read> input [[texture(0)]],
                                 texture2d<float, access::write> output [[texture(1)]],
                                 constant SharedData &sharedData [[buffer(0)]],
                                 uint2 gid[[thread_position_in_grid]]) {

    // Ignore threads that fall outside the output texture.
    if (gid.x >= output.get_width() || gid.y >= output.get_height()) {
        return;
    }

    // Last valid row of the input, used to clamp the sample position.
    const int lastRow = int(input.get_height()) - 1;

    float3 sum = float3(0.0, 0.0, 0.0);
    for (int i=0;i<sharedData.tapCount;i++) {
        int index = i - (sharedData.tapCount - 1) / 2;
        // Work in signed coordinates and clamp to the edge: adding a
        // negative offset to the unsigned gid.y wraps around near y = 0
        // and addresses a texel far outside the texture.
        int y = clamp(int(gid.y) + index, 0, lastRow);
        uint2 id = uint2(gid.x, uint(y));
        sum += input.read(id).rgb * sharedData.gaussian[i];
    }

    float4 color = float4(sum, 1.0);
    output.write( color, gid);

}

나머지 쉐이더들은 별도로 처리할 게 없으므로 텍스처에서 샘플링한 색상을 그대로 리턴해 주면 된다.

// Per-vertex input of imageVertexFunction, fetched through the vertex
// descriptor: attribute 0 is the position, attribute 1 the texture coordinate.
struct ImageVertex {
    float3 position [[attribute(0)]];
    float2 texCoord [[attribute(1)]];
};

// Vertex shader output / fragment shader input: clip-space position and
// the texture coordinate.
struct ImageOut {
    float4 position [[position]];
    float2 texCoord;
};


// Pass-through vertex shader: extends the incoming position to a float4
// with w = 1 and forwards the texture coordinate unchanged.
vertex ImageOut imageVertexFunction( ImageVertex in [[stage_in]]) {
    ImageOut result;
    result.texCoord = in.texCoord;
    result.position = float4(in.position, 1.0);
    return result;
}

// Fragment shader of the resize pass: returns the color sampled from
// texture 0 at the interpolated texture coordinate, using a
// default-constructed sampler.
fragment float4 imageResizeFragmentFunction(ImageOut in [[stage_in]],
                                       texture2d<float> texture1 [[texture(0)]] ) {
    constexpr sampler colorSampler;
    return texture1.sample(colorSampler, in.texCoord);
}


// Fragment shader of the final on-screen pass: returns the color sampled
// from texture 0 at the interpolated texture coordinate, using a
// default-constructed sampler.
fragment float4 swapFragmentFunction(ImageOut in [[stage_in]], texture2d<float> texture1 [[texture(0)]]) {
    constexpr sampler colorSampler;
    return texture1.sample(colorSampler, in.texCoord);
}