Swift JCL 説
Swift JCL 説に則り、pipeline は固定のまま、入力バッファも固定のまま、引数指定のみを繰り返して、2回平方根をとります。
import Metal

/// Applies the `mykernel` compute kernel twice, in place, to a caller-owned
/// array of `Float32` values.
///
/// The function is exported with the C symbol name `mymetal`, so it can be
/// called from C-compatible languages. The input is copied into a Metal buffer,
/// the same pipeline is dispatched twice against that buffer, and the result
/// is copied back into `px`.
///
/// - Parameters:
///   - n:  Number of elements addressed by `px`. Values `<= 0` are treated as
///         "nothing to do" and the function returns without touching the GPU.
///   - px: Pointer to `n` contiguous `Float32` values; overwritten with the
///         result.
///
/// The C ABI offers no error channel here, so unrecoverable Metal failures
/// (no device, missing kernel, failed command buffer, ...) abort with a
/// descriptive message instead of an anonymous force-unwrap trap.
@_cdecl("mymetal")
public func mymetaln(n: Int32, px: UnsafeMutablePointer<Float32>) {
    // A negative count would trap when building ranges/sizes and a zero count
    // would request a zero-length buffer, so bail out early.
    guard n > 0 else { return }

    let elementCount = Int(n)
    let byteCount = MemoryLayout<Float32>.stride * elementCount

    guard let device = MTLCreateSystemDefaultDevice() else {
        fatalError("mymetal: no Metal device available")
    }

    // make pipeline for GPU binary
    guard let library = device.makeDefaultLibrary() else {
        fatalError("mymetal: default Metal library not found")
    }
    guard let kernelFunction = library.makeFunction(name: "mykernel") else {
        fatalError("mymetal: function 'mykernel' not found in default library")
    }
    let pipeline: MTLComputePipelineState
    do {
        pipeline = try device.makeComputePipelineState(function: kernelFunction)
    } catch {
        fatalError("mymetal: failed to create compute pipeline: \(error)")
    }

    guard let cmdQueue = device.makeCommandQueue(),
          let cmdBuff = cmdQueue.makeCommandBuffer(),
          let encoder = cmdBuff.makeComputeCommandEncoder() else {
        fatalError("mymetal: failed to create command queue/buffer/encoder")
    }
    encoder.setComputePipelineState(pipeline)

    // make memory buffers
    // The data buffer is read back by the CPU afterwards, so the default CPU
    // cache mode is used (write-combined is meant for write-only CPU access).
    guard let dataBuffer = device.makeBuffer(bytes: px,
                                             length: byteCount,
                                             options: .storageModeShared) else {
        fatalError("mymetal: failed to allocate data buffer (\(byteCount) bytes)")
    }

    // The kernel declares the element count as a 32-bit integer, so upload
    // exactly that type rather than a 64-bit Swift `Int`.
    var kernelCount = n
    guard let countBuffer = device.makeBuffer(bytes: &kernelCount,
                                              length: MemoryLayout<Int32>.size,
                                              options: .storageModeShared) else {
        fatalError("mymetal: failed to allocate element-count buffer")
    }

    // group GPU threads
    // non-uniform thread group assumed
    let perGroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)
    let perGrid = MTLSize(width: elementCount, height: 1, depth: 1)

    // first run
    encoder.setBuffer(dataBuffer, offset: 0, index: 0)
    encoder.setBuffer(countBuffer, offset: 0, index: 1)
    encoder.dispatchThreads(perGrid, threadsPerThreadgroup: perGroup)

    // second run: same pipeline and same buffers; only the argument binding
    // and the dispatch are repeated.
    encoder.setBuffer(dataBuffer, offset: 0, index: 0)
    encoder.setBuffer(countBuffer, offset: 0, index: 1)
    encoder.dispatchThreads(perGrid, threadsPerThreadgroup: perGroup)
    encoder.endEncoding()

    // run GPU kernel
    cmdBuff.commit()
    cmdBuff.waitUntilCompleted()
    if let gpuError = cmdBuff.error {
        fatalError("mymetal: GPU execution failed: \(gpuError)")
    }

    // to array: copy the result back into the caller's storage in one go.
    UnsafeMutableRawPointer(px).copyMemory(from: dataBuffer.contents(), byteCount: byteCount)

    print("GPU calculation finished")
}
! Interface module for the Swift/Metal routine exported under the C symbol
! name 'mymetal'.
module test_m
    use, intrinsic :: iso_c_binding, only: c_int, c_float
    implicit none

    interface
        ! n : number of elements in x (passed by value)
        ! x : array of n single-precision values, overwritten in place
        !
        ! C-interoperable kinds are used so the argument types keep matching
        ! the 32-bit integer / 32-bit float expected on the other side even
        ! when compiler options change the default integer/real kinds.
        subroutine metal_calc(n, x) bind(c, name = 'mymetal')
            import :: c_int, c_float
            integer(c_int), value :: n
            real(c_float), intent(in out) :: x(n)
        end subroutine metal_calc
    end interface
end module test_m

! Driver: computes sqrt(sqrt(x)) on the CPU, calls the GPU routine on the same
! input, then prints both results, their difference, and the fraction of
! elements whose absolute difference exceeds machine epsilon.
program test
    use, intrinsic :: iso_c_binding, only: c_int, c_float
    use :: test_m
    implicit none
    integer(c_int), parameter :: n = 10**2
    integer :: i
    real(c_float) :: x0(n), x1(n), eps = epsilon(0.0_c_float)

    print *, 'start'
    x0 = [(real(i, c_float), i = 0, n - 1)]

    ! CPU reference result
    x1 = sqrt(sqrt(x0))
    print *, 'cpu '

    ! calculation by C++ (Metal GPU) via Swift; x0 is overwritten in place
    call metal_calc(n, x0)

    print *, 'By GPU (Metal C++ via Swift)'
    print '(5es15.7)', x0
    print *
    print *, 'By CPU (Fortran)'
    print '(5es15.7)', x1
    print *

    ! element-wise difference CPU - GPU
    print '(5es15.7)', x1 - x0
    ! fraction of elements differing by more than machine epsilon
    print *, 'not eq', count(abs(x1 - x0) > eps) / real(n)
    print *, 'Machine epsilon =', eps
    print *, ' 2 epsilin, 4 epsilon, 8 epsilon'
    print *, 2 * eps, 4 * eps, 8 * eps
end program test
変更なし
//
//  kernel.metal
//  mymetal
//
//  Created by HO on 2021/04/21.
//

// The metal namespace is intentionally not imported; library calls below are
// written with an explicit metal:: qualifier.
//using namespace metal;
#include <metal_stdlib>

// Replaces one element of `arr` with its square root.
//
//   arr   : data array bound at buffer index 0, updated in place
//   nsize : element count bound at buffer index 1
//   pos   : this thread's position in the dispatch grid
//
// Threads whose position is not below the element count do nothing.
kernel void mykernel(device float* arr [[buffer(0)]],
                     constant int32_t &nsize [[buffer(1)]],
                     uint pos [[thread_position_in_grid]])
{
    const uint limit = (uint)nsize;

    // Guard: positions at or beyond the element count are skipped.
    if (!(pos < limit)) {
        return;
    }

    const float value = arr[pos];
    arr[pos] = metal::sqrt(value);
}
実行結果
2回ルートを取って、4乗根が求まっています。
[a] M1:~/fortran/swift% swiftc mymetal.swift -emit-library [a] M1:~/fortran/swift% gfortran mymetal.f90 libmymetal.dylib nikai [a] M1:~/fortran/swift% ./a.out start cpu GPU calculation finished By GPU (Metal C++ via Swift) 0.0000000E+00 1.0000000E+00 1.1892071E+00 1.3160740E+00 1.4142135E+00 1.4953488E+00 1.5650847E+00 1.6265765E+00 1.6817930E+00 1.7320508E+00 1.7782794E+00 1.8211604E+00 1.8612098E+00 1.8988290E+00 1.9343365E+00 1.9679897E+00 2.0000000E+00 2.0305433E+00 2.0597670E+00 2.0877976E+00 2.1147425E+00 2.1406953E+00 2.1657367E+00 2.1899388E+00 2.2133639E+00 2.2360680E+00 2.2581010E+00 2.2795072E+00 2.3003266E+00 2.3205957E+00 2.3403473E+00 2.3596113E+00 2.3784142E+00 2.3967819E+00 2.4147365E+00 2.4322994E+00 2.4494898E+00 2.4663258E+00 2.4828238E+00 2.4989996E+00 2.5148668E+00 2.5304396E+00 2.5457299E+00 2.5607498E+00 2.5755095E+00 2.5900202E+00 2.6042907E+00 2.6183305E+00 2.6321480E+00 2.6457515E+00 2.6591482E+00 2.6723452E+00 2.6853495E+00 2.6981680E+00 2.7108061E+00 2.7232699E+00 2.7355647E+00 2.7476962E+00 2.7596693E+00 2.7714880E+00 2.7831578E+00 2.7946827E+00 2.8060663E+00 2.8173132E+00 2.8284271E+00 2.8394117E+00 2.8502700E+00 2.8610055E+00 2.8716218E+00 2.8821216E+00 2.8925076E+00 2.9027832E+00 2.9129505E+00 2.9230127E+00 2.9329722E+00 2.9428308E+00 2.9525919E+00 2.9622567E+00 2.9718277E+00 2.9813075E+00 2.9906976E+00 3.0000000E+00 3.0092168E+00 3.0183496E+00 3.0274000E+00 3.0363703E+00 3.0452616E+00 3.0540757E+00 3.0628145E+00 3.0714786E+00 3.0800703E+00 3.0885909E+00 3.0970411E+00 3.1054230E+00 3.1137376E+00 3.1219859E+00 3.1301694E+00 3.1382890E+00 3.1463466E+00 3.1543422E+00 By CPU (Fortran) 0.0000000E+00 1.0000000E+00 1.1892071E+00 1.3160740E+00 1.4142135E+00 1.4953488E+00 1.5650846E+00 1.6265765E+00 1.6817929E+00 1.7320508E+00 1.7782794E+00 1.8211603E+00 1.8612098E+00 1.8988289E+00 1.9343364E+00 1.9679897E+00 2.0000000E+00 2.0305431E+00 2.0597670E+00 2.0877976E+00 2.1147425E+00 2.1406951E+00 2.1657367E+00 2.1899388E+00 2.2133639E+00 
2.2360680E+00 2.2581010E+00 2.2795069E+00 2.3003266E+00 2.3205957E+00 2.3403473E+00 2.3596110E+00 2.3784142E+00 2.3967817E+00 2.4147363E+00 2.4322994E+00 2.4494898E+00 2.4663258E+00 2.4828238E+00 2.4989994E+00 2.5148668E+00 2.5304396E+00 2.5457299E+00 2.5607495E+00 2.5755095E+00 2.5900199E+00 2.6042907E+00 2.6183305E+00 2.6321480E+00 2.6457512E+00 2.6591480E+00 2.6723452E+00 2.6853497E+00 2.6981678E+00 2.7108061E+00 2.7232697E+00 2.7355649E+00 2.7476962E+00 2.7596691E+00 2.7714880E+00 2.7831578E+00 2.7946825E+00 2.8060663E+00 2.8173132E+00 2.8284271E+00 2.8394115E+00 2.8502700E+00 2.8610055E+00 2.8716216E+00 2.8821213E+00 2.8925076E+00 2.9027832E+00 2.9129505E+00 2.9230127E+00 2.9329722E+00 2.9428310E+00 2.9525919E+00 2.9622567E+00 2.9718277E+00 2.9813075E+00 2.9906976E+00 3.0000000E+00 3.0092168E+00 3.0183494E+00 3.0274003E+00 3.0363703E+00 3.0452616E+00 3.0540760E+00 3.0628142E+00 3.0714786E+00 3.0800703E+00 3.0885906E+00 3.0970411E+00 3.1054227E+00 3.1137373E+00 3.1219857E+00 3.1301692E+00 3.1382890E+00 3.1463463E+00 3.1543422E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 -1.1920929E-07 0.0000000E+00 -1.1920929E-07 0.0000000E+00 0.0000000E+00 -1.1920929E-07 0.0000000E+00 -1.1920929E-07 -1.1920929E-07 0.0000000E+00 0.0000000E+00 -2.3841858E-07 0.0000000E+00 0.0000000E+00 0.0000000E+00 -2.3841858E-07 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 -2.3841858E-07 0.0000000E+00 0.0000000E+00 0.0000000E+00 -2.3841858E-07 0.0000000E+00 -2.3841858E-07 -2.3841858E-07 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 -2.3841858E-07 0.0000000E+00 0.0000000E+00 0.0000000E+00 -2.3841858E-07 0.0000000E+00 -2.3841858E-07 0.0000000E+00 0.0000000E+00 0.0000000E+00 -2.3841858E-07 -2.3841858E-07 0.0000000E+00 2.3841858E-07 -2.3841858E-07 0.0000000E+00 -2.3841858E-07 2.3841858E-07 0.0000000E+00 -2.3841858E-07 0.0000000E+00 0.0000000E+00 -2.3841858E-07 0.0000000E+00 0.0000000E+00 0.0000000E+00 -2.3841858E-07 
0.0000000E+00 0.0000000E+00 -2.3841858E-07 -2.3841858E-07 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 2.3841858E-07 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 -2.3841858E-07 2.3841858E-07 0.0000000E+00 0.0000000E+00 2.3841858E-07 -2.3841858E-07 0.0000000E+00 0.0000000E+00 -2.3841858E-07 0.0000000E+00 -2.3841858E-07 -2.3841858E-07 -2.3841858E-07 -2.3841858E-07 0.0000000E+00 -2.3841858E-07 0.0000000E+00 not eq 0.310000002 Machine epsilon = 1.19209290E-07 2 epsilin, 4 epsilon, 8 epsilon 2.38418579E-07 4.76837158E-07 9.53674316E-07
基礎から学ぶ Metal〜MetalによるGPUプログラミング入門
- 作者:林 晃
- 発売日: 2021/01/20
- メディア: 単行本(ソフトカバー)