swift JCL 説
Swift JCL 説に則り、pipeline と入力バッファは固定のまま引数指定だけを繰り返して、平方根を 2 回とります(結果として 4 乗根が求まります)。
import Metal
/// Replaces each of the `n` single-precision values at `px` with its fourth
/// root, computed on the GPU by running the `mykernel` square-root kernel twice.
///
/// Exported with the C symbol name `mymetal` so it can be called from
/// non-Swift code.
///
/// - Parameters:
///   - n: Number of `Float32` elements reachable through `px`. Values `<= 0`
///        make the call a no-op.
///   - px: Pointer to the first element; the results are written back in place.
///
/// Any failure to set up Metal (no device, missing default library or kernel,
/// allocation failure) or a failed GPU execution aborts the process with a
/// descriptive message, because there is no channel to report an error to the
/// caller and returning would leave `px` silently un-computed.
@_cdecl("mymetal")
public func mymetaln(n: Int32, px: UnsafeMutablePointer<Float32>) {
    // An empty range needs no GPU work; a negative length is meaningless and
    // would otherwise trap when building the `0 ..< n` copy range.
    guard n > 0 else { return }

    let count = Int(n)
    let byteLength = MemoryLayout<Float32>.stride * count

    guard let device = MTLCreateSystemDefaultDevice() else {
        fatalError("mymetal: no Metal device available")
    }
    guard let cmdQueue = device.makeCommandQueue(),
          let cmdBuff = cmdQueue.makeCommandBuffer(),
          let encoder = cmdBuff.makeComputeCommandEncoder() else {
        fatalError("mymetal: could not create command queue / buffer / encoder")
    }
    guard let kernelSub = device.makeDefaultLibrary()?.makeFunction(name: "mykernel") else {
        fatalError("mymetal: kernel function 'mykernel' not found in the default Metal library")
    }

    let pipeline: MTLComputePipelineState
    do {
        pipeline = try device.makeComputePipelineState(function: kernelSub)
    } catch {
        fatalError("mymetal: failed to build compute pipeline: \(error)")
    }
    encoder.setComputePipelineState(pipeline)

    // The kernel declares its length argument as a 32-bit integer, so upload
    // exactly four bytes holding an Int32 rather than a platform-sized Int.
    var kernelCount = n

    // The data buffer is read back by the CPU after execution, so it uses the
    // default CPU cache mode; write-combined memory is meant for resources the
    // CPU only writes to.
    guard let buff0 = device.makeBuffer(bytes: px,
                                        length: byteLength,
                                        options: .storageModeShared),
          let buff1 = device.makeBuffer(bytes: &kernelCount,
                                        length: MemoryLayout<Int32>.size,
                                        options: .storageModeShared) else {
        fatalError("mymetal: could not allocate Metal buffers")
    }

    let perGroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)
    let perGrid = MTLSize(width: count, height: 1, depth: 1)

    // First pass: arr[i] = sqrt(arr[i]).
    encoder.setBuffer(buff0, offset: 0, index: 0)
    encoder.setBuffer(buff1, offset: 0, index: 1)
    encoder.dispatchThreads(perGrid, threadsPerThreadgroup: perGroup)

    // Second pass over the same buffer, with the same pipeline; only the
    // argument binding is repeated. Both dispatches are encoded on one encoder
    // created with the default (serial) dispatch type, so this pass operates on
    // the output of the first.
    encoder.setBuffer(buff0, offset: 0, index: 0)
    encoder.setBuffer(buff1, offset: 0, index: 1)
    encoder.dispatchThreads(perGrid, threadsPerThreadgroup: perGroup)

    encoder.endEncoding()
    cmdBuff.commit()
    cmdBuff.waitUntilCompleted()

    // Do not hand back results from a command buffer that failed to execute.
    if let gpuError = cmdBuff.error {
        fatalError("mymetal: GPU execution failed: \(gpuError)")
    }

    // Copy the shared buffer's contents back into the caller's array in bulk.
    UnsafeMutableRawPointer(px).copyMemory(from: buff0.contents(), byteCount: byteLength)

    print("GPU calculation finished")
}
! Declares the C-interoperable interface of the external routine exported
! under the C symbol name 'mymetal'.
module test_m
    implicit none
    interface
        ! metal_calc(n, x)
        !   n : number of elements in x, passed by value as a C int
        !   x : array of n C floats, updated in place by the callee
        subroutine metal_calc(n, x) bind(c, name = 'mymetal')
            ! Interface bodies do not inherit the host scope, so the C kinds
            ! are brought in here. Using c_int / c_float pins the argument
            ! types to the C ABI instead of relying on the compiler's default
            ! integer / real kinds happening to match.
            use, intrinsic :: iso_c_binding, only: c_int, c_float
            implicit none
            integer(c_int), value :: n
            real(c_float), intent(in out) :: x(n)
        end subroutine metal_calc
    end interface
end module test_m
! Driver program: fills an array with 0, 1, ..., n-1, computes the fourth root
! of every element on the CPU as sqrt(sqrt(x)), asks metal_calc to process the
! same input in place, then prints both result sets, their element-wise
! difference, and the fraction of elements whose absolute difference exceeds
! the single-precision machine epsilon.
program test
    use :: test_m
    implicit none
    integer, parameter :: n = 10**2
    real, parameter :: eps = epsilon(0.0)
    integer :: k
    real :: gpu_vals(n), cpu_vals(n)

    print *, 'start'

    ! Input values 0.0 .. n-1; gpu_vals is overwritten in place by metal_calc.
    do k = 1, n
        gpu_vals(k) = real(k - 1)
    end do

    ! CPU reference result, taken before the input array is overwritten.
    cpu_vals = sqrt(sqrt(gpu_vals))
    print *, 'cpu '

    call metal_calc(n, gpu_vals)

    print *, 'By GPU (Metal C++ via Swift)'
    print '(5es15.7)', gpu_vals
    print *

    print *, 'By CPU (Fortran)'
    print '(5es15.7)', cpu_vals
    print *

    ! Element-wise difference (CPU minus GPU) and the mismatch ratio.
    print '(5es15.7)', cpu_vals - gpu_vals
    print *, 'not eq', count(abs(cpu_vals - gpu_vals) > eps) / real(n)

    ! Reference magnitudes for judging the size of the differences above.
    print *, 'Machine epsilon =', eps
    print *, ' 2 epsilin, 4 epsilon, 8 epsilon'
    print *, 2 * eps, 4 * eps, 8 * eps
end program test
変更なし
#include <metal_stdlib>
// Compute kernel: replaces arr[pos] with its square root, in place.
//
//   arr   : float array bound at buffer index 0, read and written
//   nsize : element count bound at buffer index 1
//   pos   : this thread's position in the dispatch grid
//
// Threads whose position is at or beyond nsize do nothing, so the grid may be
// larger than the array.
kernel void mykernel(device float* arr [[buffer(0)]],
                     constant int32_t &nsize [[buffer(1)]],
                     uint pos [[thread_position_in_grid]])
{
    const uint count = (uint)nsize;
    if (pos >= count) {
        return;
    }

    device float &elem = arr[pos];
    elem = metal::sqrt(elem);
}
実行結果
2回ルートを取って、4乗根が求まっています。
[a] M1:~/fortran/swift% swiftc mymetal.swift -emit-library
[a] M1:~/fortran/swift% gfortran mymetal.f90 libmymetal.dylib
[a] M1:~/fortran/swift% ./a.out
start
cpu
GPU calculation finished
By GPU (Metal C++ via Swift)
0.0000000E+00 1.0000000E+00 1.1892071E+00 1.3160740E+00 1.4142135E+00
1.4953488E+00 1.5650847E+00 1.6265765E+00 1.6817930E+00 1.7320508E+00
1.7782794E+00 1.8211604E+00 1.8612098E+00 1.8988290E+00 1.9343365E+00
1.9679897E+00 2.0000000E+00 2.0305433E+00 2.0597670E+00 2.0877976E+00
2.1147425E+00 2.1406953E+00 2.1657367E+00 2.1899388E+00 2.2133639E+00
2.2360680E+00 2.2581010E+00 2.2795072E+00 2.3003266E+00 2.3205957E+00
2.3403473E+00 2.3596113E+00 2.3784142E+00 2.3967819E+00 2.4147365E+00
2.4322994E+00 2.4494898E+00 2.4663258E+00 2.4828238E+00 2.4989996E+00
2.5148668E+00 2.5304396E+00 2.5457299E+00 2.5607498E+00 2.5755095E+00
2.5900202E+00 2.6042907E+00 2.6183305E+00 2.6321480E+00 2.6457515E+00
2.6591482E+00 2.6723452E+00 2.6853495E+00 2.6981680E+00 2.7108061E+00
2.7232699E+00 2.7355647E+00 2.7476962E+00 2.7596693E+00 2.7714880E+00
2.7831578E+00 2.7946827E+00 2.8060663E+00 2.8173132E+00 2.8284271E+00
2.8394117E+00 2.8502700E+00 2.8610055E+00 2.8716218E+00 2.8821216E+00
2.8925076E+00 2.9027832E+00 2.9129505E+00 2.9230127E+00 2.9329722E+00
2.9428308E+00 2.9525919E+00 2.9622567E+00 2.9718277E+00 2.9813075E+00
2.9906976E+00 3.0000000E+00 3.0092168E+00 3.0183496E+00 3.0274000E+00
3.0363703E+00 3.0452616E+00 3.0540757E+00 3.0628145E+00 3.0714786E+00
3.0800703E+00 3.0885909E+00 3.0970411E+00 3.1054230E+00 3.1137376E+00
3.1219859E+00 3.1301694E+00 3.1382890E+00 3.1463466E+00 3.1543422E+00
By CPU (Fortran)
0.0000000E+00 1.0000000E+00 1.1892071E+00 1.3160740E+00 1.4142135E+00
1.4953488E+00 1.5650846E+00 1.6265765E+00 1.6817929E+00 1.7320508E+00
1.7782794E+00 1.8211603E+00 1.8612098E+00 1.8988289E+00 1.9343364E+00
1.9679897E+00 2.0000000E+00 2.0305431E+00 2.0597670E+00 2.0877976E+00
2.1147425E+00 2.1406951E+00 2.1657367E+00 2.1899388E+00 2.2133639E+00
2.2360680E+00 2.2581010E+00 2.2795069E+00 2.3003266E+00 2.3205957E+00
2.3403473E+00 2.3596110E+00 2.3784142E+00 2.3967817E+00 2.4147363E+00
2.4322994E+00 2.4494898E+00 2.4663258E+00 2.4828238E+00 2.4989994E+00
2.5148668E+00 2.5304396E+00 2.5457299E+00 2.5607495E+00 2.5755095E+00
2.5900199E+00 2.6042907E+00 2.6183305E+00 2.6321480E+00 2.6457512E+00
2.6591480E+00 2.6723452E+00 2.6853497E+00 2.6981678E+00 2.7108061E+00
2.7232697E+00 2.7355649E+00 2.7476962E+00 2.7596691E+00 2.7714880E+00
2.7831578E+00 2.7946825E+00 2.8060663E+00 2.8173132E+00 2.8284271E+00
2.8394115E+00 2.8502700E+00 2.8610055E+00 2.8716216E+00 2.8821213E+00
2.8925076E+00 2.9027832E+00 2.9129505E+00 2.9230127E+00 2.9329722E+00
2.9428310E+00 2.9525919E+00 2.9622567E+00 2.9718277E+00 2.9813075E+00
2.9906976E+00 3.0000000E+00 3.0092168E+00 3.0183494E+00 3.0274003E+00
3.0363703E+00 3.0452616E+00 3.0540760E+00 3.0628142E+00 3.0714786E+00
3.0800703E+00 3.0885906E+00 3.0970411E+00 3.1054227E+00 3.1137373E+00
3.1219857E+00 3.1301692E+00 3.1382890E+00 3.1463463E+00 3.1543422E+00
0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00
0.0000000E+00 -1.1920929E-07 0.0000000E+00 -1.1920929E-07 0.0000000E+00
0.0000000E+00 -1.1920929E-07 0.0000000E+00 -1.1920929E-07 -1.1920929E-07
0.0000000E+00 0.0000000E+00 -2.3841858E-07 0.0000000E+00 0.0000000E+00
0.0000000E+00 -2.3841858E-07 0.0000000E+00 0.0000000E+00 0.0000000E+00
0.0000000E+00 0.0000000E+00 -2.3841858E-07 0.0000000E+00 0.0000000E+00
0.0000000E+00 -2.3841858E-07 0.0000000E+00 -2.3841858E-07 -2.3841858E-07
0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 -2.3841858E-07
0.0000000E+00 0.0000000E+00 0.0000000E+00 -2.3841858E-07 0.0000000E+00
-2.3841858E-07 0.0000000E+00 0.0000000E+00 0.0000000E+00 -2.3841858E-07
-2.3841858E-07 0.0000000E+00 2.3841858E-07 -2.3841858E-07 0.0000000E+00
-2.3841858E-07 2.3841858E-07 0.0000000E+00 -2.3841858E-07 0.0000000E+00
0.0000000E+00 -2.3841858E-07 0.0000000E+00 0.0000000E+00 0.0000000E+00
-2.3841858E-07 0.0000000E+00 0.0000000E+00 -2.3841858E-07 -2.3841858E-07
0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00
2.3841858E-07 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00
0.0000000E+00 0.0000000E+00 0.0000000E+00 -2.3841858E-07 2.3841858E-07
0.0000000E+00 0.0000000E+00 2.3841858E-07 -2.3841858E-07 0.0000000E+00
0.0000000E+00 -2.3841858E-07 0.0000000E+00 -2.3841858E-07 -2.3841858E-07
-2.3841858E-07 -2.3841858E-07 0.0000000E+00 -2.3841858E-07 0.0000000E+00
not eq 0.310000002
Machine epsilon = 1.19209290E-07
2 epsilin, 4 epsilon, 8 epsilon
2.38418579E-07 4.76837158E-07 9.53674316E-07