fortran66のブログ

fortran について書きます。

【メモ帳】Fortran から Swift を呼ぶ その4

swift JCL 説

swift JCL 説に則り pipeline は固定のまま、入力バッファも固定のまま、引数指定のみを繰り返して、2回平方根をとります。

import Metal

/// Runs the Metal compute kernel `mykernel` twice, in place, over the `n`
/// `Float32` values starting at `px`.
///
/// Exported with the C symbol name `mymetal` so that it can be called from
/// Fortran through a `bind(c)` interface.
///
/// The compute pipeline and both buffers are created once; only the buffer
/// bindings are specified again before the second dispatch.
///
/// - Parameters:
///   - n:  Number of elements in `px`. Nothing is done when `n <= 0`.
///   - px: Pointer to `n` contiguous `Float32` values. Overwritten with the
///         kernel results when the GPU work has completed.
///
/// Any failure while setting up or running Metal terminates the process with
/// a descriptive message, because the C signature has no way to report errors.
@_cdecl("mymetal")
public func mymetaln(n: Int32, px: UnsafeMutablePointer<Float32>) {
    // An empty array needs no work, and a zero-length MTLBuffer cannot be created.
    guard n > 0 else { return }

    let elementCount = Int(n)
    let byteCount = MemoryLayout<Float32>.stride * elementCount

    guard let device = MTLCreateSystemDefaultDevice() else {
        fatalError("mymetal: no Metal device available")
    }

    // make pipeline for GPU binary
    guard let library = device.makeDefaultLibrary() else {
        fatalError("mymetal: default Metal library not found")
    }
    guard let kernelFunction = library.makeFunction(name: "mykernel") else {
        fatalError("mymetal: function 'mykernel' not found in the Metal library")
    }
    let pipeline: MTLComputePipelineState
    do {
        pipeline = try device.makeComputePipelineState(function: kernelFunction)
    } catch {
        fatalError("mymetal: cannot create compute pipeline: \(error)")
    }

    // make memory buffers
    // The data buffer is read back by the CPU below, so it must not use the
    // write-combined CPU cache mode.
    guard let dataBuffer = device.makeBuffer(bytes: px,
                                             length: byteCount,
                                             options: .storageModeShared) else {
        fatalError("mymetal: cannot allocate data buffer")
    }
    // The kernel declares the element count as int32_t, so pass exactly 4 bytes.
    var count32 = n
    guard let countBuffer = device.makeBuffer(bytes: &count32,
                                              length: MemoryLayout<Int32>.size,
                                              options: .storageModeShared) else {
        fatalError("mymetal: cannot allocate element-count buffer")
    }

    // Create the encoder only after every fallible setup step has succeeded,
    // so that it is never left without a matching endEncoding().
    guard let cmdQueue = device.makeCommandQueue(),
          let cmdBuff = cmdQueue.makeCommandBuffer(),
          let encoder = cmdBuff.makeComputeCommandEncoder() else {
        fatalError("mymetal: cannot create command queue / buffer / encoder")
    }
    encoder.setComputePipelineState(pipeline)

    // group GPU threads
    // non-uniform thread group assumed
    let perGroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)
    let perGrid  = MTLSize(width: elementCount, height: 1, depth: 1)

    // first run
    encoder.setBuffer(dataBuffer, offset: 0, index: 0)
    encoder.setBuffer(countBuffer, offset: 0, index: 1)
    encoder.dispatchThreads(perGrid, threadsPerThreadgroup: perGroup)

    // second run: same pipeline and buffers, only the arguments are specified again
    encoder.setBuffer(dataBuffer, offset: 0, index: 0)
    encoder.setBuffer(countBuffer, offset: 0, index: 1)
    encoder.dispatchThreads(perGrid, threadsPerThreadgroup: perGroup)

    encoder.endEncoding()
    // run GPU kernel
    cmdBuff.commit()
    cmdBuff.waitUntilCompleted()
    if let gpuError = cmdBuff.error {
        fatalError("mymetal: GPU execution failed: \(gpuError)")
    }

    // to array: copy the results back into the caller's storage in one step
    UnsafeMutableRawPointer(px).copyMemory(from: dataBuffer.contents(), byteCount: byteCount)

    print("GPU calculation finished")
}
! Interface to the Swift routine exported with the C name 'mymetal'.
module test_m
    implicit none
    interface
        ! metal_calc(n, x): passes the n-element array x to the Swift side,
        ! which overwrites it in place.
        !   n : number of elements, passed by value (Swift Int32)
        !   x : array of n single-precision values (Swift Float32)
        subroutine metal_calc(n, x) bind(c, name = 'mymetal')
            ! An interface body does not inherit the host's declarations, and
            ! bind(c) dummy arguments must have interoperable kinds, so the
            ! C kinds are named explicitly instead of relying on the defaults.
            use, intrinsic :: iso_c_binding, only : c_int32_t, c_float
            implicit none
            integer(c_int32_t), value :: n
            real(c_float), intent(in out) :: x(n)
        end subroutine metal_calc
    end interface

end module test_m



! Compares the fourth root computed on the GPU (two square-root passes done
! by metal_calc) with the same quantity computed on the CPU.
program test
    use :: test_m
    implicit none
    integer, parameter :: n = 10**2
    real, parameter :: eps = epsilon(0.0)
    integer :: k
    real :: gpu(n), cpu(n), diff(n)

    print *, 'start'

    ! Input values 0, 1, ..., n-1
    do k = 1, n
        gpu(k) = real(k - 1)
    end do

    ! Reference result on the CPU: square root applied twice
    cpu = sqrt(sqrt(gpu))
    print *, 'cpu  '

    ! calculation by C++ (Metal GPU) via Swift; gpu is overwritten in place
    call metal_calc(n, gpu)

    print *, 'By GPU (Metal C++ via Swift)'
    print '(5es15.7)', gpu
    print *
    print *, 'By CPU (Fortran)'
    print '(5es15.7)', cpu
    print *

    ! Element-wise difference CPU - GPU and the fraction of elements whose
    ! absolute difference exceeds machine epsilon
    diff = cpu - gpu
    print '(5es15.7)', diff
    print *, 'not eq', count(abs(diff) > eps) / real(n)

    print *, 'Machine epsilon =', eps
    print *, '   2 epsilin,       4 epsilon,       8 epsilon'
    print *, 2 * eps, 4 * eps, 8 * eps
end program test

Metal カーネル(kernel.metal)は変更なし。

//
//  kernel.metal
//  mymetal
//
//  Created by HO on 2021/04/21.
//

//using namespace metal;


#include <metal_stdlib>

// Replaces each element of arr with its square root, in place.
//   arr   : data array, bound at buffer index 0
//   nsize : number of valid elements in arr, bound at buffer index 1
//   pos   : position of this thread in the grid, used as the element index
kernel void mykernel(device float* arr [[buffer(0)]],
                     constant int32_t &nsize [[buffer(1)]],
                     uint pos [[thread_position_in_grid]])
{
    // Threads at or beyond the element count have nothing to do.
    const uint count = (uint)nsize;
    if (pos >= count) {
        return;
    }

    const float value = arr[pos];
    arr[pos] = metal::sqrt(value);
}

実行結果

2回ルートを取って、4乗根が求まっています。

[a] M1:~/fortran/swift% swiftc mymetal.swift -emit-library
[a] M1:~/fortran/swift% gfortran mymetal.f90 libmymetal.dylib nikai 
[a] M1:~/fortran/swift% ./a.out                           

 start
 cpu  
GPU calculation finished
 By GPU (Metal C++ via Swift)
  0.0000000E+00  1.0000000E+00  1.1892071E+00  1.3160740E+00  1.4142135E+00
  1.4953488E+00  1.5650847E+00  1.6265765E+00  1.6817930E+00  1.7320508E+00
  1.7782794E+00  1.8211604E+00  1.8612098E+00  1.8988290E+00  1.9343365E+00
  1.9679897E+00  2.0000000E+00  2.0305433E+00  2.0597670E+00  2.0877976E+00
  2.1147425E+00  2.1406953E+00  2.1657367E+00  2.1899388E+00  2.2133639E+00
  2.2360680E+00  2.2581010E+00  2.2795072E+00  2.3003266E+00  2.3205957E+00
  2.3403473E+00  2.3596113E+00  2.3784142E+00  2.3967819E+00  2.4147365E+00
  2.4322994E+00  2.4494898E+00  2.4663258E+00  2.4828238E+00  2.4989996E+00
  2.5148668E+00  2.5304396E+00  2.5457299E+00  2.5607498E+00  2.5755095E+00
  2.5900202E+00  2.6042907E+00  2.6183305E+00  2.6321480E+00  2.6457515E+00
  2.6591482E+00  2.6723452E+00  2.6853495E+00  2.6981680E+00  2.7108061E+00
  2.7232699E+00  2.7355647E+00  2.7476962E+00  2.7596693E+00  2.7714880E+00
  2.7831578E+00  2.7946827E+00  2.8060663E+00  2.8173132E+00  2.8284271E+00
  2.8394117E+00  2.8502700E+00  2.8610055E+00  2.8716218E+00  2.8821216E+00
  2.8925076E+00  2.9027832E+00  2.9129505E+00  2.9230127E+00  2.9329722E+00
  2.9428308E+00  2.9525919E+00  2.9622567E+00  2.9718277E+00  2.9813075E+00
  2.9906976E+00  3.0000000E+00  3.0092168E+00  3.0183496E+00  3.0274000E+00
  3.0363703E+00  3.0452616E+00  3.0540757E+00  3.0628145E+00  3.0714786E+00
  3.0800703E+00  3.0885909E+00  3.0970411E+00  3.1054230E+00  3.1137376E+00
  3.1219859E+00  3.1301694E+00  3.1382890E+00  3.1463466E+00  3.1543422E+00

 By CPU (Fortran)
  0.0000000E+00  1.0000000E+00  1.1892071E+00  1.3160740E+00  1.4142135E+00
  1.4953488E+00  1.5650846E+00  1.6265765E+00  1.6817929E+00  1.7320508E+00
  1.7782794E+00  1.8211603E+00  1.8612098E+00  1.8988289E+00  1.9343364E+00
  1.9679897E+00  2.0000000E+00  2.0305431E+00  2.0597670E+00  2.0877976E+00
  2.1147425E+00  2.1406951E+00  2.1657367E+00  2.1899388E+00  2.2133639E+00
  2.2360680E+00  2.2581010E+00  2.2795069E+00  2.3003266E+00  2.3205957E+00
  2.3403473E+00  2.3596110E+00  2.3784142E+00  2.3967817E+00  2.4147363E+00
  2.4322994E+00  2.4494898E+00  2.4663258E+00  2.4828238E+00  2.4989994E+00
  2.5148668E+00  2.5304396E+00  2.5457299E+00  2.5607495E+00  2.5755095E+00
  2.5900199E+00  2.6042907E+00  2.6183305E+00  2.6321480E+00  2.6457512E+00
  2.6591480E+00  2.6723452E+00  2.6853497E+00  2.6981678E+00  2.7108061E+00
  2.7232697E+00  2.7355649E+00  2.7476962E+00  2.7596691E+00  2.7714880E+00
  2.7831578E+00  2.7946825E+00  2.8060663E+00  2.8173132E+00  2.8284271E+00
  2.8394115E+00  2.8502700E+00  2.8610055E+00  2.8716216E+00  2.8821213E+00
  2.8925076E+00  2.9027832E+00  2.9129505E+00  2.9230127E+00  2.9329722E+00
  2.9428310E+00  2.9525919E+00  2.9622567E+00  2.9718277E+00  2.9813075E+00
  2.9906976E+00  3.0000000E+00  3.0092168E+00  3.0183494E+00  3.0274003E+00
  3.0363703E+00  3.0452616E+00  3.0540760E+00  3.0628142E+00  3.0714786E+00
  3.0800703E+00  3.0885906E+00  3.0970411E+00  3.1054227E+00  3.1137373E+00
  3.1219857E+00  3.1301692E+00  3.1382890E+00  3.1463463E+00  3.1543422E+00

  0.0000000E+00  0.0000000E+00  0.0000000E+00  0.0000000E+00  0.0000000E+00
  0.0000000E+00 -1.1920929E-07  0.0000000E+00 -1.1920929E-07  0.0000000E+00
  0.0000000E+00 -1.1920929E-07  0.0000000E+00 -1.1920929E-07 -1.1920929E-07
  0.0000000E+00  0.0000000E+00 -2.3841858E-07  0.0000000E+00  0.0000000E+00
  0.0000000E+00 -2.3841858E-07  0.0000000E+00  0.0000000E+00  0.0000000E+00
  0.0000000E+00  0.0000000E+00 -2.3841858E-07  0.0000000E+00  0.0000000E+00
  0.0000000E+00 -2.3841858E-07  0.0000000E+00 -2.3841858E-07 -2.3841858E-07
  0.0000000E+00  0.0000000E+00  0.0000000E+00  0.0000000E+00 -2.3841858E-07
  0.0000000E+00  0.0000000E+00  0.0000000E+00 -2.3841858E-07  0.0000000E+00
 -2.3841858E-07  0.0000000E+00  0.0000000E+00  0.0000000E+00 -2.3841858E-07
 -2.3841858E-07  0.0000000E+00  2.3841858E-07 -2.3841858E-07  0.0000000E+00
 -2.3841858E-07  2.3841858E-07  0.0000000E+00 -2.3841858E-07  0.0000000E+00
  0.0000000E+00 -2.3841858E-07  0.0000000E+00  0.0000000E+00  0.0000000E+00
 -2.3841858E-07  0.0000000E+00  0.0000000E+00 -2.3841858E-07 -2.3841858E-07
  0.0000000E+00  0.0000000E+00  0.0000000E+00  0.0000000E+00  0.0000000E+00
  2.3841858E-07  0.0000000E+00  0.0000000E+00  0.0000000E+00  0.0000000E+00
  0.0000000E+00  0.0000000E+00  0.0000000E+00 -2.3841858E-07  2.3841858E-07
  0.0000000E+00  0.0000000E+00  2.3841858E-07 -2.3841858E-07  0.0000000E+00
  0.0000000E+00 -2.3841858E-07  0.0000000E+00 -2.3841858E-07 -2.3841858E-07
 -2.3841858E-07 -2.3841858E-07  0.0000000E+00 -2.3841858E-07  0.0000000E+00
 not eq  0.310000002    
 Machine epsilon =   1.19209290E-07
    2 epsilin,       4 epsilon,       8 epsilon
   2.38418579E-07   4.76837158E-07   9.53674316E-07

基礎から学ぶ Metal〜MetalによるGPUプログラミング入門

基礎から学ぶ Metal〜MetalによるGPUプログラミング入門

  • 作者:林 晃
  • 発売日: 2021/01/20
  • メディア: 単行本(ソフトカバー)