必要なライブラリは cudart.lib のみです。すべて Fortran 側から CUDA API を呼び出すので、cu ファイルは必要ありません。
MODULE m_cuda
USE, INTRINSIC :: ISO_C_BINDING
IMPLICIT NONE
ENUM, BIND(C)
ENUMERATOR :: cudaMemcpyHostToHost = 0
ENUMERATOR :: cudaMemcpyHostToDevice
ENUMERATOR :: cudaMemcpyDeviceToHost
ENUMERATOR :: cudaMemcpyDeviceToDevice
END ENUM
ENUM, BIND(C)
ENUMERATOR :: CUDA_SUCCESS = 0
ENUMERATOR :: CUDA_ERROR_INVALID_VALUE = 1
ENUMERATOR :: CUDA_ERROR_OUT_OF_MEMORY = 2
ENUMERATOR :: CUDA_ERROR_NOT_INITIALIZED = 3
ENUMERATOR :: CUDA_ERROR_DEINITIALIZED = 4
ENUMERATOR :: CUDA_ERROR_NO_DEVICE = 100
ENUMERATOR :: CUDA_ERROR_INVALID_DEVICE = 101
ENUMERATOR :: CUDA_ERROR_INVALID_IMAGE = 200
ENUMERATOR :: CUDA_ERROR_INVALID_CONTEXT = 201
ENUMERATOR :: CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202
ENUMERATOR :: CUDA_ERROR_MAP_FAILED = 205
ENUMERATOR :: CUDA_ERROR_UNMAP_FAILED = 206
ENUMERATOR :: CUDA_ERROR_ARRAY_IS_MAPPED = 207
ENUMERATOR :: CUDA_ERROR_ALREADY_MAPPED = 208
ENUMERATOR :: CUDA_ERROR_NO_BINARY_FOR_GPU = 209
ENUMERATOR :: CUDA_ERROR_ALREADY_ACQUIRED = 210
ENUMERATOR :: CUDA_ERROR_NOT_MAPPED = 211
ENUMERATOR :: CUDA_ERROR_INVALID_SOURCE = 300
ENUMERATOR :: CUDA_ERROR_FILE_NOT_FOUND = 301
ENUMERATOR :: CUDA_ERROR_INVALID_HANDLE = 400
ENUMERATOR :: CUDA_ERROR_NOT_FOUND = 500
ENUMERATOR :: CUDA_ERROR_NOT_READY = 600
ENUMERATOR :: CUDA_ERROR_LAUNCH_FAILED = 700
ENUMERATOR :: CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
ENUMERATOR :: CUDA_ERROR_LAUNCH_TIMEOUT = 702
ENUMERATOR :: CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703
ENUMERATOR :: CUDA_ERROR_UNKNOWN = 999
END ENUM
TYPE, BIND(C) :: t_cudaDeviceProp
CHARACTER (256) :: name
INTEGER(C_SIZE_T) :: totalGlobalMem
INTEGER(C_SIZE_T) :: sharedMemPerBlock
INTEGER(C_INT) :: regsPerBlock
INTEGER(C_INT) :: warpSize
INTEGER(C_SIZE_T) :: memPitch
INTEGER(C_INT) :: maxThreadsPerBlock
INTEGER(C_INT) :: maxThreadsDim(3)
INTEGER(C_INT) :: maxGridSize(3)
INTEGER(C_INT) :: clockRate
INTEGER(C_SIZE_T) :: totalConstMem
INTEGER(C_INT) :: major
INTEGER(C_INT) :: minor
INTEGER(C_SIZE_T) :: textureAlignment
INTEGER(C_INT) :: deviceOverlap
INTEGER(C_INT) :: multiProcessorCount
INTEGER(C_INT) :: kernelExecTimeoutEnabled
INTEGER(C_INT) :: cudaReserved(39)
END TYPE
INTERFACE
SUBROUTINE cudaGetDeviceCount(n) BIND(C, name='cudaGetDeviceCount')
INTEGER, INTENT(OUT) :: n
END SUBROUTINE cudaGetDeviceCount
INTEGER FUNCTION cudaGetDeviceProperties(cudaDeviceProp, idev) BIND(C, name='cudaGetDeviceProperties')
USE, INTRINSIC :: ISO_C_BINDING
IMPORT
TYPE (t_cudaDeviceProp), INTENT(OUT) :: cudaDeviceProp
INTEGER, VALUE, INTENT(IN) :: idev
END FUNCTION cudaGetDeviceProperties
INTEGER FUNCTION cudaSetDevice(idev) BIND(C, name='cudaSetDevice')
INTEGER, VALUE, INTENT(IN) :: idev
END FUNCTION cudaSetDevice
INTEGER FUNCTION cudaMalloc(ip, nn) BIND(C, name='cudaMalloc')
USE, INTRINSIC :: ISO_C_BINDING
TYPE (C_PTR), INTENT(OUT) :: ip
INTEGER (C_SIZE_T), VALUE, INTENT(IN) :: nn
END FUNCTION cudaMalloc
INTEGER FUNCTION cudaFree(ip) BIND(C, name='cudaFree')
USE, INTRINSIC :: ISO_C_BINDING
TYPE (C_PTR), VALUE, INTENT(IN) :: ip
END FUNCTION cudaFree
INTEGER FUNCTION cudaMallocHost(ip, nn) BIND(C, name='cudaMallocHost')
USE, INTRINSIC :: ISO_C_BINDING
TYPE (C_PTR), INTENT(OUT) :: ip
INTEGER (C_SIZE_T), VALUE, INTENT(IN) :: nn
END FUNCTION cudaMallocHost
INTEGER FUNCTION cudaFreeHost(ip) BIND(C, name='cudaFreeHost')
USE, INTRINSIC :: ISO_C_BINDING
TYPE (C_PTR), VALUE, INTENT(IN) :: ip
END FUNCTION cudaFreeHost
INTEGER FUNCTION cudaMemcpy(p_out, p_in, nsize, itype) BIND(C, name='cudaMemcpy')
USE, INTRINSIC :: ISO_C_BINDING
TYPE (C_PTR), VALUE, INTENT(IN) :: p_out
TYPE (C_PTR), VALUE, INTENT(IN) :: p_in
INTEGER (C_SIZE_T), VALUE, INTENT(IN) :: nsize
INTEGER (C_INT), VALUE, INTENT(IN) :: itype
END FUNCTION cudaMemcpy
END INTERFACE
CONTAINS
SUBROUTINE printCudaLastError()
USE, INTRINSIC :: ISO_C_BINDING
IMPLICIT NONE
INTERFACE
INTEGER FUNCTION cudaGetLastError() BIND(C, name='cudaGetLastError')
END FUNCTION cudaGetLastError
FUNCTION cudaGetErrorString(n) RESULT(res) BIND(C, name='cudaGetErrorString')
USE, INTRINSIC :: ISO_C_BINDING
INTEGER, VALUE, INTENT(IN) :: n
TYPE (C_PTR) :: res
END FUNCTION cudaGetErrorString
END INTERFACE
CHARACTER (512), POINTER :: buff
TYPE (C_PTR) :: ibuff
INTEGER :: nerr, nlen
nerr = cudaGetLastError()
ibuff = cudaGetErrorString( nerr )
CALL C_F_POINTER(ibuff, buff)
nlen = INDEX(buff, ACHAR(0)) - 1
PRINT '(a)', buff(1:nlen)
RETURN
END SUBROUTINE printCudaLastError
SUBROUTINE CheckCudaRet(iret, text)
USE, INTRINSIC :: ISO_C_BINDING
IMPLICIT NONE
INTEGER, INTENT(IN) :: iret
CHARACTER (*), INTENT(IN), OPTIONAL :: text
IF (iret /= CUDA_SUCCESS) THEN
IF (PRESENT(text)) PRINT '(a)', text
CALL printCudaLastError()
STOP
END IF
RETURN
END SUBROUTINE CheckCudaRet
SUBROUTINE gpu_info()
USE, INTRINSIC :: ISO_C_BINDING
IMPLICIT NONE
TYPE (t_cudaDeviceProp) :: prop
INTEGER :: idev, ndev, iret
CALL cudaGetDeviceCount(ndev)
PRINT *, 'Number of Device(s) =', ndev
DO idev = 0, ndev - 1
iret = cudaGetDeviceProperties(prop, idev)
CALL CheckCudaRet(iret, 'cudaGetDeviceProperties')
PRINT '(1x, a, i4)', 'Device No.:', idev
PRINT '(1x, a, a )', 'name: ', prop%name( 1:INDEX(prop%name, ACHAR(0)) - 1 )
PRINT '(1x, a25, i10, a, i5, a)', 'totalGlobalMem: ', prop%totalGlobalMem, ' bytes =', &
prop%totalGlobalMem / 2**20, ' Mbytes'
PRINT '(1x, a25, i10, a, i5, a)', 'sharedMemPerBlock: ', prop%sharedMemPerBlock, ' bytes =', &
prop%sharedMemPerBlock / 2**10, ' Kbytes'
PRINT '(1x, a25, i10, a, i5, a)', 'regsPerBlock: ', prop%regsPerBlock, ' bytes =', &
prop%regsPerBlock / 2**10, ' Kbytes'
PRINT '(1x, a25, i10)', 'warpSize: ', prop%warpSize
PRINT '(1x, a25, i10)', 'memPitch: ', prop%memPitch
PRINT '(1x, a25, i10)', 'maxThreadsPerBlock: ', prop%maxThreadsPerBlock
PRINT '(1x, a25, 3i10)', 'maxThreadsDim: ', prop%maxThreadsDim
PRINT '(1x, a25, 3i10)', 'maxGridSize: ', prop%maxGridSize
PRINT '(1x, a25, i10, a, i5, a)', 'totalConstMem: ', prop%totalConstMem, ' bytes =', &
prop%totalConstMem / 2**10, ' Kbytes'
PRINT '(1x, a25, 2i10)', 'major, minor: ', prop%major, prop%minor
PRINT '(1x, a25, i10, a, f8.2, a)', 'clockRate: ', prop%clockRate, ' Hz =', &
prop%clockRate / 1.0e6, ' GHz'
PRINT '(1x, a25, i10)', 'textureAlignment: ', prop%textureAlignment
PRINT '(1x, a25, i10)', 'deviceOverlap: ', prop%deviceOverlap
PRINT '(1x, a25, i10)', 'multiProcessorCount: ', prop%multiProcessorCount
PRINT '(1x, a25, i10)', 'kernelExecTimeoutEnabled:', prop%kernelExecTimeoutEnabled
PRINT *
END DO
END SUBROUTINE gpu_info
END MODULE m_cuda
PROGRAM CudaTest
USE, INTRINSIC :: ISO_C_BINDING
USE :: m_cuda
IMPLICIT NONE
INTEGER, PARAMETER :: N = 2**24
REAL (8) :: t0, t1, tt0, tt1
CHARACTER(10) :: time0, time1
INTEGER :: i, iret, ic0, ic1, icrate
TYPE (C_PTR) :: h_c, h_b, d_p, d_q, d_r(10)
REAL (C_FLOAT), POINTER:: c(:), b(:)
INTEGER (C_SIZE_T) :: nn
CALL gpu_info()
iret = cudaSetDevice( 0 )
CALL CheckCudaRet(iret, 'cudaSetDevice')
nn = sizeof(C_FLOAT) * n
PRINT *, nn, 'bytes ', nn / 2**20, 'Mbytes'
iret = cudaMallocHost(h_c, nn)
CALL CheckCudaRet(iret, 'cudaMallocHost1')
CALL C_F_POINTER( h_c, c, [n] )
iret = cudaMallocHost(h_b, nn)
CALL CheckCudaRet(iret, 'cudaMallocHost2')
CALL C_F_POINTER( h_b, b, [n] )
c = 1.0
iret = cudaMalloc(d_p, nn)
CALL CheckCudaRet(iret, 'cudaMalloc1')
iret = cudaMalloc(d_q, nn)
CALL CheckCudaRet(iret, 'cudaMalloc2')
DO i = 1, 3
iret = cudaMalloc(d_r(i), nn)
PRINT *, 'Alloc ', i
CALL CheckCudaRet(iret, 'cudaMalloc3')
END DO
CALL CPU_TIME(t0)
CALL SYSTEM_CLOCK(ic0)
CALL date_and_time(time = time0)
DO i = 1, 500
iret = cudaMemcpy( d_p, h_c, nn, cudaMemcpyHostToDevice )
CALL CheckCudaRet(iret, 'cudaMemcpy1')
iret = cudaMemcpy( d_q, d_p, nn, cudaMemcpyDeviceToDevice )
CALL CheckCudaRet(iret, 'cudaMemcpy2')
iret = cudaMemcpy( h_b, d_q, nn, cudaMemcpyDeviceToHost )
CALL CheckCudaRet(iret, 'cudaMemcpy3')
iret = cudaMemcpy( d_r(1), h_c, nn, cudaMemcpyHostToDevice )
CALL CheckCudaRet(iret, 'cudaMemcpy1')
iret = cudaMemcpy( d_r(2), d_r(1), nn, cudaMemcpyDeviceToDevice )
CALL CheckCudaRet(iret, 'cudaMemcpy2')
iret = cudaMemcpy( h_b, d_r(2), nn, cudaMemcpyDeviceToHost )
CALL CheckCudaRet(iret, 'cudaMemcpy3')
iret = cudaMemcpy( d_r(3), h_c, nn, cudaMemcpyHostToDevice )
CALL CheckCudaRet(iret, 'cudaMemcpy1')
iret = cudaMemcpy( d_r(1), d_r(3), nn, cudaMemcpyDeviceToDevice )
CALL CheckCudaRet(iret, 'cudaMemcpy2')
iret = cudaMemcpy( h_b, d_r(1), nn, cudaMemcpyDeviceToHost )
CALL CheckCudaRet(iret, 'cudaMemcpy3')
END DO
CALL date_and_time(time = time1)
CALL CPU_TIME(t1)
CALL SYSTEM_CLOCK(ic1, icrate)
READ(time0, '(F10.3)') tt0
READ(time1, '(F10.3)') tt1
PRINT *, 'Copy Host->GPU->GPU->HOST: cpu_time = ', t1 - t0, 'sec'
PRINT *, ' DATE_AND_TIME = ', tt1 - tt0, 'sec'
IF ( ANY(c /= b) ) THEN
PRINT *, 'error: mismatch!'
ELSE
PRINT *, 'Copy Check OK', SUM(b) * sizeof(C_FLOAT), KIND(1.0_C_FLOAT)
END IF
iret = cudaFree(d_p)
CALL CheckCudaRet(iret, 'cudaFree')
iret = cudaFree(d_q)
CALL CheckCudaRet(iret, 'cudaFree')
DO i = 1, 3
iret = cudaFree(d_r(i))
PRINT *, 'Free ', i
CALL CheckCudaRet(iret, 'cudaFree')
END DO
iret = cudaFreeHost(h_c)
CALL CheckCudaRet(iret, 'cudaFreeHost')
iret = cudaFreeHost(h_b)
CALL CheckCudaRet(iret, 'cudaFreeHost')
STOP
END PROGRAM CudaTest