またぞろ簡単素朴な積分をやってみます。
を計算します。 を GPU の各スレッドごとに異なる について計算してもらって、積分範囲全域での被積分関数の値を配列に一気に返してもらいます。
Fortran ソース
前半は Fortran 側で CUDA ルーチンを直接呼んで、ビデオカードの情報を得ています。
後半では、簡単な台形積分をやって π を求めています。各積分点ごとのローレンツ型関数の値を GPU 側でスレッド並列で計算し配列に返してもらいます。簡単のため CPU側(Fortran 側)で総和処理をしています。
h を渡したのに使ってないとか、色々無駄がありますが勘弁してもらって・・
MODULE m_cuda USE, INTRINSIC :: ISO_C_BINDING IMPLICIT NONE ! TYPE, BIND(C) :: t_cudaDeviceProp CHARACTER (256) :: name INTEGER(C_LONG_LONG) :: totalGlobalMem ! t_size = C_LONG_LONG = 8bytes INTEGER(C_LONG_LONG) :: sharedMemPerBlock INTEGER(C_INT) :: regsPerBlock INTEGER(C_INT) :: warpSize INTEGER(C_LONG_LONG) :: memPitch INTEGER(C_INT) :: maxThreadsPerBlock INTEGER(C_INT) :: maxThreadsDim(3) INTEGER(C_INT) :: maxGridSize(3) INTEGER(C_INT) :: clockRate INTEGER(C_LONG_LONG) :: totalConstMem INTEGER(C_INT) :: major INTEGER(C_INT) :: minor INTEGER(C_LONG_LONG) :: textureAlignment INTEGER(C_INT) :: deviceOverlap INTEGER(C_INT) :: multiProcessorCount INTEGER(C_INT) :: kernelExecTimeoutEnabled INTEGER(C_INT) :: cudaReserved(39) END TYPE ! INTERFACE SUBROUTINE cudaGetDeviceCount(n) BIND(C, name='cudaGetDeviceCount') INTEGER, INTENT(OUT) :: n END SUBROUTINE cudaGetDeviceCount ! INTEGER FUNCTION cudaGetDeviceProperties(cudaDeviceProp, idev) BIND(C, name='cudaGetDeviceProperties') USE, INTRINSIC :: ISO_C_BINDING IMPORT TYPE (t_cudaDeviceProp), INTENT(OUT) :: cudaDeviceProp INTEGER, VALUE, INTENT(IN) :: idev END FUNCTION cudaGetDeviceProperties ! INTEGER FUNCTION cudaSetDevice(idev) BIND(C, name='cudaSetDevice') INTEGER, VALUE, INTENT(IN) :: idev END FUNCTION cudaSetDevice END INTERFACE END MODULE m_cuda !====================================================== PROGRAM CudaTest USE, INTRINSIC :: ISO_C_BINDING USE :: m_cuda IMPLICIT NONE ! INTERFACE SUBROUTINE cudafunction(n, h, c) BIND(C, name='cudafunction') USE, INTRINSIC :: ISO_C_BINDING IMPORT INTEGER , INTENT(IN) :: n REAL (C_FLOAT), INTENT(IN) :: h REAL (C_FLOAT), INTENT(OUT):: c(N) END SUBROUTINE cudafunction END INTERFACE ! TYPE (t_cudaDeviceProp) :: prop ! !define length of the array INTEGER, PARAMETER :: N = 1024 ! REAL (C_FLOAT) :: c(N) REAL :: s, h INTEGER :: i, idev, ndev, iret ! CALL cudaGetDeviceCount(ndev) PRINT *, 'Number of Device(s) =', ndev DO idev = 0, ndev - 1 iret = cudaGetDeviceProperties(prop, idev) IF (iret /= 0) THEN PRINT '(a, i6)', 'cudaGetDeviceProperties: error code = ', iret STOP END IF PRINT '(1x, a, i4)', 'Device No.:', idev PRINT '(1x, a, a )', 'name: ', prop%name( 1:INDEX(prop%name, ACHAR(0)) - 1 ) PRINT '(1x, a25, i10, a, i5, a)', 'totalGlobalMem: ', prop%totalGlobalMem, ' bytes =', & prop%totalGlobalMem / 2**20, ' Mbytes' PRINT '(1x, a25, i10, a, i5, a)', 'sharedMemPerBlock: ', prop%sharedMemPerBlock, ' bytes =', & prop%sharedMemPerBlock / 2**10, ' Kbytes' PRINT '(1x, a25, i10, a, i5, a)', 'regsPerBlock: ', prop%regsPerBlock, ' bytes =', & prop%regsPerBlock / 2**10, ' Kbytes' PRINT '(1x, a25, i10)', 'warpSize: ', prop%warpSize PRINT '(1x, a25, i10)', 'memPitch: ', prop%memPitch PRINT '(1x, a25, i10)', 'maxThreadsPerBlock: ', prop%maxThreadsPerBlock PRINT '(1x, a25, 3i10)', 'maxThreadsDim: ', prop%maxThreadsDim PRINT '(1x, a25, 3i10)', 'maxGridSize: ', prop%maxGridSize PRINT '(1x, a25, i10, a, i5, a)', 'totalConstMem: ', prop%totalConstMem, ' bytes =', & prop%totalConstMem / 2**10, ' Kbytes' PRINT '(1x, a25, 2i10)', 'major, minor: ', prop%major, prop%minor PRINT '(1x, a25, i10, a, f8.2, a)', 'clockRate: ', prop%clockRate, ' Hz =', & prop%clockRate / 1.0e6, ' GHz' PRINT '(1x, a25, i10)', 'textureAlignment: ', prop%textureAlignment PRINT '(1x, a25, i10)', 'deviceOverlap: ', prop%deviceOverlap PRINT '(1x, a25, i10)', 'multiProcessorCount: ', prop%multiProcessorCount PRINT '(1x, a25, i10)', 'kernelExecTimeoutEnabled:', prop%kernelExecTimeoutEnabled PRINT * END DO idev = 0 iret = cudaSetDevice(idev) IF (iret /= 0) THEN PRINT *, 'cudaSetDevice: error code =', iret STOP END IF ! h = (1.0 - 0.0) / (N - 1) ! Fortran -> C -> CUDA ->C ->Fortran CALL cudafunction(N, h, c) ! !print *, c ! s = h * ( SUM(c) - 0.5 * (c(1) + c(N)) ) ! Trapezoidal integral : Int_0^1 1/ (1 + x^2) dx = pi / 4 PRINT *, 'PI =', s * 4.0 STOP END PROGRAM CudaTest
CU ソース
#include <stdio.h> #include "cuda.h" /* Define CUDA kernel that squares the input complex array */ __global__ void lorentz(float *in, float *out, int N) { unsigned int index = blockIdx.x * blockDim.x + threadIdx.x; float x = (float) index / (float) (N - 1); if( index<N ) { out[index] = 1.0 / (1.0 + x * x); } } /* Fortran subroutine arguments are passed by references. call fun( array_a, array_b, N) will be mapped to function (*a, *b, *N); */ extern "C" void cudafunction(int *Np, float *h, float *a) { int block_size=4; float *a_d; int N=*Np; /* Allocate complex array on device */ cudaMalloc ((void **) &a_d , sizeof(float)*N); // /* Copy array from host memory to device memory */ // cudaMemcpy( a_d, a, sizeof(float)*N ,cudaMemcpyHostToDevice); /* Compute execution configuration */ dim3 dimBlock(block_size); dim3 dimGrid (N/dimBlock.x); if( N % block_size != 0 ) dimGrid.x+=1; /* Execute the kernel */ lorentz<<<dimGrid, dimBlock>>>(a_d, a_d, N); /* Copy the result back */ cudaMemcpy( a, a_d, sizeof(float) * N, cudaMemcpyDeviceToHost ); /* Free memory on the device */ cudaFree(a_d); return; }