Here are two scripts I used to check whether MPI+GPU was working correctly and whether there is direct GPU-to-GPU communication:
mpitest.cpp (I don't know where I got this from):
```cpp
#include <stdio.h>
#include <string.h>
#include <mpi.h>
#include <mpi-ext.h>
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
    char *d_message;
    int myrank, tag = 99;
    MPI_Request req;
    MPI_Status stat;
    int flags;

    /* Initialize the MPI library */
    //MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &flags);
    MPI_Init(&argc, &argv);

    /* Determine unique id of the calling process of all processes participating
       in this MPI program. This id is usually called MPI rank. */
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    printf("Compile time check:\n");
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
    printf("This MPI library has CUDA-aware support.\n");
#elif defined(MPIX_CUDA_AWARE_SUPPORT) && !MPIX_CUDA_AWARE_SUPPORT
    printf("This MPI library does not have CUDA-aware support.\n");
#else
    printf("This MPI library cannot determine if there is CUDA-aware support.\n");
#endif /* MPIX_CUDA_AWARE_SUPPORT */

    printf("Run time check:\n");
#if defined(MPIX_CUDA_AWARE_SUPPORT)
    if (1 == MPIX_Query_cuda_support()) {
        printf("This MPI library has CUDA-aware support.\n");
    } else {
        printf("This MPI library does not have CUDA-aware support.\n");
    }
#else  /* !defined(MPIX_CUDA_AWARE_SUPPORT) */
    printf("This MPI library cannot determine if there is CUDA-aware support.\n");
#endif /* MPIX_CUDA_AWARE_SUPPORT */

    /* Enable peer access between devices 0 and 1, then allocate on this rank's device. */
    cudaError_t ret1, ret2;
    cudaSetDevice(0);
    ret1 = cudaDeviceEnablePeerAccess(1, 0);
    cudaSetDevice(1);
    ret2 = cudaDeviceEnablePeerAccess(0, 0);
    cudaSetDevice(myrank);
    cudaMallocManaged((void **)&d_message, 8e8);
    //cudaMalloc((void **)&d_message, 8e8);
    printf("rank %i's device ptr : %p, (%i,%i)\n", myrank, d_message, ret1, ret2);

    if (myrank == 0) {
        char h_message[20] = "Hello World";
        cudaMemcpy(d_message, h_message, sizeof(h_message), cudaMemcpyHostToDevice);
        MPI_Send(d_message, 8e8, MPI_CHAR, 1, tag, MPI_COMM_WORLD);
    } else {
        char h_message[20];
        MPI_Irecv(d_message, 8e8, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &req);
        MPI_Wait(&req, &stat);
        cudaMemcpy(h_message, d_message, sizeof(h_message), cudaMemcpyDeviceToHost);
        printf(" host_message : %s \n", h_message);
    }

    /* Finalize the MPI library to free resources acquired by it. */
    MPI_Finalize();
    cudaFree(d_message);
    return 0;
}
```
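As a rough guide (not from the original issue), assuming an MPI compiler wrapper such as `mpicxx` and the CUDA runtime on the default search paths, this can be built and run on 2 ranks with something like `mpicxx mpitest.cpp -o mpitest -lcudart` followed by `mpiexec -np 2 ./mpitest`; the exact include/library flags depend on where CUDA is installed.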
And cuda_test.jl:
```julia
using ClimaComms, NVTX, CUDA

context = ClimaComms.context()
iproc, nproc = ClimaComms.init(context)
if ClimaComms.iamroot(context)
    @show context
end

send_arr = CuArray{Float64}(undef, 1000, 1000)
recv_arr = CuArray{Float64}(undef, 1000, 1000)

buf = ClimaComms.graph_context(context,
    send_arr, [length(send_arr)], [mod1(iproc + 1, nproc)],
    recv_arr, [length(recv_arr)], [mod1(iproc - 1, nproc)])

NVTX.@range "round 1" begin
    ClimaComms.start(buf)
    ClimaComms.finish(buf)
end
NVTX.@range "round 2" begin
    ClimaComms.start(buf)
    ClimaComms.finish(buf)
end
NVTX.@range "round 3" begin
    ClimaComms.start(buf)
    ClimaComms.finish(buf)
end
```
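Before profiling, one could also add a quick correctness check to cuda_test.jl. The snippet below is a sketch I did not include in the original script, assuming the graph context delivers each rank's send buffer into the next rank's receive buffer as wired above:

```julia
# Hypothetical correctness check (not in the original script): fill the send
# buffer with this rank's id, do one exchange, and verify what arrived.
fill!(send_arr, Float64(iproc))
ClimaComms.start(buf)
ClimaComms.finish(buf)
expected = Float64(mod1(iproc - 1, nproc))   # id of the rank we receive from
@assert all(Array(recv_arr) .== expected)
```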
Run with
```sh
$MPITRAMPOLINE_MPIEXEC -np 4 nsys profile --trace=nvtx,cuda,mpi julia --project=. cuda_test.jl && nsys stats report1.nsys-rep | grep -C 5 "MemOps"
```
Returns
```
Processing [report1.sqlite] with [/opt/nvidia/nsight-systems/2024.2.3/host-linux-x64/reports/cuda_gpu_kern_sum.py]...
SKIPPED: report1.sqlite does not contain CUDA kernel data.

Processing [report1.sqlite] with [/opt/nvidia/nsight-systems/2024.2.3/host-linux-x64/reports/cuda_gpu_mem_time_sum.py]...

 ** CUDA GPU MemOps Summary (by Time) (cuda_gpu_mem_time_sum):

 Time (%)  Total Time (ns)  Count  Avg (ns)   Med (ns)   Min (ns)  Max (ns)  StdDev (ns)  Operation
 --------  ---------------  -----  ---------  ---------  --------  --------  -----------  ----------------------------
     52.9          670,558     62   10,815.5   11,552.0     1,696    13,408      2,090.9  [CUDA memcpy Host-to-Device]
     41.4          525,147     62    8,470.1    8,448.0     2,848    10,464        819.6  [CUDA memcpy Device-to-Host]
      5.7           72,864      2   36,432.0   36,432.0    33,408    39,456      4,276.6  [CUDA memcpy Peer-to-Peer]

Processing [report1.sqlite] with [/opt/nvidia/nsight-systems/2024.2.3/host-linux-x64/reports/cuda_gpu_mem_size_sum.py]...

 ** CUDA GPU MemOps Summary (by Size) (cuda_gpu_mem_size_sum):

 Total (MB)  Count  Avg (MB)  Med (MB)  Min (MB)  Max (MB)  StdDev (MB)  Operation
 ----------  -----  --------  --------  --------  --------  -----------  ----------------------------
     16.000      2     8.000     8.000     8.000     8.000        0.000  [CUDA memcpy Peer-to-Peer]
      8.000     62     0.129     0.131     0.007     0.131        0.016  [CUDA memcpy Device-to-Host]
```
where we can see which transfers go directly GPU-to-GPU (`[CUDA memcpy Peer-to-Peer]`) and which are staged through host memory (`[CUDA memcpy Host-to-Device]` / `[CUDA memcpy Device-to-Host]`).
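As a quicker sanity check, independent of the scripts above, MPI.jl can also report whether the underlying MPI library is CUDA-aware; a minimal sketch, assuming MPI.jl is installed in the active project and the script is launched under mpiexec:

```julia
# Minimal CUDA-aware MPI check via MPI.jl.
using MPI

MPI.Init()
if MPI.Comm_rank(MPI.COMM_WORLD) == 0
    # MPI.has_cuda() queries the library's CUDA-awareness
    # (e.g. via MPIX_Query_cuda_support for Open MPI builds).
    println("CUDA-aware MPI: ", MPI.has_cuda())
end
MPI.Finalize()
```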