StartDate: 2025-07-02 08:05:22+00:00 CpuId: 12x Intel Xeon W 2000 / D-2100 (Skylake / Cascade Lake) {Skylake}, 14nm GpuId: 1x Tesla V100-SXM2-16GB CommitSHA: c9ed8e0e9ce80a0104790ac3426f6468684713f5 CommitTime: 2025-06-27 16:50:44 +0200 CommitAuthor: Matthias Krack CommitSubject: Add Daint Spack psmp CI tester to dashboard #################### Building Image cp2k-perf-cuda-volta #################### Dockerfile: /tools/docker/Dockerfile.test_performance_cuda_V100 Build-Path: / Build-Args: GIT_COMMIT_SHA=c9ed8e0e9ce80a0104790ac3426f6468684713f5 SPACK_CACHE=gs://cp2k-spack-cache Build-Cache: Yes Populating docker build cache... done. DEPRECATED: The legacy builder is deprecated and will be removed in a future release. BuildKit is currently disabled; enable it by removing the DOCKER_BUILDKIT=0 environment-variable. Sending build context to Docker daemon 404.7MB Step 1/48 : FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 11.8.0-devel-ubuntu22.04: Pulling from nvidia/cuda aece8493d397: Already exists 5e3b7ee77381: Already exists 5bd037f007fd: Already exists 4cda774ad2ec: Already exists 775f22adee62: Already exists 263fc748118f: Already exists 16c36d0187d0: Already exists e7a56570655c: Already exists 507fc9045cba: Already exists 23b7d8e07c16: Already exists 922ac8fcb889: Already exists Digest: sha256:94fd755736cb58979173d491504f0b573247b1745250249415b07fefc738e41f Status: Downloaded newer image for nvidia/cuda:11.8.0-devel-ubuntu22.04 ---> 6f9cc9f1ba9e Step 2/48 : ENV CUDA_PATH /usr/local/cuda ---> Using cache ---> ea6c9bc4eda6 Step 3/48 : ENV LD_LIBRARY_PATH /usr/local/cuda/lib64 ---> Using cache ---> 223d787cdd89 Step 4/48 : ENV CUDA_CACHE_DISABLE 1 ---> Using cache ---> 1774168f85a8 Step 5/48 : RUN apt-get update -qq && apt-get install -qq --no-install-recommends gfortran mpich libmpich-dev && rm -rf /var/lib/apt/lists/* ---> Using cache ---> cd35350707cc Step 6/48 : WORKDIR /opt/cp2k-toolchain ---> Using cache ---> ca16f8d162c0 Step 7/48 : COPY ./tools/toolchain/install_requirements*.sh ./ ---> Using cache ---> a82ad23d0c42 Step 8/48 : RUN ./install_requirements.sh ubuntu ---> Using cache ---> 45f0debb08b0 Step 9/48 : RUN mkdir scripts ---> Using cache ---> 0d1096f378b4 Step 10/48 : COPY ./tools/toolchain/scripts/VERSION ./tools/toolchain/scripts/parse_if.py ./tools/toolchain/scripts/tool_kit.sh ./tools/toolchain/scripts/common_vars.sh ./tools/toolchain/scripts/signal_trap.sh ./tools/toolchain/scripts/get_openblas_arch.sh ./scripts/ ---> Using cache ---> 187abb9e9e29 Step 11/48 : COPY ./tools/toolchain/install_cp2k_toolchain.sh . ---> Using cache ---> 58793f6f8316 Step 12/48 : RUN ./install_cp2k_toolchain.sh --mpi-mode=mpich --enable-cuda=yes --gpu-ver=V100 --with-dbcsr=no --dry-run ---> Using cache ---> f60bf5ef83c9 Step 13/48 : COPY ./tools/toolchain/scripts/stage0/ ./scripts/stage0/ ---> Using cache ---> 615f61b6b3df Step 14/48 : RUN ./scripts/stage0/install_stage0.sh && rm -rf ./build ---> Using cache ---> 2a876540b3d7 Step 15/48 : COPY ./tools/toolchain/scripts/stage1/ ./scripts/stage1/ ---> Using cache ---> baa5d92fe084 Step 16/48 : RUN ./scripts/stage1/install_stage1.sh && rm -rf ./build ---> Using cache ---> e3e6afa94777 Step 17/48 : COPY ./tools/toolchain/scripts/stage2/ ./scripts/stage2/ ---> Using cache ---> 98da21f1d8cd Step 18/48 : RUN ./scripts/stage2/install_stage2.sh && rm -rf ./build ---> Using cache ---> 20e16a32e18d Step 19/48 : COPY ./tools/toolchain/scripts/stage3/ ./scripts/stage3/ ---> Using cache ---> 1f8fa006e14e Step 20/48 : RUN ./scripts/stage3/install_stage3.sh && rm -rf ./build ---> Using cache ---> 3f1d8ae971e0 Step 21/48 : COPY ./tools/toolchain/scripts/stage4/ ./scripts/stage4/ ---> Using cache ---> 3acaa15b7413 Step 22/48 : RUN ./scripts/stage4/install_stage4.sh && rm -rf ./build ---> Using cache ---> 3552b2fd2993 Step 23/48 : COPY ./tools/toolchain/scripts/stage5/ ./scripts/stage5/ ---> Using cache ---> edcd169bd7c9 Step 24/48 : RUN ./scripts/stage5/install_stage5.sh && rm -rf ./build ---> Using cache ---> 042a8b9fc0b3 Step 25/48 : COPY ./tools/toolchain/scripts/stage6/ ./scripts/stage6/ ---> Using cache ---> b96ad9fd5a02 Step 26/48 : RUN ./scripts/stage6/install_stage6.sh && rm -rf ./build ---> Using cache ---> b0fc22e1c685 Step 27/48 : COPY ./tools/toolchain/scripts/stage7/ ./scripts/stage7/ ---> Using cache ---> 928c58b6ad5c Step 28/48 : RUN ./scripts/stage7/install_stage7.sh && rm -rf ./build ---> Using cache ---> 361444fc445e Step 29/48 : COPY ./tools/toolchain/scripts/stage8/ ./scripts/stage8/ ---> Using cache ---> 3300e5d561ea Step 30/48 : RUN ./scripts/stage8/install_stage8.sh && rm -rf ./build ---> Using cache ---> eca4670efefd Step 31/48 : COPY ./tools/toolchain/scripts/stage9/ ./scripts/stage9/ ---> Using cache ---> 3953a584de67 Step 32/48 : RUN ./scripts/stage9/install_stage9.sh && rm -rf ./build ---> Using cache ---> 28dc3defeb97 Step 33/48 : COPY ./tools/toolchain/scripts/arch_base.tmpl ./tools/toolchain/scripts/generate_arch_files.sh ./scripts/ ---> Using cache ---> ad1bf3673df9 Step 34/48 : RUN ./scripts/generate_arch_files.sh && rm -rf ./build ---> Using cache ---> f4b3a42a46e5 Step 35/48 : WORKDIR /opt/cp2k ---> Using cache ---> 0ff04754db6a Step 36/48 : COPY ./Makefile . ---> Using cache ---> 1b54fa971215 Step 37/48 : COPY ./src ./src ---> c88e35c79220 Step 38/48 : COPY ./exts ./exts ---> 6e6fa345a550 Step 39/48 : COPY ./tools/build_utils ./tools/build_utils ---> 1476550f437b Step 40/48 : RUN /bin/bash -c " mkdir -p arch && ln -vs /opt/cp2k-toolchain/install/arch/local_cuda.psmp ./arch/" ---> Running in bea970e8d3f7 './arch/local_cuda.psmp' -> '/opt/cp2k-toolchain/install/arch/local_cuda.psmp' Removing intermediate container bea970e8d3f7 ---> e19f41983c4a Step 41/48 : COPY ./data ./data ---> a3fcd9c03c91 Step 42/48 : COPY ./tests ./tests ---> 58c69becf3dd Step 43/48 : COPY ./tools/regtesting ./tools/regtesting ---> ffbe0b23bbf7 Step 44/48 : COPY ./benchmarks ./benchmarks ---> 29c541747f44 Step 45/48 : COPY ./tools/docker/scripts/test_performance.sh ./tools/docker/scripts/plot_performance.py ./ ---> ae3e8d6c5fe5 Step 46/48 : RUN ./test_performance.sh "local_cuda" 2>&1 | tee report.log ---> Running in 3bf264c97c23 ========== Compiling CP2K ========== Compiling cp2k... done. Checking benchmark inputs... Found 77 input files and 0 errors. ========== Running Performance Test ========== Running H2O-64.inp with 3 threads and 2 ranks... done. From /workspace/artifacts/H2O-64_6cpu_1gpu.out: ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.028 0.028 96.691 96.691 qs_mol_dyn_low 1 2.0 0.004 0.004 96.231 96.234 qs_forces 11 3.9 0.002 0.002 96.185 96.185 qs_energies 11 4.9 0.001 0.001 84.770 84.773 scf_env_do_scf 11 5.9 0.001 0.001 68.262 68.262 velocity_verlet 10 3.0 0.001 0.001 58.659 58.676 scf_env_do_scf_inner_loop 108 6.5 0.006 0.008 58.015 58.015 rebuild_ks_matrix 119 8.3 0.001 0.001 28.338 28.344 qs_ks_build_kohn_sham_matrix 119 9.3 0.016 0.016 28.338 28.343 dbcsr_multiply_generic 2286 12.5 0.119 0.119 26.524 26.566 qs_ks_update_qs_env 119 7.6 0.001 0.001 25.728 25.733 qs_scf_new_mos 108 7.5 0.001 0.001 20.442 20.443 qs_scf_loop_do_ot 108 8.5 0.001 0.001 20.441 20.442 ot_scf_mini 108 9.5 0.003 0.003 18.566 18.571 qs_rho_update_rho_low 119 7.7 0.001 0.001 16.608 16.627 calculate_rho_elec 119 8.7 0.878 0.884 16.608 16.626 fft_wrap_pw1pw2 1201 11.6 0.021 0.022 15.281 15.307 sum_up_and_integrate 119 10.3 0.002 0.002 14.845 14.874 integrate_v_rspace 119 11.3 0.335 0.338 14.751 14.781 multiply_cannon 2286 13.5 0.323 0.324 14.412 14.428 multiply_cannon_loop 2286 14.5 0.222 0.223 13.395 13.416 fft_wrap_pw1pw2_140 487 12.2 0.002 0.002 13.292 13.335 ot_mini 108 10.5 0.001 0.001 11.192 11.195 make_m2s 4572 13.5 0.040 0.041 10.558 10.565 make_images 4572 14.5 1.138 1.146 10.384 10.390 init_scf_loop 11 6.9 0.000 0.000 10.168 10.169 density_rs2pw 119 9.7 0.007 0.007 9.993 10.110 grid_integrate_task_list 119 12.3 8.442 8.474 8.442 8.474 init_scf_run 11 5.9 0.001 0.001 7.972 7.972 scf_env_initial_rho_setup 11 6.9 0.000 0.001 7.971 7.971 qs_energies_init_hamiltonians 11 5.9 0.000 0.000 7.908 7.908 build_core_hamiltonian_matrix_ 11 4.9 0.001 0.001 7.705 7.847 pw_gpu_r3dc1d_3d_ps 606 13.1 2.392 2.410 7.820 7.837 multiply_cannon_multrec 4572 15.5 2.097 2.112 7.725 7.757 pw_gpu_c1dr3d_3d_ps 595 14.2 2.313 2.330 7.434 7.445 qs_ot_get_derivative 108 11.5 0.001 0.001 6.992 6.995 prepare_preconditioner 11 7.9 0.000 0.000 6.934 6.935 make_preconditioner 11 8.9 0.000 0.000 6.934 6.935 hybrid_alltoall_any 4725 16.4 4.815 4.827 6.147 6.167 make_full_inverse_cholesky 11 9.9 0.000 0.000 5.871 6.117 make_images_data 4572 15.5 0.055 0.056 6.043 6.050 potential_pw2rs 119 12.3 0.035 0.036 5.973 5.973 grid_collocate_task_list 119 9.7 5.709 5.795 5.709 5.795 dbcsr_mm_accdrv_process 9594 16.2 0.679 0.682 5.245 5.258 ot_diis_step 108 11.5 0.005 0.005 4.179 4.179 build_core_ppl_forces 11 5.9 3.902 4.019 3.902 4.019 calculate_dm_sparse 119 9.5 0.001 0.001 3.932 3.936 jit_kernel_multiply 12 15.8 3.900 3.908 3.900 3.908 qs_env_update_s_mstruct 11 6.9 0.000 0.000 3.834 3.901 build_core_hamiltonian_matrix 11 6.9 0.001 0.001 3.689 3.744 calculate_first_density_matrix 1 7.0 0.000 0.000 3.737 3.737 apply_preconditioner_dbcsr 119 12.6 0.000 0.000 3.679 3.680 apply_single 119 13.6 0.000 0.001 3.679 3.679 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 3.565 3.565 pw_poisson_solve 119 10.3 0.003 0.003 3.463 3.481 dbcsr_complete_redistribute 329 12.2 1.305 1.328 3.198 3.455 wfi_extrapolate 11 7.9 0.001 0.001 3.452 3.452 cp_dbcsr_sm_fm_multiply 37 9.5 0.001 0.001 3.400 3.401 mp_alltoall_z22v 1201 15.6 3.193 3.271 3.193 3.271 qs_create_task_list 11 7.9 0.000 0.000 3.086 3.210 generate_qs_task_list 11 8.9 1.162 1.174 3.086 3.210 multiply_cannon_sync_h2d 4572 15.5 3.126 3.154 3.126 3.154 qs_ot_get_p 119 10.4 0.001 0.001 2.979 2.979 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 2.906 2.908 mp_waitall_1 64495 16.9 2.739 2.784 2.739 2.784 transfer_rs2pw 487 10.6 0.007 0.007 2.445 2.605 qs_ot_get_derivative_taylor 59 13.0 0.002 0.002 2.505 2.510 copy_dbcsr_to_fm 153 11.3 0.003 0.003 2.494 2.504 cp_dbcsr_plus_fm_fm_t 22 8.9 0.000 0.001 2.350 2.352 qs_vxc_create 119 10.3 0.002 0.002 2.215 2.246 xc_vxc_pw_create 119 11.3 0.666 0.672 2.213 2.244 transfer_rs2pw_140 130 11.5 1.512 1.521 2.043 2.207 x_to_yz 595 15.2 0.494 0.502 2.073 2.116 yz_to_x 606 14.1 0.478 0.482 2.092 2.116 pw_gpu_fg 606 14.1 2.061 2.081 2.061 2.081 cp_fm_cholesky_invert 11 10.9 2.067 2.067 2.067 2.067 dbcsr_special_finalize 6858 15.5 0.035 0.035 2.059 2.061 transfer_dbcsr_to_fm 11 10.9 0.001 0.001 2.009 2.018 build_core_ppl 11 7.9 1.945 1.987 1.945 1.987 dbcsr_merge_single_wm 4572 16.5 0.137 0.140 1.951 1.954 ------------------------------------------------------------------------------- Plot: name="H2O-64_timings_6cpu_1gpu", title="Timings of H2O-64 with 6 CPU Cores and 1 GPU", ylabel="time [s]" PlotPoint: plot="H2O-64_timings_6cpu_1gpu", name="rest", label="rest", y=69.923, yerr=0.0 PlotPoint: plot="H2O-64_timings_6cpu_1gpu", name="grid_integrate_task_list", label="grid_integrate_task_list", y=8.442, yerr=0.0 PlotPoint: plot="H2O-64_timings_6cpu_1gpu", name="grid_collocate_task_list", label="grid_collocate_task_list", y=5.709, yerr=0.0 PlotPoint: plot="H2O-64_timings_6cpu_1gpu", name="hybrid_alltoall_any", label="hybrid_alltoall_any", y=4.815, yerr=0.0 PlotPoint: plot="H2O-64_timings_6cpu_1gpu", name="build_core_ppl_forces", label="build_core_ppl_forces", y=3.902, yerr=0.0 PlotPoint: plot="H2O-64_timings_6cpu_1gpu", name="jit_kernel_multiply", label="jit_kernel_multiply", y=3.9, yerr=0.0 Running H2O-64_nonortho.inp with 3 threads and 2 ranks... done. From /workspace/artifacts/H2O-64_nonortho_6cpu_1gpu.out: ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.024 0.025 94.723 94.723 qs_mol_dyn_low 1 2.0 0.004 0.004 94.262 94.265 qs_forces 11 3.9 0.002 0.002 94.215 94.215 qs_energies 11 4.9 0.001 0.001 82.608 82.610 scf_env_do_scf 11 5.9 0.001 0.001 65.482 65.483 velocity_verlet 10 3.0 0.001 0.002 58.286 58.303 scf_env_do_scf_inner_loop 96 6.5 0.005 0.007 54.908 54.908 rebuild_ks_matrix 107 8.3 0.001 0.001 27.636 27.638 qs_ks_build_kohn_sham_matrix 107 9.3 0.015 0.015 27.635 27.637 qs_ks_update_qs_env 107 7.6 0.001 0.001 24.591 24.591 dbcsr_multiply_generic 1966 12.4 0.105 0.105 24.408 24.496 qs_scf_new_mos 96 7.5 0.001 0.001 18.369 18.374 qs_scf_loop_do_ot 96 8.5 0.001 0.001 18.369 18.373 qs_rho_update_rho_low 107 7.7 0.001 0.001 17.126 17.144 calculate_rho_elec 107 8.7 0.790 0.797 17.126 17.143 ot_scf_mini 96 9.5 0.002 0.002 16.706 16.708 sum_up_and_integrate 107 10.3 0.002 0.002 15.385 15.479 integrate_v_rspace 107 11.3 0.301 0.303 15.300 15.395 fft_wrap_pw1pw2 1081 11.6 0.019 0.020 13.887 13.923 multiply_cannon 1966 13.4 0.282 0.289 13.381 13.394 multiply_cannon_loop 1966 14.4 0.196 0.199 12.510 12.518 fft_wrap_pw1pw2_140 439 12.2 0.002 0.002 12.102 12.186 init_scf_loop 11 6.9 0.000 0.000 10.495 10.495 ot_mini 96 10.5 0.001 0.001 10.173 10.173 grid_integrate_task_list 107 12.3 9.588 9.683 9.588 9.683 make_m2s 3932 13.4 0.035 0.036 9.574 9.598 make_images 3932 14.4 1.031 1.039 9.418 9.441 density_rs2pw 107 9.7 0.006 0.006 9.049 9.153 qs_energies_init_hamiltonians 11 5.9 0.000 0.000 8.339 8.339 init_scf_run 11 5.9 0.001 0.001 8.149 8.149 scf_env_initial_rho_setup 11 6.9 0.000 0.001 8.148 8.149 build_core_hamiltonian_matrix_ 11 4.9 0.001 0.001 7.557 7.684 multiply_cannon_multrec 3932 15.4 1.867 1.936 7.438 7.469 grid_collocate_task_list 107 9.7 7.266 7.346 7.266 7.346 pw_gpu_r3dc1d_3d_ps 546 13.1 2.170 2.200 7.142 7.172 prepare_preconditioner 11 7.9 0.000 0.000 7.104 7.117 make_preconditioner 11 8.9 0.000 0.000 7.104 7.117 pw_gpu_c1dr3d_3d_ps 535 14.2 2.090 2.112 6.720 6.728 qs_ot_get_derivative 96 11.5 0.001 0.001 6.445 6.448 make_full_inverse_cholesky 11 9.9 0.000 0.000 6.027 6.273 hybrid_alltoall_any 4079 16.3 4.354 4.364 5.622 5.648 make_images_data 3932 15.4 0.048 0.048 5.480 5.485 potential_pw2rs 107 12.3 0.032 0.033 5.410 5.411 dbcsr_mm_accdrv_process 8450 16.1 1.334 1.470 5.229 5.264 qs_env_update_s_mstruct 11 6.9 0.000 0.000 4.186 4.392 build_core_ppl_forces 11 5.9 3.851 3.946 3.851 3.946 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 3.916 3.917 build_core_hamiltonian_matrix 11 6.9 0.001 0.001 3.730 3.793 calculate_dm_sparse 107 9.5 0.001 0.001 3.742 3.743 ot_diis_step 96 11.5 0.004 0.004 3.709 3.709 calculate_first_density_matrix 1 7.0 0.000 0.000 3.678 3.678 wfi_extrapolate 11 7.9 0.001 0.001 3.646 3.646 dbcsr_complete_redistribute 317 12.2 1.258 1.330 3.333 3.580 qs_create_task_list 11 7.9 0.000 0.000 3.423 3.563 generate_qs_task_list 11 8.9 1.499 1.501 3.423 3.563 jit_kernel_multiply 10 15.5 3.287 3.397 3.287 3.397 apply_preconditioner_dbcsr 107 12.6 0.000 0.000 3.330 3.340 apply_single 107 13.6 0.000 0.001 3.330 3.339 cp_dbcsr_sm_fm_multiply 37 9.5 0.001 0.001 3.307 3.308 pw_poisson_solve 107 10.3 0.003 0.003 3.128 3.137 mp_alltoall_z22v 1081 15.6 2.924 3.019 2.924 3.019 multiply_cannon_sync_h2d 3932 15.4 2.778 2.856 2.778 2.856 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 2.802 2.803 copy_dbcsr_to_fm 147 11.2 0.003 0.003 2.653 2.682 qs_ot_get_p 107 10.4 0.001 0.001 2.610 2.612 mp_waitall_1 55487 16.8 2.449 2.514 2.449 2.514 transfer_rs2pw 439 10.6 0.006 0.007 2.235 2.392 cp_dbcsr_plus_fm_fm_t 22 8.9 0.000 0.001 2.377 2.377 qs_ot_get_derivative_taylor 53 13.0 0.002 0.002 2.256 2.267 transfer_dbcsr_to_fm 11 10.9 0.001 0.001 2.164 2.190 cp_fm_cholesky_invert 11 10.9 2.041 2.041 2.041 2.041 qs_vxc_create 107 10.3 0.002 0.002 2.021 2.039 xc_vxc_pw_create 107 11.3 0.598 0.603 2.019 2.038 transfer_rs2pw_140 118 11.5 1.374 1.385 1.871 2.033 build_core_ppl 11 7.9 1.964 2.015 1.964 2.015 pw_gpu_fg 546 14.1 1.907 1.954 1.907 1.954 yz_to_x 546 14.1 0.429 0.433 1.906 1.954 x_to_yz 535 15.2 0.446 0.453 1.893 1.928 ------------------------------------------------------------------------------- Plot: name="H2O-64_nonortho_timings_6cpu_1gpu", title="Timings of H2O-64_nonortho with 6 CPU Cores and 1 GPU", ylabel="time [s]" PlotPoint: plot="H2O-64_nonortho_timings_6cpu_1gpu", name="rest", label="rest", y=66.37700000000001, yerr=0.0 PlotPoint: plot="H2O-64_nonortho_timings_6cpu_1gpu", name="grid_integrate_task_list", label="grid_integrate_task_list", y=9.588, yerr=0.0 PlotPoint: plot="H2O-64_nonortho_timings_6cpu_1gpu", name="grid_collocate_task_list", label="grid_collocate_task_list", y=7.266, yerr=0.0 PlotPoint: plot="H2O-64_nonortho_timings_6cpu_1gpu", name="hybrid_alltoall_any", label="hybrid_alltoall_any", y=4.354, yerr=0.0 PlotPoint: plot="H2O-64_nonortho_timings_6cpu_1gpu", name="build_core_ppl_forces", label="build_core_ppl_forces", y=3.851, yerr=0.0 PlotPoint: plot="H2O-64_nonortho_timings_6cpu_1gpu", name="jit_kernel_multiply", label="jit_kernel_multiply", y=3.287, yerr=0.0 Running GW_PBE_4benzene.inp with 3 threads and 2 ranks... done. From /workspace/artifacts/GW_PBE_4benzene_6cpu_1gpu.out: ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.017 0.018 105.013 105.014 qs_energies 1 2.0 0.000 0.000 104.638 104.642 mp2_main 1 3.0 0.000 0.000 97.039 97.042 mp2_gpw_main 1 4.0 0.000 0.000 94.120 94.124 rpa_ri_compute_en 1 5.0 0.000 0.000 86.514 86.517 rpa_num_int 1 6.0 0.000 0.001 86.506 86.509 dbt_total 2336 9.6 0.022 0.022 68.184 68.184 compute_mat_P_omega 1 7.0 0.001 0.002 67.239 67.242 compute_mat_P_omega_contract 10 8.0 5.294 5.427 66.953 66.955 dbt_contract 787 11.0 0.052 0.052 46.066 46.066 dbt_tas_total 1149 12.2 0.124 0.125 36.049 36.049 dbt_tas_multiply 807 12.1 0.003 0.003 35.352 35.352 dbt_tas_dbm 807 14.1 0.005 0.005 27.980 27.980 dbm_multiply 807 16.1 26.667 26.930 26.667 26.930 compute_mat_P_omega_calc_M_occ 250 9.0 5.304 5.419 23.889 23.889 dbt_copy 1107 10.7 0.071 0.072 22.914 23.084 dbt_tas_mm_1N 524 15.1 0.002 0.002 18.080 18.352 compute_mat_P_omega_calc_M_vir 250 9.0 0.002 0.002 15.013 15.014 dbt_reshape 594 11.8 6.543 6.575 14.883 14.939 compute_QP_energies 1 7.0 0.000 0.000 12.830 12.830 compute_self_energy_cubic_gw 1 8.0 0.116 0.118 12.829 12.829 dbt_tas_reserve_blocks_index 3266 14.3 0.519 0.521 10.595 10.719 dbm_reserve_blocks 3634 15.3 10.389 10.510 10.389 10.510 dbt_reserve_blocks_index 2347 13.0 0.217 0.218 8.829 9.064 compute_mat_P_omega_calc_P_t 250 9.0 0.001 0.001 8.843 8.843 dbt_reserve_blocks_index_array 2289 12.1 0.012 0.013 8.626 8.830 dbt_crop 1042 12.0 6.109 6.183 8.308 8.403 dbcsr_multiply_generic 30 8.1 0.002 0.002 7.705 7.766 dbt_tas_mm_2 251 15.0 0.002 0.002 7.624 7.624 mp2_ri_gpw_compute_in 1 5.0 0.001 0.001 7.596 7.597 multiply_cannon 30 9.1 0.011 0.013 7.512 7.578 multiply_cannon_loop 30 10.1 0.004 0.004 7.459 7.527 scf_env_do_scf 1 3.0 0.000 0.000 6.874 6.874 scf_env_do_scf_inner_loop 17 4.0 0.001 0.001 6.874 6.874 mp_waitall_2 2656 15.9 6.242 6.256 6.242 6.256 contract_cubic_gw 21 9.0 0.000 0.000 6.228 6.228 dbt_communicate_buffer 594 12.8 0.009 0.010 5.723 5.731 multiply_cannon_multrec 60 11.1 0.283 0.310 5.525 5.554 dbcsr_mm_accdrv_process 328 12.3 0.638 1.256 4.973 4.977 jit_kernel_multiply 17 11.4 4.328 4.951 4.328 4.951 compute_mat_P_omega_copy_M_vir 250 9.0 0.001 0.001 4.820 4.857 compute_mat_P_omega_copy_M_occ 250 9.0 0.001 0.001 4.607 4.622 dbt_tas_copy 511 11.5 2.474 2.521 4.252 4.413 qs_scf_new_mos 17 5.0 0.000 0.000 4.322 4.364 get_2c_integrals 1 6.0 0.000 0.000 3.501 3.501 calculate_dm_sparse 17 6.0 0.000 0.000 3.091 3.129 cp_dbcsr_plus_fm_fm_t 17 7.0 0.000 0.000 3.090 3.129 compute_vec_Sigma_x_minus_vxc_ 1 4.0 0.000 0.000 2.842 2.842 mp_sync 8688 11.6 2.434 2.791 2.434 2.791 compute_2c_integrals 1 7.0 0.000 0.000 2.784 2.784 trace_sigma_gw 21 9.0 0.361 0.371 2.498 2.498 dbt_split_copyback 70 10.6 0.919 0.971 2.435 2.497 mp2_ri_gpw_compute_in_copy_3c 6 6.0 0.225 0.232 2.165 2.309 fill_fm_L_from_L_loc_non_block 1 8.0 0.000 0.000 2.274 2.307 convert_to_new_pgrid 2421 14.1 0.030 0.030 2.288 2.291 dbm_copy 1614 15.1 2.258 2.261 2.258 2.261 fill_fm_L_from_L_loc_non_block 1 9.0 2.206 2.237 2.206 2.237 ------------------------------------------------------------------------------- Plot: name="GW_PBE_4benzene_timings_6cpu_1gpu", title="Timings of GW_PBE_4benzene with 6 CPU Cores and 1 GPU", ylabel="time [s]" PlotPoint: plot="GW_PBE_4benzene_timings_6cpu_1gpu", name="rest", label="rest", y=49.06300000000001, yerr=0.0 PlotPoint: plot="GW_PBE_4benzene_timings_6cpu_1gpu", name="dbm_multiply", label="dbm_multiply", y=26.667, yerr=0.0 PlotPoint: plot="GW_PBE_4benzene_timings_6cpu_1gpu", name="dbm_reserve_blocks", label="dbm_reserve_blocks", y=10.389, yerr=0.0 PlotPoint: plot="GW_PBE_4benzene_timings_6cpu_1gpu", name="dbt_reshape", label="dbt_reshape", y=6.543, yerr=0.0 PlotPoint: plot="GW_PBE_4benzene_timings_6cpu_1gpu", name="mp_waitall_2", label="mp_waitall_2", y=6.242, yerr=0.0 PlotPoint: plot="GW_PBE_4benzene_timings_6cpu_1gpu", name="dbt_crop", label="dbt_crop", y=6.109, yerr=0.0 Running RI-HFX_H2O-32.inp with 3 threads and 2 ranks... done. From /workspace/artifacts/RI-HFX_H2O-32_6cpu_1gpu.out: ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.020 0.021 207.710 207.713 qs_forces 1 2.0 0.000 0.000 207.249 207.252 rebuild_ks_matrix 7 6.6 0.000 0.000 201.076 201.080 qs_ks_build_kohn_sham_matrix 7 7.6 0.001 0.001 201.076 201.080 hfx_ks_matrix 7 8.6 0.000 0.000 197.143 197.146 dbt_total 849 11.0 0.008 0.008 145.829 145.831 hfx_ri_update_ks 7 9.6 0.000 0.000 107.477 107.479 hfx_ri_update_ks_Pmat 7 10.6 23.062 23.130 107.473 107.474 qs_energies 1 3.0 0.000 0.000 105.409 105.411 scf_env_do_scf 1 4.0 0.000 0.000 101.987 101.988 qs_ks_update_qs_env_forces 1 3.0 0.000 0.000 101.805 101.807 qs_ks_update_qs_env 8 6.0 0.000 0.000 99.277 99.279 hfx_ri_update_forces 1 7.0 1.020 1.025 89.664 89.665 dbt_contract 207 12.4 0.050 0.050 85.270 85.272 dbt_tas_total 369 13.4 0.073 0.073 71.468 71.469 dbt_tas_multiply 216 13.5 0.001 0.001 68.754 68.755 dbt_copy 423 11.8 0.047 0.048 55.924 56.149 dbt_tas_dbm 216 15.5 0.001 0.001 54.681 54.682 scf_env_do_scf_inner_loop 6 5.0 0.000 0.001 53.431 53.432 hfx_ri_forces_Pmat_3c 1 8.0 3.835 3.861 51.687 51.726 dbm_multiply 216 17.5 51.211 51.276 51.211 51.276 init_scf_loop 2 5.0 0.000 0.000 48.554 48.555 dbt_reshape 175 13.2 18.955 19.180 41.145 41.173 precalc_derivatives 1 8.0 2.346 2.360 32.054 32.054 hfx_ri_update_ks_Pmat_KS 63 11.6 0.001 0.001 30.871 30.871 dbt_tas_mm_2 91 16.5 0.001 0.001 21.992 21.992 mp_waitall_2 1022 16.5 19.523 19.595 19.523 19.595 dbt_tas_reserve_blocks_index 1323 15.4 1.466 1.478 18.605 18.780 dbm_reserve_blocks 1491 16.3 17.834 17.998 17.834 17.998 dbt_tas_mm_3T 77 17.1 0.000 0.000 17.119 17.166 dbt_crop 372 13.7 12.559 12.604 16.378 16.387 dbt_communicate_buffer 175 14.2 0.004 0.004 16.193 16.345 build_3c_derivatives 3 9.0 2.742 2.986 15.813 15.821 hfx_ri_pre_scf_Pmat 1 12.0 0.000 0.000 15.690 15.690 dbt_reserve_blocks_index 889 14.5 0.443 0.449 14.941 14.994 hfx_ri_update_ks_Pmat_Px3C 63 11.6 0.000 0.000 14.872 14.872 dbt_reserve_blocks_index_array 859 13.5 0.007 0.007 14.674 14.721 hfx_ri_update_ks_Pmat_copy_2 63 11.6 0.000 0.000 13.738 13.738 dbt_tas_mm_3N 37 15.4 0.000 0.000 12.811 12.836 dbt_tas_copy 248 12.5 4.747 4.751 8.554 8.689 mp_sync 2901 12.8 7.540 8.291 7.540 8.291 dbcsr_multiply_generic 155 10.8 0.007 0.007 5.948 5.958 multiply_cannon 155 11.8 0.025 0.034 5.672 5.690 multiply_cannon_loop 155 12.8 0.014 0.014 5.560 5.580 dbt_tas_replicate 168 15.1 2.789 2.818 5.347 5.368 hfx_ri_pre_scf_Pmat_int 1 13.0 0.000 0.000 4.750 4.750 hfx_ri_pre_scf_Pmat_copy_2 9 13.0 1.691 1.704 4.163 4.176 ------------------------------------------------------------------------------- Plot: name="RI-HFX_H2O-32_timings_6cpu_1gpu", title="Timings of RI-HFX_H2O-32 with 6 CPU Cores and 1 GPU", ylabel="time [s]" PlotPoint: plot="RI-HFX_H2O-32_timings_6cpu_1gpu", name="rest", label="rest", y=77.12500000000003, yerr=0.0 PlotPoint: plot="RI-HFX_H2O-32_timings_6cpu_1gpu", name="dbm_multiply", label="dbm_multiply", y=51.211, yerr=0.0 PlotPoint: plot="RI-HFX_H2O-32_timings_6cpu_1gpu", name="hfx_ri_update_ks_Pmat", label="hfx_ri_update_ks_Pmat", y=23.062, yerr=0.0 PlotPoint: plot="RI-HFX_H2O-32_timings_6cpu_1gpu", name="mp_waitall_2", label="mp_waitall_2", y=19.523, yerr=0.0 PlotPoint: plot="RI-HFX_H2O-32_timings_6cpu_1gpu", name="dbt_reshape", label="dbt_reshape", y=18.955, yerr=0.0 PlotPoint: plot="RI-HFX_H2O-32_timings_6cpu_1gpu", name="dbm_reserve_blocks", label="dbm_reserve_blocks", y=17.834, yerr=0.0 Running RI-MP2_ammonia.inp with 3 threads and 2 ranks... done. From /workspace/artifacts/RI-MP2_ammonia_6cpu_1gpu.out: ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.010 0.012 110.334 110.334 qs_energies 1 2.0 0.000 0.000 110.124 110.124 mp2_main 1 3.0 0.000 0.000 102.762 102.762 mp2_gpw_main 1 4.0 0.001 0.001 102.345 102.346 mp2_ri_gpw_compute_in 1 5.0 0.514 0.516 59.478 59.503 mp2_ri_gpw_compute_in_loop 1 6.0 0.010 0.010 50.944 50.973 mp2_ri_gpw_compute_en 1 5.0 0.103 0.105 42.806 42.833 mp2_ri_gpw_compute_en_RI_loop 1 6.0 13.102 13.107 40.175 40.175 dbcsr_multiply_generic 2666 8.0 0.132 0.134 25.915 26.628 ao_to_mo_and_store_B_mult_1 1328 7.0 0.009 0.009 24.331 25.044 mp2_eri_3c_integrate_gpw 1328 7.0 0.014 0.014 20.292 20.988 mp2_ri_gpw_compute_en_expansio 1040 7.0 0.733 0.751 16.384 16.411 local_gemm 1040 8.0 15.651 15.659 15.651 15.659 make_m2s 5332 9.0 0.043 0.043 13.886 13.898 make_images 5332 10.0 2.177 2.198 13.720 13.732 multiply_cannon 2666 9.0 0.351 0.356 11.445 12.147 multiply_cannon_loop 2666 10.0 0.156 0.159 10.452 11.150 integrate_v_rspace 1338 8.0 1.008 1.019 10.770 10.855 hybrid_alltoall_any 6683 11.6 9.916 9.922 10.172 10.178 make_images_data 5332 11.0 0.062 0.062 10.064 10.073 grid_integrate_task_list 1338 9.0 8.657 8.745 8.657 8.745 collocate_function 1328 8.0 5.033 5.116 7.827 8.474 fft_wrap_pw1pw2 26668 10.4 0.124 0.124 7.871 8.424 get_2c_integrals 1 6.0 0.005 0.005 8.011 8.019 compute_2c_integrals 1 7.0 0.010 0.012 7.363 7.365 compute_2c_integrals_loop_lm 1 8.0 0.013 0.022 7.232 7.259 mp2_eri_2c_integrate_gpw 1 9.0 1.987 1.989 7.219 7.255 scf_env_do_scf 1 3.0 0.000 0.000 6.259 6.260 scf_env_do_scf_inner_loop 10 4.0 0.001 0.001 6.259 6.260 multiply_cannon_multrec 2676 11.0 2.949 3.289 5.872 6.212 ao_to_mo_and_store_B_E_Ex_1 1328 7.0 4.247 4.252 6.024 6.033 fft_wrap_pw1pw2_20 10647 11.4 0.016 0.016 5.046 5.604 pw_gpu_r3dc1d_3d 13282 12.2 4.457 5.037 4.457 5.037 qs_scf_new_mos 10 5.0 0.000 0.000 4.962 4.966 mp2_ri_gpw_compute_en_ener 1040 7.0 4.883 4.885 4.883 4.885 mp2_ri_gpw_compute_en_comm 221 7.0 1.057 1.060 4.630 4.648 eigensolver 11 5.8 0.001 0.001 3.060 3.061 pw_gpu_c1dr3d_3d 13280 12.7 2.766 2.791 2.766 2.791 mp2_eri_2c_integrate_gpw_pot_l 1328 10.0 0.003 0.003 2.735 2.765 dbcsr_mm_accdrv_process 5392 12.0 1.419 1.421 2.693 2.696 calc_potential_gpw 2656 9.5 0.012 0.012 2.638 2.659 potential_pw2rs 2666 10.0 0.091 0.091 2.576 2.582 mp_sendrecv_dm3 442 8.0 2.519 2.536 2.519 2.536 cp_fm_diag_elpa 11 6.8 0.000 0.000 2.442 2.443 cp_fm_diag_elpa_base 11 7.8 2.365 2.381 2.441 2.441 fft_wrap_pw1pw2_10 15957 11.5 0.016 0.016 2.241 2.247 replicate_iaK_2intgroup 1 6.0 2.095 2.099 2.236 2.241 fill_local_i_aL 884 7.5 2.229 2.229 2.229 2.229 ------------------------------------------------------------------------------- Plot: name="RI-MP2_ammonia_timings_6cpu_1gpu", title="Timings of RI-MP2_ammonia with 6 CPU Cores and 1 GPU", ylabel="time [s]" PlotPoint: plot="RI-MP2_ammonia_timings_6cpu_1gpu", name="rest", label="rest", y=57.97500000000001, yerr=0.0 PlotPoint: plot="RI-MP2_ammonia_timings_6cpu_1gpu", name="local_gemm", label="local_gemm", y=15.651, yerr=0.0 PlotPoint: plot="RI-MP2_ammonia_timings_6cpu_1gpu", name="mp2_ri_gpw_compute_en_RI_loop", label="mp2_ri_gpw_compute_en_RI_loop", y=13.102, yerr=0.0 PlotPoint: plot="RI-MP2_ammonia_timings_6cpu_1gpu", name="hybrid_alltoall_any", label="hybrid_alltoall_any", y=9.916, yerr=0.0 PlotPoint: plot="RI-MP2_ammonia_timings_6cpu_1gpu", name="grid_integrate_task_list", label="grid_integrate_task_list", y=8.657, yerr=0.0 PlotPoint: plot="RI-MP2_ammonia_timings_6cpu_1gpu", name="collocate_function", label="collocate_function", y=5.033, yerr=0.0 Running diag_cu144_broy.inp with 3 threads and 2 ranks... done. From /workspace/artifacts/diag_cu144_broy_6cpu_1gpu.out: ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.072 0.073 190.602 190.605 qs_energies 1 2.0 0.000 0.000 189.431 189.434 scf_env_do_scf 1 3.0 0.000 0.000 175.112 175.115 scf_env_do_scf_inner_loop 15 4.0 0.001 0.002 175.112 175.115 qs_ks_update_qs_env 15 5.0 0.000 0.000 93.288 93.329 rebuild_ks_matrix 15 6.0 0.000 0.000 93.086 93.127 qs_ks_build_kohn_sham_matrix 15 7.0 0.002 0.002 93.086 93.127 qs_vxc_create 15 8.0 0.007 0.014 55.577 55.616 qs_scf_new_mos 15 5.0 0.000 0.000 55.565 55.587 calculate_dispersion_nonloc 15 9.0 10.755 10.776 47.072 47.101 eigensolver 15 6.0 0.002 0.002 46.847 46.924 sum_up_and_integrate 15 8.0 0.000 0.000 35.842 35.847 integrate_v_rspace 15 9.0 0.047 0.047 35.818 35.823 grid_integrate_task_list 15 10.0 34.344 34.385 34.344 34.385 fft_wrap_pw1pw2 1086 10.0 0.024 0.024 32.553 32.643 cp_fm_diag_elpa 15 7.0 0.000 0.000 28.220 28.225 cp_fm_diag_elpa_base 15 8.0 26.448 26.996 28.214 28.214 qs_rho_update_rho_low 16 5.0 0.000 0.000 24.038 24.039 calculate_rho_elec 16 6.0 0.179 0.179 24.038 24.038 fft_wrap_pw1pw2_150 765 11.0 0.004 0.004 22.166 22.239 grid_collocate_task_list 16 7.0 21.279 21.290 21.279 21.290 pw_gpu_c1dr3d_3d_ps 585 12.1 5.660 5.745 17.688 17.704 cp_fm_cholesky_restore 45 7.0 16.781 17.501 16.781 17.501 pw_gpu_r3dc1d_3d_ps 501 11.9 4.741 4.878 14.837 14.910 qs_energies_init_hamiltonians 1 3.0 0.000 0.000 10.698 10.699 vdW_energy 15 10.0 10.599 10.637 10.599 10.637 fft_wrap_pw1pw2_200 197 11.3 0.001 0.001 9.805 9.850 build_core_hamiltonian_matrix 1 4.0 0.000 0.000 9.443 9.454 xc_vxc_pw_create 15 9.0 0.177 0.181 8.497 8.501 mp_alltoall_z22v 1086 14.0 6.748 6.864 6.748 6.864 copy_dbcsr_to_fm 16 5.9 0.001 0.001 5.756 5.816 dbcsr_complete_redistribute 46 8.3 1.718 1.726 5.199 5.302 xc_pw_derive 90 11.0 0.001 0.001 5.026 5.052 xc_rho_set_and_dset_create 15 10.0 0.131 0.132 4.973 4.991 build_core_ppnl 1 5.0 4.952 4.981 4.952 4.981 cp_fm_uplo_to_full 30 8.0 3.610 4.796 3.610 4.796 x_to_yz 585 13.1 1.073 1.079 4.734 4.786 pw_gpu_sf 585 13.1 4.163 4.167 4.163 4.167 gspace_mixing 14 5.0 0.134 0.135 4.153 4.153 yz_to_x 501 12.9 0.880 0.883 3.967 4.033 pw_gpu_fg 501 12.9 3.906 3.914 3.906 3.914 ------------------------------------------------------------------------------- Plot: name="diag_cu144_broy_timings_6cpu_1gpu", title="Timings of diag_cu144_broy with 6 CPU Cores and 1 GPU", ylabel="time [s]" PlotPoint: plot="diag_cu144_broy_timings_6cpu_1gpu", name="rest", label="rest", y=80.995, yerr=0.0 PlotPoint: plot="diag_cu144_broy_timings_6cpu_1gpu", name="grid_integrate_task_list", label="grid_integrate_task_list", y=34.344, yerr=0.0 PlotPoint: plot="diag_cu144_broy_timings_6cpu_1gpu", name="cp_fm_diag_elpa_base", label="cp_fm_diag_elpa_base", y=26.448, yerr=0.0 PlotPoint: plot="diag_cu144_broy_timings_6cpu_1gpu", name="grid_collocate_task_list", label="grid_collocate_task_list", y=21.279, yerr=0.0 PlotPoint: plot="diag_cu144_broy_timings_6cpu_1gpu", name="cp_fm_cholesky_restore", label="cp_fm_cholesky_restore", y=16.781, yerr=0.0 PlotPoint: plot="diag_cu144_broy_timings_6cpu_1gpu", name="calculate_dispersion_nonloc", label="calculate_dispersion_nonloc", y=10.755, yerr=0.0 Running bench_dftb.inp with 3 threads and 2 ranks... done. From /workspace/artifacts/bench_dftb_6cpu_1gpu.out: ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.044 0.044 257.602 257.604 qs_energies 1 2.0 0.000 0.000 257.490 257.492 ls_scf 1 3.0 0.000 0.000 256.725 256.725 ls_scf_main 1 4.0 0.001 0.001 247.099 247.101 density_matrix_trs4 11 5.0 0.006 0.006 208.098 208.105 dbcsr_multiply_generic 185 6.1 0.148 0.160 174.532 174.615 multiply_cannon 185 7.1 1.701 1.929 122.126 122.323 multiply_cannon_loop 185 8.1 0.306 0.307 107.715 107.767 multiply_cannon_multrec 370 9.1 82.332 82.557 92.631 92.815 make_m2s 370 7.1 0.026 0.026 44.371 44.377 make_images 370 8.1 10.516 10.790 43.330 43.337 ls_scf_dm_to_ks 11 5.0 0.000 0.000 35.171 35.179 matrix_ls_to_qs 11 6.0 0.000 0.000 32.896 32.980 dbcsr_complete_redistribute 23 7.5 19.274 19.373 27.753 27.794 matrix_decluster 11 7.0 0.000 0.000 25.365 25.403 arnoldi_extremal 12 6.1 0.000 0.000 20.813 20.814 arnoldi_normal_ev 12 7.1 0.051 0.051 20.813 20.814 build_subspace 23 8.1 0.063 0.063 20.335 20.335 dbcsr_matrix_vector_mult 652 9.0 0.136 0.136 18.802 18.836 dbcsr_matrix_vector_mult_local 652 10.0 17.915 17.949 17.921 17.955 make_images_data 370 9.1 0.011 0.011 17.106 17.337 hybrid_alltoall_any 393 9.9 11.701 11.852 16.473 16.696 dbcsr_finalize 559 7.6 0.150 0.158 14.216 14.613 calculate_norms 740 9.1 13.958 14.074 13.958 14.074 dbcsr_merge_all 510 8.6 2.588 2.881 13.096 13.483 setup_rec_index_2d 370 8.1 9.784 9.789 9.784 9.789 dbcsr_special_finalize 555 9.1 0.009 0.009 9.336 9.338 dbcsr_sort_indices 1283 10.0 9.253 9.307 9.253 9.307 dbcsr_copy 761 7.5 1.705 1.727 9.252 9.278 dbcsr_add_d 280 6.0 0.001 0.001 7.997 8.306 dbcsr_add_anytype 280 7.0 3.491 3.499 7.996 8.305 dbcsr_mm_accdrv_process 14501 10.0 0.791 0.801 8.180 8.198 ls_scf_init_scf 1 4.0 0.000 0.000 8.095 8.098 ls_scf_init_matrix_S 1 5.0 0.000 0.000 7.874 7.876 dbcsr_copy_into_existing 11 8.0 7.529 7.576 7.530 7.576 dbcsr_mm_accdrv_process_sort 14501 11.0 7.389 7.397 7.389 7.397 matrix_sqrt_Newton_Schulz 1 6.0 0.000 0.000 7.127 7.127 tree_to_linear_d 23 10.5 7.043 7.050 7.043 7.050 dbcsr_dot 144 6.3 5.846 5.872 6.347 6.459 dbcsr_merge_single_wm 370 10.1 0.499 0.510 5.915 5.936 mp_waitall_1 5192 10.5 5.349 5.747 5.349 5.747 ------------------------------------------------------------------------------- Plot: name="bench_dftb_timings_6cpu_1gpu", title="Timings of bench_dftb with 6 CPU Cores and 1 GPU", ylabel="time [s]" PlotPoint: plot="bench_dftb_timings_6cpu_1gpu", name="rest", label="rest", y=112.422, yerr=0.0 PlotPoint: plot="bench_dftb_timings_6cpu_1gpu", name="multiply_cannon_multrec", label="multiply_cannon_multrec", y=82.332, yerr=0.0 PlotPoint: plot="bench_dftb_timings_6cpu_1gpu", name="dbcsr_complete_redistribute", label="dbcsr_complete_redistribute", y=19.274, yerr=0.0 PlotPoint: plot="bench_dftb_timings_6cpu_1gpu", name="dbcsr_matrix_vector_mult_local", label="dbcsr_matrix_vector_mult_local", y=17.915, yerr=0.0 PlotPoint: plot="bench_dftb_timings_6cpu_1gpu", name="calculate_norms", label="calculate_norms", y=13.958, yerr=0.0 PlotPoint: plot="bench_dftb_timings_6cpu_1gpu", name="hybrid_alltoall_any", label="hybrid_alltoall_any", y=11.701, yerr=0.0 Running dbcsr.inp with 3 threads and 2 ranks... done. From /workspace/artifacts/dbcsr_6cpu_1gpu.out: ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.004 0.005 48.779 48.779 lib_test 1 2.0 0.000 0.000 48.773 48.774 dbcsr_run_tests 3 3.0 0.000 0.000 48.773 48.773 test_multiplies_multiproc 3 4.0 0.001 0.001 38.221 38.254 dbcsr_multiply_generic 9 5.0 0.002 0.002 30.353 30.353 multiply_cannon 9 6.0 0.173 0.173 21.114 21.330 multiply_cannon_loop 9 7.0 0.003 0.004 19.899 20.122 multiply_cannon_multrec 18 8.0 9.685 9.955 17.850 18.057 dbcsr_make_random_matrix 9 4.0 7.292 7.328 10.424 10.459 dbcsr_mm_accdrv_process 8199 9.0 2.306 2.682 7.926 7.990 dbcsr_finalize 27 5.7 0.001 0.001 7.121 7.137 dbcsr_merge_all 18 6.5 3.506 3.521 6.981 6.994 dbcsr_redistribute 9 5.0 3.251 3.263 5.394 5.402 dbcsr_mm_accdrv_process_sort 8199 10.0 4.875 4.886 4.875 4.886 make_m2s 18 6.0 0.001 0.001 4.812 4.818 make_images 18 7.0 0.345 0.347 4.778 4.784 make_images_data 18 8.0 0.000 0.000 2.747 2.752 hybrid_alltoall_any 18 9.0 2.357 2.363 2.713 2.718 mp_alltoall_d11v 27 6.0 1.846 1.855 1.846 1.855 tree_to_linear_d 9 7.0 1.791 1.794 1.791 1.794 dbcsr_data_copy_aa2 18 7.5 1.552 1.553 1.552 1.553 dbcsr_data_release 507 7.7 1.457 1.464 1.457 1.464 jit_kernel_multiply 4 10.0 0.745 1.195 0.745 1.195 dbcsr_checksum 6 5.0 0.989 0.993 0.997 0.997 ------------------------------------------------------------------------------- Plot: name="dbcsr_timings_6cpu_1gpu", title="Timings of dbcsr with 6 CPU Cores and 1 GPU", ylabel="time [s]" PlotPoint: plot="dbcsr_timings_6cpu_1gpu", name="rest", label="rest", y=20.17, yerr=0.0 PlotPoint: plot="dbcsr_timings_6cpu_1gpu", name="multiply_cannon_multrec", label="multiply_cannon_multrec", y=9.685, yerr=0.0 PlotPoint: plot="dbcsr_timings_6cpu_1gpu", name="dbcsr_make_random_matrix", label="dbcsr_make_random_matrix", y=7.292, yerr=0.0 PlotPoint: plot="dbcsr_timings_6cpu_1gpu", name="dbcsr_mm_accdrv_process_sort", label="dbcsr_mm_accdrv_process_sort", y=4.875, yerr=0.0 PlotPoint: plot="dbcsr_timings_6cpu_1gpu", name="dbcsr_merge_all", label="dbcsr_merge_all", y=3.506, yerr=0.0 PlotPoint: plot="dbcsr_timings_6cpu_1gpu", name="dbcsr_redistribute", label="dbcsr_redistribute", y=3.251, yerr=0.0 Running MQAE_single_node.inp with 3 threads and 2 ranks... done. From /workspace/artifacts/MQAE_single_node_6cpu_1gpu.out: ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.051 0.053 204.089 204.091 qs_mol_dyn_low 1 2.0 0.005 0.006 202.434 202.470 qs_forces 6 3.8 0.001 0.001 130.943 130.945 qs_energies 6 4.8 0.000 0.000 123.581 123.582 scf_env_do_scf 6 5.8 0.000 0.001 114.387 114.388 scf_env_do_scf_inner_loop 113 6.2 0.005 0.008 105.896 105.897 velocity_verlet 5 3.0 0.003 0.003 93.786 93.843 rebuild_ks_matrix 119 8.1 0.000 0.000 89.448 89.448 qs_ks_build_kohn_sham_matrix 119 9.1 0.016 0.016 89.447 89.448 qs_ks_update_qs_env 119 7.3 0.001 0.001 84.349 84.349 fft_wrap_pw1pw2 2059 12.4 0.041 0.041 64.979 65.040 fft_wrap_pw1pw2_150 1321 13.9 0.008 0.008 62.880 62.938 qs_vxc_create 119 10.1 0.002 0.002 55.711 55.712 xc_vxc_pw_create 119 11.1 1.523 1.526 55.709 55.710 xc_pw_derive 714 13.1 0.009 0.009 39.197 39.202 qmmm_el_coupling 6 3.8 0.000 0.000 37.066 37.075 qmmm_elec_with_gaussian 6 4.8 0.019 0.019 37.060 37.069 qmmm_elec_with_gaussian_low 6 5.8 0.000 0.000 35.774 35.883 pw_gpu_c1dr3d_3d_ps 1095 14.8 10.613 10.734 34.957 35.025 qmmm_forces 6 3.8 0.001 0.001 31.297 31.297 qmmm_elec_gaussian_low_G 6 6.8 30.973 31.066 30.973 31.066 qmmm_forces_with_gaussian 6 4.8 0.022 0.023 30.514 30.625 pw_gpu_r3dc1d_3d_ps 964 14.0 9.351 9.474 29.969 29.977 qmmm_force_with_gaussian_low 6 5.8 0.000 0.000 29.178 29.290 xc_rho_set_and_dset_create 119 12.1 2.463 2.467 27.983 28.007 xc_pw_divergence 119 12.1 0.005 0.005 25.796 25.814 qmmm_forces_gaussian_low_G 6 6.8 24.139 24.213 24.139 24.213 qs_rho_update_rho_low 119 7.3 0.001 0.001 20.450 20.666 calculate_rho_elec 119 8.3 1.082 1.083 20.449 20.665 sum_up_and_integrate 119 10.1 0.002 0.002 16.147 16.183 density_rs2pw 119 9.3 0.006 0.007 15.808 16.013 integrate_v_rspace 119 11.1 0.020 0.020 15.976 16.011 dbcsr_multiply_generic 2588 12.3 0.090 0.090 15.516 15.683 multiply_cannon 2588 13.3 0.212 0.213 14.024 14.209 multiply_cannon_loop 2588 14.3 0.212 0.214 13.579 13.762 mp_alltoall_z22v 2059 16.4 13.050 13.192 13.050 13.192 multiply_cannon_multrec 5176 15.3 4.113 4.176 10.044 10.075 potential_pw2rs 119 12.1 0.033 0.033 9.618 9.620 x_to_yz 1095 15.8 2.308 2.314 9.440 9.503 qs_ks_ddapc 119 10.1 0.002 0.002 9.239 9.271 pw_gpu_sf 1095 15.8 8.574 8.601 8.574 8.601 init_scf_loop 6 6.8 0.000 0.000 8.488 8.488 pw_gpu_fg 964 15.0 8.023 8.076 8.023 8.076 yz_to_x 964 15.0 1.802 1.808 7.720 7.800 qs_scf_new_mos 113 7.2 0.001 0.001 7.607 7.610 qs_scf_loop_do_ot 113 8.2 0.001 0.001 7.606 7.610 ot_scf_mini 113 9.2 0.002 0.002 7.322 7.323 pw_derive 1089 13.4 7.228 7.257 7.228 7.257 init_scf_run 6 5.8 0.000 0.001 7.095 7.095 scf_env_initial_rho_setup 6 6.8 0.000 0.000 7.095 7.095 grid_integrate_task_list 119 12.1 6.338 6.374 6.338 6.374 pw_gpu_ffc 1095 15.8 6.315 6.351 6.315 6.351 pw_poisson_solve 125 9.9 0.003 0.003 5.948 5.957 dbcsr_mm_accdrv_process 13832 16.0 1.497 2.568 5.861 5.893 jit_kernel_multiply 24 14.7 4.330 5.435 4.330 5.435 calculate_first_density_matrix 1 7.0 0.000 0.000 5.140 5.353 xc_functional_eval 238 13.1 0.002 0.002 5.211 5.219 qs_ks_update_qs_env_forces 6 4.8 0.000 0.000 5.124 5.124 qmmm_forces_gaussian_low_R 6 6.8 0.000 0.000 5.039 5.077 qmmm_forces_with_gaussian_LG 6 7.8 5.039 5.077 5.039 5.077 ot_mini 113 10.2 0.001 0.001 4.962 4.963 qmmm_elec_gaussian_low_R 6 6.8 0.000 0.000 4.801 4.818 qmmm_elec_with_gaussian_LG 6 7.8 4.801 4.818 4.801 4.818 pw_gpu_cff 964 15.0 4.813 4.815 4.813 4.815 qs_ot_get_derivative 113 11.2 0.001 0.001 4.147 4.150 ------------------------------------------------------------------------------- Plot: name="MQAE_single_node_timings_6cpu_1gpu", title="Timings of MQAE_single_node with 6 CPU Cores and 1 GPU", ylabel="time [s]" PlotPoint: plot="MQAE_single_node_timings_6cpu_1gpu", name="rest", label="rest", y=115.96300000000001, yerr=0.0 PlotPoint: plot="MQAE_single_node_timings_6cpu_1gpu", name="qmmm_elec_gaussian_low_G", label="qmmm_elec_gaussian_low_G", y=30.973, yerr=0.0 PlotPoint: plot="MQAE_single_node_timings_6cpu_1gpu", name="qmmm_forces_gaussian_low_G", label="qmmm_forces_gaussian_low_G", y=24.139, yerr=0.0 PlotPoint: plot="MQAE_single_node_timings_6cpu_1gpu", name="mp_alltoall_z22v", label="mp_alltoall_z22v", y=13.05, yerr=0.0 PlotPoint: plot="MQAE_single_node_timings_6cpu_1gpu", name="pw_gpu_c1dr3d_3d_ps", label="pw_gpu_c1dr3d_3d_ps", y=10.613, yerr=0.0 PlotPoint: plot="MQAE_single_node_timings_6cpu_1gpu", name="pw_gpu_r3dc1d_3d_ps", label="pw_gpu_r3dc1d_3d_ps", y=9.351, yerr=0.0 Summary: Performance test took 22 minutes. Status: OK Removing intermediate container 3bf264c97c23 ---> ab4a3a8b5e0b Step 47/48 : CMD cat $(find ./report.log -mmin +10) | sed '/^Summary:/ s/$/ (cached)/' ---> Running in 9145ccd57ad5 Removing intermediate container 9145ccd57ad5 ---> 54133bb9c074 Step 48/48 : ENTRYPOINT [] ---> Running in dd36a349af65 Removing intermediate container dd36a349af65 ---> 2d8722c0d91f [Warning] One or more build-args [GIT_COMMIT_SHA SPACK_CACHE] were not consumed Successfully built 2d8722c0d91f Successfully tagged us-central1-docker.pkg.dev/cp2k-org-project/cp2kci/img_cp2k-perf-cuda-volta:master Pushing new image... done. #################### Running Image cp2k-perf-cuda-volta #################### Uploading artifacts... done EndDate: 2025-07-02 09:00:44+00:00