From 98b873b41a35b0a18415493434af075bedf612bd Mon Sep 17 00:00:00 2001 From: Aditya Sadawarte Date: Wed, 9 Oct 2024 18:01:43 +0000 Subject: [PATCH 01/32] FIX: added omp declare target to sbc_phy.f90 --- examples/nemo/scripts/omp_gpu_trans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index 6a83269a1e..b5810afa08 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -120,7 +120,7 @@ def trans(psyir): ) # For performance in lib_fortran, mark serial routines as GPU-enabled - if psyir.name == "lib_fortran.f90": + if psyir.name == "lib_fortran.f90" or psyir.name == "sbc_phy.f90": if not subroutine.walk(Loop): try: # We need the 'force' option. From f8560d35c47c5b77437cefbc8ff35ed0e9d576a4 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Wed, 27 Nov 2024 13:16:23 +0000 Subject: [PATCH 02/32] #2671 Update NEMO OpenMP GPU script with array privatisation --- examples/nemo/scripts/omp_gpu_trans.py | 57 ++++++++++++++++++-------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index 1891a230a7..73cb448bc4 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -39,18 +39,36 @@ from utils import ( insert_explicit_loop_parallelism, normalise_loops, add_profiling, - enhance_tree_information, NOT_PERFORMANT) -from psyclone.psyGen import TransInfo + enhance_tree_information, PASSTHROUGH_ISSUES, PARALLELISATION_ISSUES) from psyclone.psyir.nodes import ( Loop, Routine, Directive, Assignment, OMPAtomicDirective) from psyclone.psyir.transformations import OMPTargetTrans -from psyclone.transformations import OMPDeclareTargetTrans, TransformationError +from psyclone.transformations import ( + OMPLoopTrans, OMPDeclareTargetTrans, TransformationError) PROFILING_ENABLED = False # List of all files that 
psyclone will skip processing -FILES_TO_SKIP = NOT_PERFORMANT - +FILES_TO_SKIP = PASSTHROUGH_ISSUES + [ + "lib_mpp.f90", # Compiler Error: Illegal substring expression + "prtctl.f90", # Compiler Error: Illegal substring expression + "sbcblk.f90", # Compiler Error: Vector expression used where scalar + # expression required + "diadct.f90", # Compiler Error: Wrong number of arguments in reshape + "stpctl.f90", + "lbcnfd.f90", + "flread.f90", +] + +OFFLOADING_ISSUES = [ + "tranxt.f90", # String comparison not allowed inside omp teams (this worked fine with omp loop) + "trazdf.f90", # String comparison not allowed inside omp teams (this worked fine with omp loop) + "crsdom.f90", # String comparison not allowed inside omp teams (this worked fine with omp loop) +] + +PRIVATISATION_ISSUES = [ + "ldftra.f90", # Wrong runtime results +] def trans(psyir): ''' Add OpenMP Target and Loop directives to all loops, including the @@ -62,13 +80,15 @@ def trans(psyir): ''' omp_target_trans = OMPTargetTrans() - omp_loop_trans = TransInfo().get_trans_name('OMPLoopTrans') - omp_loop_trans.omp_directive = "loop" + omp_loop_trans = OMPLoopTrans(omp_schedule="none") + omp_loop_trans.omp_directive = "teamsdistributeparalleldo" - # TODO #2317: Has structure accesses that can not be offloaded and has - # a problematic range to loop expansion of (1:1) + # Many of the obs_ files have problems to be offloaded to the GPU if psyir.name.startswith("obs_"): - print("Skipping file", psyir.name) + return + + # ICE routines do not perform well on GPU, so we skip them + if psyir.name.startswith("ice"): return for subroutine in psyir.walk(Routine): @@ -76,7 +96,7 @@ def trans(psyir): if PROFILING_ENABLED: add_profiling(subroutine.children) - print(f"Transforming subroutine: {subroutine.name}") + print(f"Adding OpenMP offloading to subroutine: {subroutine.name}") enhance_tree_information(subroutine) @@ -123,10 +143,11 @@ def trans(psyir): parent.addchild(atomic) continue - 
insert_explicit_loop_parallelism( - subroutine, - region_directive_trans=omp_target_trans, - loop_directive_trans=omp_loop_trans, - # Collapse is necessary to give GPUs enough parallel items - collapse=True - ) + if psyir.name not in PARALLELISATION_ISSUES + OFFLOADING_ISSUES: + insert_explicit_loop_parallelism( + subroutine, + region_directive_trans=omp_target_trans, + loop_directive_trans=omp_loop_trans, + collapse=True, + privatise_arrays=(psyir.name not in PRIVATISATION_ISSUES) + ) From 7ed6313dc7045781a3cdea3f7c4b763dd1cf7eca Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Fri, 13 Dec 2024 12:55:14 +0000 Subject: [PATCH 03/32] Add more NEMO GPU exclusions and NEMO_FUNCTIONS values --- examples/nemo/scripts/omp_gpu_trans.py | 44 +++++++---- examples/nemo/scripts/utils.py | 87 +++++++++++++++++++--- src/psyclone/psyir/nodes/omp_directives.py | 2 + 3 files changed, 110 insertions(+), 23 deletions(-) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index 73cb448bc4..f5192db6e6 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -37,6 +37,7 @@ ''' PSyclone transformation script showing the introduction of OpenMP for GPU directives into Nemo code. 
''' +import os from utils import ( insert_explicit_loop_parallelism, normalise_loops, add_profiling, enhance_tree_information, PASSTHROUGH_ISSUES, PARALLELISATION_ISSUES) @@ -50,20 +51,24 @@ # List of all files that psyclone will skip processing FILES_TO_SKIP = PASSTHROUGH_ISSUES + [ - "lib_mpp.f90", # Compiler Error: Illegal substring expression - "prtctl.f90", # Compiler Error: Illegal substring expression - "sbcblk.f90", # Compiler Error: Vector expression used where scalar + "iom.f90", + "iom_nf90.f90", + "iom_def.f90", + "timing.f90", # Compiler error: Subscript, substring, or argument illegal + "lbcnfd.f90", # Illegal address during kernel execution - line 1012: lbc_nfd_dp + "lib_mpp.f90", # Compiler error: Illegal substring expression + "prtctl.f90", # Compiler error: Illegal substring expression + "sbcblk.f90", # Compiler error: Vector expression used where scalar # expression required - "diadct.f90", # Compiler Error: Wrong number of arguments in reshape - "stpctl.f90", - "lbcnfd.f90", - "flread.f90", ] OFFLOADING_ISSUES = [ - "tranxt.f90", # String comparison not allowed inside omp teams (this worked fine with omp loop) - "trazdf.f90", # String comparison not allowed inside omp teams (this worked fine with omp loop) - "crsdom.f90", # String comparison not allowed inside omp teams (this worked fine with omp loop) + "trcrad.f90", # Illegal address during kernel execution, unless the dimensions are small + "tranxt.f90", # String comparison not allowed inside omp teams (this worked fine with omp loop) + "trazdf.f90", # String comparison not allowed inside omp teams (this worked fine with omp loop) + "crsdom.f90", # String comparison not allowed inside omp teams (this worked fine with omp loop) + "zdftke.f90", # returned error 700 (CUDA_ERROR_ILLEGAL_ADDRESS): Illegal address during kernel execution + "dynzdf.f90", # returned error 700 (CUDA_ERROR_ILLEGAL_ADDRESS): Illegal address during kernel execution ] PRIVATISATION_ISSUES = [ @@ -79,9 +84,13 @@ def 
trans(psyir): :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer` ''' + # if psyir.name not in (os.environ['ONLY_FILE'], "lib_fortran.f90"): + # return omp_target_trans = OMPTargetTrans() - omp_loop_trans = OMPLoopTrans(omp_schedule="none") - omp_loop_trans.omp_directive = "teamsdistributeparalleldo" + omp_gpu_loop_trans = OMPLoopTrans(omp_schedule="none") + omp_gpu_loop_trans.omp_directive = "teamsdistributeparalleldo" + omp_cpu_loop_trans = OMPLoopTrans(omp_schedule="static") + omp_cpu_loop_trans.omp_directive = "paralleldo" # Many of the obs_ files have problems to be offloaded to the GPU if psyir.name.startswith("obs_"): @@ -129,7 +138,7 @@ def trans(psyir): if loop.ancestor(Directive): continue try: - omp_loop_trans.apply(loop, options={"force": True}) + omp_gpu_loop_trans.apply(loop, options={"force": True}) except TransformationError: continue omp_target_trans.apply(loop.parent.parent) @@ -147,7 +156,14 @@ def trans(psyir): insert_explicit_loop_parallelism( subroutine, region_directive_trans=omp_target_trans, - loop_directive_trans=omp_loop_trans, + loop_directive_trans=omp_gpu_loop_trans, collapse=True, privatise_arrays=(psyir.name not in PRIVATISATION_ISSUES) ) + elif psyir.name not in PARALLELISATION_ISSUES: + # This have issues offloading, but we can still do OpenMP threading + insert_explicit_loop_parallelism( + subroutine, + loop_directive_trans=omp_cpu_loop_trans, + privatise_arrays=(psyir.name not in PRIVATISATION_ISSUES) + ) diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index fcc9c543e7..5b22029c1d 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -70,14 +70,76 @@ # Currently fparser has no way of distinguishing array accesses from # function calls if the symbol is imported from some other module. -# We therefore work-around this by keeping a list of known NEMO functions. 
-NEMO_FUNCTIONS = ["alpha_charn", "cd_neutral_10m", "cpl_freq", "cp_air", - "eos_pt_from_ct", "gamma_moist", "l_vap", "q_air_rh", - "sbc_dcy", "solfrac", "psi_h", "psi_m", "psi_m_coare", - "psi_h_coare", "psi_m_ecmwf", "psi_h_ecmwf", "q_sat", - "rho_air", "visc_air", "sbc_dcy", "glob_sum", - "glob_sum_full", "ptr_sj", "ptr_sjk", "interp1", "interp2", - "interp3", "integ_spline", "nf90_put_var"] +# We therefore work-around this by keeping a list of known NEMO functions +# from v4 and v5. +NEMO_FUNCTIONS = [ + # Internal funtions can be obtained with: + # $ grep -rhi "end function" src/ | awk '{print $3}' | uniq | sort + 'abl_alloc', 'add_xxx', 'Agrif_CFixed', 'agrif_external_switch_index', + 'Agrif_Fixed', 'agrif_oce_alloc', 'Agrif_Root', 'alfa_charn', 'alngam', + 'alpha_sw_sclr', 'alpha_sw_vctr', 'arr_hls', 'arr_lbnd', 'arr_lbnd_2d_dp', + 'arr_lbnd_2d_i', 'arr_lbnd_2d_sp', 'arr_lbnd_3d_dp', 'arr_lbnd_3d_i', + 'arr_lbnd_3d_sp', 'arr_lbnd_4d_dp', 'arr_lbnd_4d_i', 'arr_lbnd_4d_sp', + 'arr_lbnd_5d_dp', 'arr_lbnd_5d_i', 'arr_lbnd_5d_sp', 'atg', + 'bdy_oce_alloc', 'bdy_segs_surf', 'Cd_from_z0', 'CdN10_f_LU12', + 'CdN10_f_LU13', 'cd_n10_ncar', 'cd_neutral_10m', 'CdN_f_LG15', + 'CdN_f_LG15_light', 'CdN_f_LU12_eq36', 'ce_n10_ncar', 'charn_coare3p0', + 'charn_coare3p6', 'charn_coare3p6_wave', 'check_hdom', 'ch_n10_ncar', + 'cp_air', 'cp_air_sclr', 'cp_air_vctr', 'cpl_freq', 'crs_dom_alloc', + 'crs_dom_alloc2', 'dayjul', 'def_newlink', 'delta_skin_layer', + 'depth', 'dep_to_p', 'de_sat_dt_ice_sclr', 'de_sat_dt_ice_vctr', + 'dia_ar5_alloc', 'diadct_alloc', 'dia_hth_alloc', 'dia_ptr_alloc', + 'dia_wri_alloc', 'dom_oce_alloc', 'dom_vvl_alloc', 'dq_sat_dt_ice_sclr', + 'dq_sat_dt_ice_vctr', 'dyn_dmp_alloc', 'dyn_ldf_iso_alloc', + 'dyn_spg_ts_alloc', 'eos_pt_from_ct', 'e_sat_ice_sclr', 'e_sat_ice_vctr', + 'e_sat_sclr', 'e_sat_vctr', 'exa_mpl_alloc', 'f_h_louis_sclr', + 'f_h_louis_vctr', 'find_link', 'fintegral', 'fld_filename', + 'flo_dom_alloc', 'flo_dstnce', 'flo_oce_alloc', 
'flo_rst_alloc', + 'flo_wri_alloc', 'f_m_louis_sclr', 'f_m_louis_vctr', 'frac_solar_abs', + 'fspott', 'FUNCTION_GLOBMINMAX', 'FUNCTION_GLOBSUM', 'gamain', + 'gamma_moist', 'gamma_moist_sclr', 'gamma_moist_vctr', 'get_unit', + 'grt_cir_dis', 'grt_cir_dis_saa', 'icb_alloc', 'icb_utl_bilin', + 'icb_utl_bilin_2d_h', 'icb_utl_bilin_3d_h', 'icb_utl_bilin_e', + 'icb_utl_bilin_h', 'icb_utl_bilin_x',' icb_utl_count', 'icb_utl_heat', + 'icb_utl_mass', 'icb_utl_yearday', 'ice1D_alloc', 'ice_alloc', + 'ice_dia_alloc', 'ice_dyn_rdgrft_alloc', 'ice_perm_eff', + 'ice_thd_pnd_alloc', 'ice_update_alloc', 'ice_var_sshdyn', 'in_hdom', + 'integ_spline', 'interp', 'interp1', 'interp2', 'interp3', + 'iom_axis', 'iom_getszuld', 'iom_nf90_varid', 'iom_sdate', 'iom_use', + 'iom_varid', 'iom_xios_setid', 'iscpl_alloc', 'is_tile', 'kiss', + 'ksec_week', 'lib_mpp_alloc', 'linquad', 'L_vap', 'L_vap_sclr', + 'L_vap_vctr', 'm', 'maxdist', 'mynode', 'nblinks', 'nodal_factort', + 'oce_alloc', 'oce_SWE_alloc', 'One_on_L', 'p2z_exp_alloc', + 'p2z_lim_alloc', 'p2z_prod_alloc', 'p4z_che_alloc', 'p4z_diaz_alloc', + 'p4z_flx_alloc', 'p4z_lim_alloc', 'p4z_meso_alloc', 'p4z_opt_alloc', + 'p4z_prod_alloc', 'p4z_rem_alloc', 'p4z_sed_alloc', 'p4z_sink_alloc', + 'p5z_lim_alloc', 'p5z_meso_alloc', 'p5z_prod_alloc', + 'PHI', 'potemp', 'pres_temp_sclr', 'pres_temp_vctr', 'prt_ctl_sum_2d', + 'prt_ctl_sum_3d', 'prt_ctl_write_sum', 'psi_h', 'psi_h_andreas', + 'psi_h_coare', 'psi_h_ecmwf', 'psi_h_ice', 'psi_h_mfs', 'psi_h_ncar', + 'psi_m', 'psi_m_andreas', 'psi_m_coare', 'psi_m_ecmwf', 'psi_m_ice', + 'psi_m_mfs', 'psi_m_ncar', 'p_to_dep', 'ptr_ci_2d', 'ptr_sj_2d', + 'ptr_sj_3d', 'ptr_sjk', 'q_air_rh', 'qlw_net_sclr', 'qlw_net_vctr', + 'q_sat', 'q_sat_sclr', 'q_sat_vctr', 'qsr_ext_lev', 'rho_air', + 'rho_air_sclr', 'rho_air_vctr', 'Ri_bulk', 'Ri_bulk_sclr', 'Ri_bulk_vctr', + 'rough_leng_m', 'rough_leng_tq', 's', 'sbc_blk_alloc', 'sbc_blk_ice_alloc', + 'sbc_cpl_alloc', 'sbc_dcy', 'sbc_dcy_alloc', 'sbc_ice_alloc', + 
'sbc_ice_cice_alloc', 'sbc_oce_alloc', 'sbc_rnf_alloc', + 'sbc_ssr_alloc', 'sed_adv_alloc', 'sed_alloc', 'sed_oce_alloc', + 'sms_c14_alloc', 'sms_pisces_alloc', 'snw_ent', 'solfrac', + 'sto_par_flt_fac', 'sum2d', 'sw_adtg', 'sw_ptmp', 'theta', + 'theta_exner_sclr', 'theta_exner_vctr', 't_imp', 'tra_bbl_alloc', + 'tra_dmp_alloc', 'trc_alloc', 'trc_dmp_alloc', 'trc_dmp_sed_alloc', + 'trc_oce_alloc', 'trc_oce_ext_lev', 'trc_opt_alloc', 'trc_sms_cfc_alloc', + 'trc_sms_my_trc_alloc', 'trc_sub_alloc', 'trd_ken_alloc', 'trd_mxl_alloc', + 'trdmxl_oce_alloc', 'trd_mxl_trc_alloc', 'trd_pen_alloc', 'trd_tra_alloc', + 'trd_trc_oce_alloc', 'trd_vor_alloc', 'twrk_id', 'UN10_from_CD', + 'UN10_from_ustar', 'u_star_andreas', 'virt_temp_sclr', 'virt_temp_vctr', + 'visc_air', 'visc_air_sclr', 'visc_air_vctr', 'w1', 'w2', 'z0_from_Cd', + 'z0tq_LKB', 'zdf_gls_alloc', 'zdf_iwm_alloc', 'zdf_mfc_alloc', + 'zdf_mxl_alloc', 'zdf_oce_alloc', 'zdf_osm_alloc', 'zdf_phy_alloc', + 'zdf_tke_alloc', 'zdf_tmx_alloc', +] # Currently fparser has no way of distinguishing array accesses from statement # functions, the following subroutines contains known statement functions @@ -160,7 +222,14 @@ def enhance_tree_information(schedule): ArrayType.Extent.ATTRIBUTE, ArrayType.Extent.ATTRIBUTE, ArrayType.Extent.ATTRIBUTE])) - elif reference.symbol.name in NEMO_FUNCTIONS: + elif (reference.symbol.name in NEMO_FUNCTIONS or + reference.symbol.name.startswith('local_') or + reference.symbol.name.startswith('glob_') or + reference.symbol.name.startswith('SIGN_') or + reference.symbol.name.startswith('netcdf_') or + (reference.symbol.name.startswith('nf90_') and not + reference.symbol.name in ['nf90_64bit_offset', 'nf90_short', + 'nf90_clobber'])): if reference.symbol.is_import or reference.symbol.is_unresolved: # The parser gets these wrong, they are Calls not ArrayRefs if not isinstance(reference.symbol, RoutineSymbol): diff --git a/src/psyclone/psyir/nodes/omp_directives.py 
b/src/psyclone/psyir/nodes/omp_directives.py index 510656aa53..c9311ef22f 100644 --- a/src/psyclone/psyir/nodes/omp_directives.py +++ b/src/psyclone/psyir/nodes/omp_directives.py @@ -2116,6 +2116,8 @@ def _validate_collapse_value(self): f"'{self}' has a collapse={self._collapse} and the " f"nested body at depth {depth} cannot be " f"collapsed.") + if len(cursor.loop_body.children) == 0: + break cursor = cursor.loop_body.children[0] def _validate_single_loop(self): From fb235fb7bb7353f157ae0a7753d985e2f323a0af Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Fri, 13 Dec 2024 12:58:29 +0000 Subject: [PATCH 04/32] Uncomment NEMOv5 for GPU test --- .github/workflows/nemo_v5_tests.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 1f3c0c8b3b..c66dd91b4a 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -221,17 +221,17 @@ jobs: add_key "key_mpi_off key_nosignedzero" -j 4 -v 1 # Run test (disabled because it is currently too slow) - # cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 - # cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small namelist_cfg - # ./nemo + cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 + cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small namelist_cfg + ./nemo # tail run.stat - # diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.bench.nvhpc.small.100steps run.stat - # export TIME_sec=$(grep "local proces" timing.output | head -n 1 | awk '{print $4}' | tr -d s) - # ${HOME}/mongosh-2.1.1-linux-x64/bin/mongosh \ - # "mongodb+srv://cluster0.x8ncpxi.mongodb.net/PerformanceMonitoring" \ - # --quiet --apiVersion 1 --username ${{ secrets.MONGODB_USERNAME }} \ - # --password ${{ secrets.MONGODB_PASSWORD }} \ - # --eval 'db.GitHub_CI.insertOne({branch_name: "'"$GITHUB_REF_NAME"'", commit: "'"$GITHUB_SHA"'", - # github_job: "'"$GITHUB_RUN_ID"'"-"'"$GITHUB_RUN_ATTEMPT"'", - # ci_test: "NEMOv5 OpenMP for GPU", nemo_version: 
"NEMOv5", system: "GlaDos", - # compiler:"nvhpc-24.5" , date: new Date(), elapsed_time: '"${TIME_sec}"'})' + diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.bench.nvhpc.small.100steps run.stat + export TIME_sec=$(grep "local proces" timing.output | head -n 1 | awk '{print $4}' | tr -d s) + ${HOME}/mongosh-2.1.1-linux-x64/bin/mongosh \ + "mongodb+srv://cluster0.x8ncpxi.mongodb.net/PerformanceMonitoring" \ + --quiet --apiVersion 1 --username ${{ secrets.MONGODB_USERNAME }} \ + --password ${{ secrets.MONGODB_PASSWORD }} \ + --eval 'db.GitHub_CI.insertOne({branch_name: "'"$GITHUB_REF_NAME"'", commit: "'"$GITHUB_SHA"'", + github_job: "'"$GITHUB_RUN_ID"'"-"'"$GITHUB_RUN_ATTEMPT"'", + ci_test: "NEMOv5 OpenMP for GPU", nemo_version: "NEMOv5", system: "GlaDos", + compiler:"nvhpc-24.5" , date: new Date(), elapsed_time: '"${TIME_sec}"'})' From ee43ac96903741e054faedf7af89b6bec8ceaeed Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Fri, 13 Dec 2024 13:28:55 +0000 Subject: [PATCH 05/32] Fix flake8 issues --- examples/nemo/scripts/omp_gpu_trans.py | 21 +++++++++++++-------- examples/nemo/scripts/utils.py | 9 +++++---- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index f5192db6e6..fb0cd875cf 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -37,7 +37,7 @@ ''' PSyclone transformation script showing the introduction of OpenMP for GPU directives into Nemo code. 
''' -import os +# import os from utils import ( insert_explicit_loop_parallelism, normalise_loops, add_profiling, enhance_tree_information, PASSTHROUGH_ISSUES, PARALLELISATION_ISSUES) @@ -55,7 +55,8 @@ "iom_nf90.f90", "iom_def.f90", "timing.f90", # Compiler error: Subscript, substring, or argument illegal - "lbcnfd.f90", # Illegal address during kernel execution - line 1012: lbc_nfd_dp + "lbcnfd.f90", # Illegal address during kernel execution + # - line 1012: lbc_nfd_dp "lib_mpp.f90", # Compiler error: Illegal substring expression "prtctl.f90", # Compiler error: Illegal substring expression "sbcblk.f90", # Compiler error: Vector expression used where scalar @@ -63,18 +64,22 @@ ] OFFLOADING_ISSUES = [ - "trcrad.f90", # Illegal address during kernel execution, unless the dimensions are small - "tranxt.f90", # String comparison not allowed inside omp teams (this worked fine with omp loop) - "trazdf.f90", # String comparison not allowed inside omp teams (this worked fine with omp loop) - "crsdom.f90", # String comparison not allowed inside omp teams (this worked fine with omp loop) - "zdftke.f90", # returned error 700 (CUDA_ERROR_ILLEGAL_ADDRESS): Illegal address during kernel execution - "dynzdf.f90", # returned error 700 (CUDA_ERROR_ILLEGAL_ADDRESS): Illegal address during kernel execution + "trcrad.f90", # Illegal address during kernel execution, unless the + # dimensions are small + "tranxt.f90", # String comparison not allowed inside omp teams + # (this worked fine with omp loop) + "trazdf.f90", # String comparison not allowed inside omp teams + "crsdom.f90", # String comparison not allowed inside omp teams + "zdftke.f90", # returned error 700 (CUDA_ERROR_ILLEGAL_ADDRESS): + # Illegal address during kernel execution + "dynzdf.f90", # returned error 700 (CUDA_ERROR_ILLEGAL_ADDRESS) ] PRIVATISATION_ISSUES = [ "ldftra.f90", # Wrong runtime results ] + def trans(psyir): ''' Add OpenMP Target and Loop directives to all loops, including the implicit ones, to parallelise 
the code and execute it in an acceleration diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 5b22029c1d..1451edd542 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -100,7 +100,7 @@ 'gamma_moist', 'gamma_moist_sclr', 'gamma_moist_vctr', 'get_unit', 'grt_cir_dis', 'grt_cir_dis_saa', 'icb_alloc', 'icb_utl_bilin', 'icb_utl_bilin_2d_h', 'icb_utl_bilin_3d_h', 'icb_utl_bilin_e', - 'icb_utl_bilin_h', 'icb_utl_bilin_x',' icb_utl_count', 'icb_utl_heat', + 'icb_utl_bilin_h', 'icb_utl_bilin_x', 'icb_utl_count', 'icb_utl_heat', 'icb_utl_mass', 'icb_utl_yearday', 'ice1D_alloc', 'ice_alloc', 'ice_dia_alloc', 'ice_dyn_rdgrft_alloc', 'ice_perm_eff', 'ice_thd_pnd_alloc', 'ice_update_alloc', 'ice_var_sshdyn', 'in_hdom', @@ -227,9 +227,10 @@ def enhance_tree_information(schedule): reference.symbol.name.startswith('glob_') or reference.symbol.name.startswith('SIGN_') or reference.symbol.name.startswith('netcdf_') or - (reference.symbol.name.startswith('nf90_') and not - reference.symbol.name in ['nf90_64bit_offset', 'nf90_short', - 'nf90_clobber'])): + (reference.symbol.name.startswith('nf90_') and + reference.symbol.name not in ['nf90_64bit_offset', + 'nf90_short', + 'nf90_clobber'])): if reference.symbol.is_import or reference.symbol.is_unresolved: # The parser gets these wrong, they are Calls not ArrayRefs if not isinstance(reference.symbol, RoutineSymbol): From d12df3dc7f355384b2d8de0749523c41b58a26e4 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Fri, 13 Dec 2024 13:59:47 +0000 Subject: [PATCH 06/32] Fix NEMO_FUNCTIONS --- examples/nemo/scripts/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 1451edd542..510b59ea5a 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -227,10 +227,10 @@ def enhance_tree_information(schedule): reference.symbol.name.startswith('glob_') or 
reference.symbol.name.startswith('SIGN_') or reference.symbol.name.startswith('netcdf_') or - (reference.symbol.name.startswith('nf90_') and - reference.symbol.name not in ['nf90_64bit_offset', - 'nf90_short', - 'nf90_clobber'])): + reference.symbol.name.startswith('nf90_')): + if len(reference.children) >= 1: + # Thigs with no children are already properly classified + break if reference.symbol.is_import or reference.symbol.is_unresolved: # The parser gets these wrong, they are Calls not ArrayRefs if not isinstance(reference.symbol, RoutineSymbol): From ccf8bf1fa75f941d510e05ac18a6e05d999d54cc Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Fri, 13 Dec 2024 14:00:31 +0000 Subject: [PATCH 07/32] Fix NEMO_FUNCTIONS --- examples/nemo/scripts/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 510b59ea5a..262e8adb6f 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -230,7 +230,7 @@ def enhance_tree_information(schedule): reference.symbol.name.startswith('nf90_')): if len(reference.children) >= 1: # Thigs with no children are already properly classified - break + continue if reference.symbol.is_import or reference.symbol.is_unresolved: # The parser gets these wrong, they are Calls not ArrayRefs if not isinstance(reference.symbol, RoutineSymbol): From 4743af722dca1405fe2789fa47767ec5474f49ec Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 17 Dec 2024 11:45:16 +0000 Subject: [PATCH 08/32] More fixes in the NEMOv5 scripts --- examples/nemo/scripts/omp_gpu_trans.py | 3 ++ examples/nemo/scripts/utils.py | 47 ++++++++++++++------------ 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index fb0cd875cf..1b9c278a00 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -61,6 +61,8 @@ "prtctl.f90", # Compiler 
error: Illegal substring expression "sbcblk.f90", # Compiler error: Vector expression used where scalar # expression required + "sbcflx.f90", # NEMOv4 sbc_dyc causes NVFORTRAN-S-0083-Vector expression + # used where scalar expression required ] OFFLOADING_ISSUES = [ @@ -73,6 +75,7 @@ "zdftke.f90", # returned error 700 (CUDA_ERROR_ILLEGAL_ADDRESS): # Illegal address during kernel execution "dynzdf.f90", # returned error 700 (CUDA_ERROR_ILLEGAL_ADDRESS) + "traatf_qco.f90", # Runtime: Failed to find device function ] PRIVATISATION_ISSUES = [ diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 262e8adb6f..5f9c7cd4a5 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -38,7 +38,7 @@ from psyclone.domain.common.transformations import KernelModuleInlineTrans from psyclone.psyir.nodes import ( Assignment, Loop, Directive, Container, Reference, CodeBlock, - Call, Return, IfBlock, Routine, IntrinsicCall) + Call, Return, IfBlock, Routine, IntrinsicCall, ArrayReference) from psyclone.psyir.symbols import ( DataSymbol, INTEGER_TYPE, REAL_TYPE, ArrayType, ScalarType, RoutineSymbol, ImportInterface) @@ -222,27 +222,30 @@ def enhance_tree_information(schedule): ArrayType.Extent.ATTRIBUTE, ArrayType.Extent.ATTRIBUTE, ArrayType.Extent.ATTRIBUTE])) - elif (reference.symbol.name in NEMO_FUNCTIONS or - reference.symbol.name.startswith('local_') or - reference.symbol.name.startswith('glob_') or - reference.symbol.name.startswith('SIGN_') or - reference.symbol.name.startswith('netcdf_') or - reference.symbol.name.startswith('nf90_')): - if len(reference.children) >= 1: - # Thigs with no children are already properly classified - continue - if reference.symbol.is_import or reference.symbol.is_unresolved: - # The parser gets these wrong, they are Calls not ArrayRefs - if not isinstance(reference.symbol, RoutineSymbol): - # We need to specialise the generic Symbol to a Routine - reference.symbol.specialise(RoutineSymbol) - if 
not (isinstance(reference.parent, Call) and - reference.parent.routine is reference): - # We also need to replace the Reference node with a Call - call = Call.create(reference.symbol) - for child in reference.children[:]: - call.addchild(child.detach()) - reference.replace_with(call) + elif ( + # If its an ArrayReference ... + isinstance(reference, ArrayReference) and + # ... with the following name ... + (reference.symbol.name in NEMO_FUNCTIONS or + reference.symbol.name.startswith('local_') or + reference.symbol.name.startswith('glob_') or + reference.symbol.name.startswith('SIGN_') or + reference.symbol.name.startswith('netcdf_') or + reference.symbol.name.startswith('nf90_')) and + # ... and the symbol is unresolved + (reference.symbol.is_import or reference.symbol.is_unresolved) + ): + # The parser gets these wrong, they are Calls not ArrayRefs + if not isinstance(reference.symbol, RoutineSymbol): + # We need to specialise the generic Symbol to a Routine + reference.symbol.specialise(RoutineSymbol) + if not (isinstance(reference.parent, Call) and + reference.parent.routine is reference): + # We also need to replace the Reference node with a Call + call = Call.create(reference.symbol) + for child in reference.children[:]: + call.addchild(child.detach()) + reference.replace_with(call) def inline_calls(schedule): From 77a9c9648da91953f759d4643579cd666c806901 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 17 Dec 2024 11:45:52 +0000 Subject: [PATCH 09/32] Do not run the integration tests passtrough in the first attempt --- .github/workflows/nemo_tests.yml | 1 + .github/workflows/nemo_v5_tests.yml | 3 +++ 2 files changed, 4 insertions(+) diff --git a/.github/workflows/nemo_tests.yml b/.github/workflows/nemo_tests.yml index 675949705e..0c3e4ea63a 100644 --- a/.github/workflows/nemo_tests.yml +++ b/.github/workflows/nemo_tests.yml @@ -84,6 +84,7 @@ jobs: # PSyclone passthrough for MetOffice NEMO - name: NEMO MetOffice Passthrough + if: ${{ github.run_attempt != 
'1' }} run: | . .runner_venv/bin/activate export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index c66dd91b4a..5a08e0d790 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -72,6 +72,7 @@ jobs: # PSyclone passthrough for 5.0-beta of NEMO. - name: NEMO 5.0 gfortran passthrough + if: ${{ github.run_attempt != '1' }} run: | # Set up environment source /apps/spack/psyclone-spack/spack-repo/share/spack/setup-env.sh @@ -101,6 +102,7 @@ jobs: diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.bench.gfortran.small.100steps run.stat - name: NEMO 5.0 nvidia passthrough + if: ${{ github.run_attempt != '1' }} run: | # Set up environment source /apps/spack/psyclone-spack/spack-repo/share/spack/setup-env.sh @@ -132,6 +134,7 @@ jobs: echo "Time-stepping duration = " $VAR_TIME - name: NEMO 5.0 Intel passthrough + if: ${{ github.run_attempt != '1' }} run: | # Set up environment source /apps/spack/psyclone-spack/spack-repo/share/spack/setup-env.sh From 4f7776aec731f70d9b6c55bdee90e0d06aa207de Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 17 Dec 2024 11:55:42 +0000 Subject: [PATCH 10/32] Reduce NEMOv5 integration test timesteps --- .github/workflows/nemo_v5_tests.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 5a08e0d790..2469b247c9 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -186,10 +186,10 @@ jobs: # Run test cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 - cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small namelist_cfg + cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small_10 namelist_cfg OMP_NUM_THREADS=4 mpirun -np 1 ./nemo tail run.stat - diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.bench.gfortran.small.100steps run.stat + diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.bench.gfortran.small.10steps run.stat 
export TIME_sec=$(grep "local proces" timing.output | head -n 1 | awk '{print $4}' | tr -d s) ${HOME}/mongosh-2.1.1-linux-x64/bin/mongosh \ "mongodb+srv://cluster0.x8ncpxi.mongodb.net/PerformanceMonitoring" \ @@ -225,10 +225,10 @@ jobs: # Run test (disabled because it is currently too slow) cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 - cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small namelist_cfg + cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small_10 namelist_cfg ./nemo # tail run.stat - diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.bench.nvhpc.small.100steps run.stat + diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.bench.nvhpc.small.10steps run.stat export TIME_sec=$(grep "local proces" timing.output | head -n 1 | awk '{print $4}' | tr -d s) ${HOME}/mongosh-2.1.1-linux-x64/bin/mongosh \ "mongodb+srv://cluster0.x8ncpxi.mongodb.net/PerformanceMonitoring" \ From 0f44e2844bcc0672d7c3fa8a0729af6041d16421 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 17 Dec 2024 12:04:00 +0000 Subject: [PATCH 11/32] Add NEMOv5 10-timestep KGOs --- .../scripts/KGOs/namelist_cfg_bench_small_10 | 225 ++++++++++++++++++ .../run.stat.bench.gfortran.small.10steps | 10 + .../KGOs/run.stat.bench.nvhpc.small.10steps | 10 + 3 files changed, 245 insertions(+) create mode 100644 examples/nemo/scripts/KGOs/namelist_cfg_bench_small_10 create mode 100644 examples/nemo/scripts/KGOs/run.stat.bench.gfortran.small.10steps create mode 100644 examples/nemo/scripts/KGOs/run.stat.bench.nvhpc.small.10steps diff --git a/examples/nemo/scripts/KGOs/namelist_cfg_bench_small_10 b/examples/nemo/scripts/KGOs/namelist_cfg_bench_small_10 new file mode 100644 index 0000000000..9a8f2c4afe --- /dev/null +++ b/examples/nemo/scripts/KGOs/namelist_cfg_bench_small_10 @@ -0,0 +1,225 @@ +!!>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +!! NEMO/OPA BENCH Configuration namelist : overwrite some defaults values defined in SHARED/namelist_ref +!!>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +! 
+!----------------------------------------------------------------------- +&namrun ! parameters of the run +!----------------------------------------------------------------------- + cn_exp = 'BENCH' ! experience name + nn_it000 = 1 ! first time step + nn_itend = 10 ! last time step + nn_stock = -1 ! frequency of creation of a restart file (modulo referenced to 1) + nn_write = -1 ! frequency of write in the output file (modulo referenced to nn_it000) +/ +!----------------------------------------------------------------------- +&namusr_def ! User defined : BENCH configuration: Flat bottom, beta-plane +!----------------------------------------------------------------------- + nn_isize = 64 ! number of point in i-direction of global(local) domain if >0 (<0) + nn_jsize = 64 ! number of point in j-direction of global(local) domain if >0 (<0) + nn_ksize = 30 ! total number of point in k-direction + ln_Iperio = .true. ! i-periodicity + ln_Jperio = .false. ! j-periodicity + ln_NFold = .true. ! North pole folding + cn_NFtype = 'F' ! Folding type: T or F +/ +!----------------------------------------------------------------------- +&nammpp ! Massively Parallel Processing +!----------------------------------------------------------------------- + ln_nnogather= .true. ! activate code to avoid mpi_allgather use at the northfold + jpni = 0 ! jpni number of processors following i (set automatically if < 1) + jpnj = 0 ! jpnj number of processors following j (set automatically if < 1) +/ +!----------------------------------------------------------------------- +&namctl ! Control prints (default: OFF) +!----------------------------------------------------------------------- + ln_timing = .true. ! timing by routine write out in timing.output file + sn_cfctl%l_runstat = .TRUE. +/ +!----------------------------------------------------------------------- +&namdom ! time and space domain +!----------------------------------------------------------------------- + rn_Dt = 360. ! 
time step for the dynamics (and tracer if nn_acc=0) + ln_meshmask = .false. ! =T create a mesh file +/ + +!!====================================================================== +!! *** Surface Boundary Condition namelists *** !! +!! !! +!! namsbc surface boundary condition manager (default: NO selection) +!!====================================================================== +! +!----------------------------------------------------------------------- +&namsbc ! Surface Boundary Condition (surface module) +!----------------------------------------------------------------------- + ln_usr = .true. ! user defined formulation (T => check usrdef_sbc) + nn_ice = 2 ! =0 no ice boundary condition + ! ! =1 use observed ice-cover ( => fill namsbc_iif ) + ! ! =2 or 3 for SI3 and CICE, respectively + ln_traqsr = .true. ! Light penetration in the ocean (T => fill namtra_qsr) +/ + +! +!!====================================================================== +!! *** Lateral boundary condition *** !! +!! !! +!! namlbc lateral momentum boundary condition (default: NO selection) +!! namagrif agrif nested grid (read by child model only) ("key_agrif") +!! nam_tide Tidal forcing (default: OFF) +!! nambdy Unstructured open boundaries (default: OFF) +!! nambdy_dta Unstructured open boundaries - external data (see nambdy) +!! nambdy_tide tidal forcing at open boundaries (default: OFF) +!!====================================================================== +! +!----------------------------------------------------------------------- +&namlbc ! lateral momentum boundary condition (default: NO selection) +!----------------------------------------------------------------------- + rn_shlat = 0. ! free slip +/ + +!!====================================================================== +!! *** Top/Bottom boundary condition *** !! +!! !! +!! namdrg top/bottom drag coefficient (default: NO selection) +!! namdrg_top top friction (ln_drg_OFF =F & ln_isfcav=T) +!! 
namdrg_bot bottom friction (ln_drg_OFF =F) +!! nambbc bottom temperature boundary condition (default: OFF) +!! nambbl bottom boundary layer scheme (default: OFF) +!!====================================================================== +! +!----------------------------------------------------------------------- +&namtra_qsr ! penetrative solar radiation (ln_traqsr =T) +!----------------------------------------------------------------------- + ! ! type of penetration (default: NO selection) + ln_qsr_rgb = .true. ! RGB light penetration (Red-Green-Blue) + nn_chldta = 0 ! RGB : Chl data (=1) or cst value (=0) +/ +!----------------------------------------------------------------------- +&namdrg ! top/bottom drag coefficient (default: NO selection) +!----------------------------------------------------------------------- + ln_non_lin = .true. ! non-linear drag: Cd = Cd0 |U| +/ +!----------------------------------------------------------------------- +&nambbc ! bottom temperature boundary condition (default: OFF) +!----------------------------------------------------------------------- + ln_trabbc = .true. ! Apply a geothermal heating at the ocean bottom + nn_geoflx = 1 ! geothermal heat flux: = 1 constant flux +/ +!----------------------------------------------------------------------- +&nambbl ! bottom boundary layer scheme (default: OFF) +!----------------------------------------------------------------------- + ln_trabbl = .true. ! Bottom Boundary Layer parameterisation flag +/ + +!!====================================================================== +!! Tracer (T & S) namelists !! +!! !! +!! nameos equation of state (default: NO selection) +!! namtra_adv advection scheme (default: NO selection) +!! namtra_ldf lateral diffusion scheme (default: NO selection) +!! namtra_mle mixed layer eddy param. (Fox-Kemper param.) (default: OFF) +!! namtra_eiv eddy induced velocity param. (default: OFF) +!! 
namtra_dmp T & S newtonian damping (default: OFF) +!!====================================================================== +! +!----------------------------------------------------------------------- +&nameos ! ocean Equation Of Seawater (default: NO selection) +!----------------------------------------------------------------------- + ln_teos10 = .true. ! = Use TEOS-10 +/ +!----------------------------------------------------------------------- +&namtra_adv ! advection scheme for tracer (default: NO selection) +!----------------------------------------------------------------------- + ln_traadv_fct = .true. ! FCT scheme + nn_fct_h = 2 ! =2/4, horizontal 2nd / 4th order + nn_fct_v = 2 ! =2/4, vertical 2nd / COMPACT 4th order +/ +!----------------------------------------------------------------------- +&namtra_ldf ! lateral diffusion scheme for tracers (default: NO selection) +!----------------------------------------------------------------------- + ln_traldf_lap = .true. ! laplacian operator + ln_traldf_iso = .true. ! iso-neutral (standard operator) + ! + nn_aht_ijk_t = 20 ! space/time variation of eddy coefficient: +/ +!----------------------------------------------------------------------- +&namtra_mle ! mixed layer eddy parametrisation (Fox-Kemper) (default: OFF) +!----------------------------------------------------------------------- + ln_mle = .true. ! (T) use the Mixed Layer Eddy (MLE) parameterisation +/ +!----------------------------------------------------------------------- +&namtra_eiv ! eddy induced velocity param. (default: OFF) +!----------------------------------------------------------------------- + ln_ldfeiv = .true. ! use eddy induced velocity parameterization + ! + nn_aei_ijk_t = 20 ! space/time variation of eddy coefficient: +/ + +!!====================================================================== +!! *** Dynamics namelists *** !! +!! !! +!! nam_vvl vertical coordinate options (default: z-star) +!! 
namdyn_adv formulation of the momentum advection (default: NO selection) +!! namdyn_vor advection scheme (default: NO selection) +!! namdyn_hpg hydrostatic pressure gradient (default: NO selection) +!! namdyn_spg surface pressure gradient (default: NO selection) +!! namdyn_ldf lateral diffusion scheme (default: NO selection) +!! namdta_dyn offline TOP: dynamics read in files (OFF_SRC only) +!!====================================================================== +! +!----------------------------------------------------------------------- +&nam_vvl ! vertical coordinate options (default: z-star) +!----------------------------------------------------------------------- + ln_vvl_zstar = .true. ! z-star vertical coordinate + ln_vvl_dbg = .false. ! debug prints (T/F) +/ +!----------------------------------------------------------------------- +&namdyn_adv ! formulation of the momentum advection (default: NO selection) +!----------------------------------------------------------------------- + ln_dynadv_vec = .true. ! vector form (T) or flux form (F) + nn_dynkeg = 1 ! scheme for grad(KE): =0 C2 ; =1 Hollingsworth correction +/ +!----------------------------------------------------------------------- +&namdyn_vor ! Vorticity / Coriolis scheme (default: NO selection) +!----------------------------------------------------------------------- + ln_dynvor_een = .true. ! energy & enstrophy scheme +/ +!----------------------------------------------------------------------- +&namdyn_hpg ! Hydrostatic pressure gradient option (default: NO selection) +!----------------------------------------------------------------------- + ln_hpg_sco = .true. ! s-coordinate (standard jacobian formulation) +/ +!----------------------------------------------------------------------- +&namdyn_spg ! surface pressure gradient (default: NO selection) +!----------------------------------------------------------------------- + ln_dynspg_ts = .true. ! split-explicit free surface + ln_bt_auto = .false. ! 
Number of sub-step defined from: + nn_e = 30 ! =F : the number of sub-step in rn_Dt seconds +/ +!----------------------------------------------------------------------- +&namdyn_ldf ! lateral diffusion on momentum (default: NO selection) +!----------------------------------------------------------------------- + ln_dynldf_lap = .true. ! laplacian operator + ln_dynldf_hor = .true. ! horizontal (geopotential) + ! ! Coefficient + nn_ahm_ijk_t = 30 ! space/time variation of eddy coef +/ + +!!====================================================================== +!! vertical physics namelists !! +!! !! +!! namzdf vertical physics manager (default: NO selection) +!! namzdf_ric richardson number vertical mixing (ln_zdfric=T) +!! namzdf_tke TKE vertical mixing (ln_zdftke=T) +!! namzdf_gls GLS vertical mixing (ln_zdfgls=T) +!! namzdf_osm OSM vertical diffusion (ln_zdfosm=T) +!! namzdf_iwm tidal mixing parameterization (ln_zdfiwm=T) +!!====================================================================== +! +!----------------------------------------------------------------------- +&namzdf ! vertical physics (default: NO selection) +!----------------------------------------------------------------------- + ln_zdftke = .true. ! Turbulent Kinetic Energy closure (T => fill namzdf_tke) + ln_zdfevd = .true. ! enhanced vertical diffusion + ln_zdfddm = .true. ! double diffusive mixing + ln_zdfiwm = .true. ! 
internal wave-induced mixing (T => fill namzdf_iwm) +/ diff --git a/examples/nemo/scripts/KGOs/run.stat.bench.gfortran.small.10steps b/examples/nemo/scripts/KGOs/run.stat.bench.gfortran.small.10steps new file mode 100644 index 0000000000..b53b99e092 --- /dev/null +++ b/examples/nemo/scripts/KGOs/run.stat.bench.gfortran.small.10steps @@ -0,0 +1,10 @@ + it : 1 |ssh|_max: 0.2336851764570087D+01 |U|_max: 0.7052149477800684D-02 |V|_max: 0.2308260467200877D-02 S_min: 0.2996908781150693D+02 S_max: 0.3101392942716721D+02 + it : 2 |ssh|_max: 0.3739164083094878D+01 |U|_max: 0.1029616821992987D-01 |V|_max: 0.9486960009862211D-02 S_min: 0.2996910922616945D+02 S_max: 0.3101392859195436D+02 + it : 3 |ssh|_max: 0.4179101131274851D+01 |U|_max: 0.1301524988138879D-01 |V|_max: 0.2226585559898513D-01 S_min: 0.2996913484029493D+02 S_max: 0.3101392781894970D+02 + it : 4 |ssh|_max: 0.4569875511150748D+01 |U|_max: 0.1401053780649201D-01 |V|_max: 0.3894932715115156D-01 S_min: 0.2996916048603851D+02 S_max: 0.3101392710567817D+02 + it : 5 |ssh|_max: 0.4796169575324639D+01 |U|_max: 0.1103849688785435D-01 |V|_max: 0.5247761995640531D-01 S_min: 0.2996918638309708D+02 S_max: 0.3101392644923621D+02 + it : 6 |ssh|_max: 0.4986687389489863D+01 |U|_max: 0.1061909566753376D-01 |V|_max: 0.6682435160313002D-01 S_min: 0.2996921212409725D+02 S_max: 0.3101392584662024D+02 + it : 7 |ssh|_max: 0.5137377532600958D+01 |U|_max: 0.7776206738812892D-02 |V|_max: 0.8230434351326746D-01 S_min: 0.2996923806059345D+02 S_max: 0.3101392529517780D+02 + it : 8 |ssh|_max: 0.5245041711882124D+01 |U|_max: 0.1114333312848225D-01 |V|_max: 0.9661364041991180D-01 S_min: 0.2996926384223953D+02 S_max: 0.3101392479253744D+02 + it : 9 |ssh|_max: 0.5162398664673749D+01 |U|_max: 0.1465335344080455D-01 |V|_max: 0.1139851405821525D+00 S_min: 0.2996929000744533D+02 S_max: 0.3101392433638144D+02 + it : 10 |ssh|_max: 0.5005955481222619D+01 |U|_max: 0.2023455937023107D-01 |V|_max: 0.1294762239325535D+00 S_min: 0.2996931617422945D+02 S_max: 
0.3101392392447119D+02 diff --git a/examples/nemo/scripts/KGOs/run.stat.bench.nvhpc.small.10steps b/examples/nemo/scripts/KGOs/run.stat.bench.nvhpc.small.10steps new file mode 100644 index 0000000000..ef1dc55186 --- /dev/null +++ b/examples/nemo/scripts/KGOs/run.stat.bench.nvhpc.small.10steps @@ -0,0 +1,10 @@ + it : 1 |ssh|_max: 0.2336851764570087D+01 |U|_max: 0.7052149477800684D-02 |V|_max: 0.2308260467200878D-02 S_min: 0.2996908781150693D+02 S_max: 0.3101392942716721D+02 + it : 2 |ssh|_max: 0.3739164083094879D+01 |U|_max: 0.1029616821992987D-01 |V|_max: 0.9486960009862213D-02 S_min: 0.2996910922616945D+02 S_max: 0.3101392859195436D+02 + it : 3 |ssh|_max: 0.4179101131274851D+01 |U|_max: 0.1301524988138877D-01 |V|_max: 0.2226585559898513D-01 S_min: 0.2996913484029493D+02 S_max: 0.3101392781894970D+02 + it : 4 |ssh|_max: 0.4569875511150748D+01 |U|_max: 0.1401053780649233D-01 |V|_max: 0.3894932715115157D-01 S_min: 0.2996916048603851D+02 S_max: 0.3101392710567817D+02 + it : 5 |ssh|_max: 0.4796169575324495D+01 |U|_max: 0.1103849688785496D-01 |V|_max: 0.5247761995640531D-01 S_min: 0.2996918638309708D+02 S_max: 0.3101392644923621D+02 + it : 6 |ssh|_max: 0.4986687389489813D+01 |U|_max: 0.1061909566753376D-01 |V|_max: 0.6682435160312988D-01 S_min: 0.2996921212409725D+02 S_max: 0.3101392584662024D+02 + it : 7 |ssh|_max: 0.5137377532600873D+01 |U|_max: 0.7776206738812510D-02 |V|_max: 0.8230434351326728D-01 S_min: 0.2996923806059344D+02 S_max: 0.3101392529517780D+02 + it : 8 |ssh|_max: 0.5245041711882096D+01 |U|_max: 0.1114333312848249D-01 |V|_max: 0.9661364041990601D-01 S_min: 0.2996926384223953D+02 S_max: 0.3101392479253744D+02 + it : 9 |ssh|_max: 0.5162398664673712D+01 |U|_max: 0.1465335344080445D-01 |V|_max: 0.1139851405821512D+00 S_min: 0.2996929000744533D+02 S_max: 0.3101392433638144D+02 + it : 10 |ssh|_max: 0.5005955481222640D+01 |U|_max: 0.2023455937023348D-01 |V|_max: 0.1294762239325558D+00 S_min: 0.2996931617422945D+02 S_max: 0.3101392392447119D+02 From 
873be3bf1538f22e392ae7f930b25d57813d3724 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 17 Dec 2024 15:01:02 +0000 Subject: [PATCH 12/32] Some more updates for NEMOv5 --- examples/nemo/scripts/omp_cpu_trans.py | 3 +++ examples/nemo/scripts/omp_gpu_trans.py | 8 ++++++++ examples/nemo/scripts/utils.py | 3 +-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/examples/nemo/scripts/omp_cpu_trans.py b/examples/nemo/scripts/omp_cpu_trans.py index 1102455c61..fa48d0ba72 100755 --- a/examples/nemo/scripts/omp_cpu_trans.py +++ b/examples/nemo/scripts/omp_cpu_trans.py @@ -59,6 +59,9 @@ # List of all files that psyclone will skip processing FILES_TO_SKIP = PASSTHROUGH_ISSUES +if PROFILING_ENABLED: + # Fails with profiling enabled. issue #2723 + FILES_TO_SKIP.append("mppini.f90") def trans(psyir): ''' Add OpenMP Parallel and Do directives to all loops, including the diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index bd6c5d1b55..43edcde10a 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -81,6 +81,7 @@ # Illegal address during kernel execution "dynzdf.f90", # returned error 700 (CUDA_ERROR_ILLEGAL_ADDRESS) "traatf_qco.f90", # Runtime: Failed to find device function + "lbclnk.f90", # Improve performance until #2751 ] PRIVATISATION_ISSUES = [ @@ -115,6 +116,13 @@ def trans(psyir): for subroutine in psyir.walk(Routine): + # Skip things from the initialisation + if (subroutine.name.endswith('_alloc') or + subroutine.name.endswith('_init') or + subroutine.name.startswith('Agrif') or + subroutine.name == 'dom_msk'): + continue + if PROFILING_ENABLED: add_profiling(subroutine.children) diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 47dc5352e4..4af0ded032 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -64,8 +64,7 @@ ] # If routine names contain these substrings then we do not profile them 
-PROFILING_IGNORE = ["_init", "_rst", "alloc", "agrif", "flo_dom", - "macho", "mpp_", "nemo_gcm", +PROFILING_IGNORE = ["flo_dom", "macho", "mpp_", "nemo_gcm", # These are small functions that the addition of profiling # prevents from being in-lined (and then breaks any attempt # to create OpenACC regions with calls to them) From a64e1da6ff3862d2148967bcbf33101f0762f76f Mon Sep 17 00:00:00 2001 From: Aditya Sadawarte Date: Wed, 18 Dec 2024 20:37:51 +0000 Subject: [PATCH 13/32] add OMP DECLARE TARGET to sbc_phy and solfrac_mod --- examples/nemo/scripts/omp_gpu_trans.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index 43edcde10a..a7bb4aef4f 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -139,6 +139,17 @@ def trans(psyir): hoist_expressions=True ) + if (psyir.name == "sbc_phy.f90" and not subroutine.walk(Loop)) or \ + psyir.name == "solfrac_mod.f90": + try: + # We need the 'force' option. + # SIGN_ARRAY_1D has a CodeBlock because of a WHERE without + # array notation. 
(TODO #717) + OMPDeclareTargetTrans().apply(subroutine, + options={"force": True}) + except TransformationError as err: + print(err) + # Thes are functions that are called from inside parallel regions, # annotate them with 'omp declare target' if subroutine.name.lower().startswith("sign_"): From 28ca0dfa24789ffb49c8705d8714e729666f8320 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Mon, 13 Jan 2025 13:16:03 +0000 Subject: [PATCH 14/32] Add NEMO GPU exclusions --- examples/nemo/scripts/omp_cpu_trans.py | 1 + examples/nemo/scripts/omp_gpu_trans.py | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/nemo/scripts/omp_cpu_trans.py b/examples/nemo/scripts/omp_cpu_trans.py index fa48d0ba72..8facebca8f 100755 --- a/examples/nemo/scripts/omp_cpu_trans.py +++ b/examples/nemo/scripts/omp_cpu_trans.py @@ -63,6 +63,7 @@ # Fails with profiling enabled. issue #2723 FILES_TO_SKIP.append("mppini.f90") + def trans(psyir): ''' Add OpenMP Parallel and Do directives to all loops, including the implicit ones. 
diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index 43edcde10a..26a91b279e 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -68,6 +68,7 @@ # expression required "sbcflx.f90", # NEMOv4 sbc_dyc causes NVFORTRAN-S-0083-Vector expression # used where scalar expression required + "fldread.f90", # Wrong runtime results ] OFFLOADING_ISSUES = [ @@ -79,9 +80,9 @@ "crsdom.f90", # String comparison not allowed inside omp teams "zdftke.f90", # returned error 700 (CUDA_ERROR_ILLEGAL_ADDRESS): # Illegal address during kernel execution - "dynzdf.f90", # returned error 700 (CUDA_ERROR_ILLEGAL_ADDRESS) "traatf_qco.f90", # Runtime: Failed to find device function "lbclnk.f90", # Improve performance until #2751 + "dynzdf.f90", # Wrong runtime results ] PRIVATISATION_ISSUES = [ @@ -98,6 +99,7 @@ def trans(psyir): :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer` ''' + # import os # if psyir.name not in (os.environ['ONLY_FILE'], "lib_fortran.f90"): # return omp_target_trans = OMPTargetTrans() @@ -120,7 +122,9 @@ def trans(psyir): if (subroutine.name.endswith('_alloc') or subroutine.name.endswith('_init') or subroutine.name.startswith('Agrif') or - subroutine.name == 'dom_msk'): + subroutine.name.startswith('dia_') or + subroutine.name == 'dom_msk' or + subroutine.name == 'dom_ngb'): continue if PROFILING_ENABLED: From 80ccb50a76b05121ed71e53715fd2873c2ca75e6 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Wed, 15 Jan 2025 10:06:56 +0000 Subject: [PATCH 15/32] Exclude more files to increase NEMOv5 accuracy in ORCA1 --- examples/nemo/scripts/omp_gpu_trans.py | 62 +++++++++++++++----------- examples/nemo/scripts/utils.py | 2 +- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index d8f2d63c66..ccbe9ec86d 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++
b/examples/nemo/scripts/omp_gpu_trans.py @@ -56,19 +56,29 @@ # List of all files that psyclone will skip processing FILES_TO_SKIP = PASSTHROUGH_ISSUES + [ - "iom.f90", - "iom_nf90.f90", - "iom_def.f90", - "timing.f90", # Compiler error: Subscript, substring, or argument illegal - "lbcnfd.f90", # Illegal address during kernel execution - # - line 1012: lbc_nfd_dp "lib_mpp.f90", # Compiler error: Illegal substring expression - "prtctl.f90", # Compiler error: Illegal substring expression "sbcblk.f90", # Compiler error: Vector expression used where scalar # expression required - "sbcflx.f90", # NEMOv4 sbc_dyc causes NVFORTRAN-S-0083-Vector expression + "sbcflx.f90", # NEMOv4 sbc_dyc causes NVFORTRAN-S-0083-Vector expression # used where scalar expression required "fldread.f90", # Wrong runtime results + "zdfddm.f90", # Wrong results + "zdfiwm.f90", # Wrong results + "geo2ocean.f90", # Wrong results +] + +SKIP_FOR_PERFORMANCE = [ + # Check if these work with NEMOv4 + "iom.f90", + "iom_nf90.f90", + "iom_def.f90", + "timing.f90", + "prtctl.f90", +] + +DONT_HOIST = [ + # Incorrect hoisting + "lbcnfd.f90", ] OFFLOADING_ISSUES = [ @@ -83,6 +93,8 @@ "traatf_qco.f90", # Runtime: Failed to find device function "lbclnk.f90", # Improve performance until #2751 "dynzdf.f90", # Wrong runtime results + "traqsr.f90", + "ldftra.f90", # Wrong runtime results ] PRIVATISATION_ISSUES = [ @@ -112,6 +124,9 @@ def trans(psyir): if psyir.name.startswith("obs_"): return + if psyir.name in SKIP_FOR_PERFORMANCE: + return + # ICE routines do not perform well on GPU, so we skip them if psyir.name.startswith("ice"): return @@ -124,12 +139,10 @@ def trans(psyir): subroutine.name.startswith('Agrif') or subroutine.name.startswith('dia_') or subroutine.name == 'dom_msk' or + subroutine.name == 'dom_zgr' or subroutine.name == 'dom_ngb'): continue - if PROFILING_ENABLED: - add_profiling(subroutine.children) - print(f"Adding OpenMP offloading to subroutine: {subroutine.name}") 
enhance_tree_information(subroutine) @@ -140,28 +153,27 @@ def trans(psyir): convert_array_notation=True, loopify_array_intrinsics=True, convert_range_loops=True, - hoist_expressions=True + hoist_expressions=(psyir.name not in DONT_HOIST) ) - if (psyir.name == "sbc_phy.f90" and not subroutine.walk(Loop)) or \ - psyir.name == "solfrac_mod.f90": + # These are functions that are called from inside parallel regions, + # annotate them with 'omp declare target' + if ( + subroutine.name.lower().startswith("sign_") or + subroutine.name.lower() == "solfrac" or + (psyir.name == "sbc_phy.f90" and not subroutine.walk(Loop)) + ): try: - # We need the 'force' option. - # SIGN_ARRAY_1D has a CodeBlock because of a WHERE without - # array notation. (TODO #717) - OMPDeclareTargetTrans().apply(subroutine, - options={"force": True}) + OMPDeclareTargetTrans().apply(subroutine) + print(f"Marked {subroutine.name} as GPU-enabled") except TransformationError as err: print(err) - - # Thes are functions that are called from inside parallel regions, - # annotate them with 'omp declare target' - if subroutine.name.lower().startswith("sign_"): - OMPDeclareTargetTrans().apply(subroutine) - print(f"Marked {subroutine.name} as GPU-enabled") # We continue parallelising inside the routine, but this could # change if the parallelisation directives added below are not # nestable, in that case we could add a 'continue' here + elif PROFILING_ENABLED: + # We annotate the rest with profiling hooks if requested + add_profiling(subroutine.children) # For now this is a special case for stpctl.f90 because it forces # loops to parallelise without many safety checks diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 4af0ded032..f9e4794ec4 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -69,7 +69,7 @@ # prevents from being in-lined (and then breaks any attempt # to create OpenACC regions with calls to them) "interp1", "interp2", "interp3", 
"integ_spline", "sbc_dcy", - "sum", "sign_", "ddpdd", "psyclone_cmp_int", + "sum", "sign_", "ddpdd", "solfrac", "psyclone_cmp_int", "psyclone_cmp_char", "psyclone_cmp_logical"] # Currently fparser has no way of distinguishing array accesses from From f7818315bb14997786049f8a4fe9b849ab75c985 Mon Sep 17 00:00:00 2001 From: Aditya Sadawarte Date: Wed, 15 Jan 2025 12:04:00 +0000 Subject: [PATCH 16/32] removed exclusions by moving to omp teams loop --- examples/nemo/scripts/omp_gpu_trans.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index ccbe9ec86d..8af7ee6083 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -65,6 +65,7 @@ "zdfddm.f90", # Wrong results "zdfiwm.f90", # Wrong results "geo2ocean.f90", # Wrong results + "zdfswm.f90", # fort2 terminated by signal 11 ] SKIP_FOR_PERFORMANCE = [ @@ -74,6 +75,7 @@ "iom_def.f90", "timing.f90", "prtctl.f90", + "trazdf.f90", ] DONT_HOIST = [ @@ -84,10 +86,6 @@ OFFLOADING_ISSUES = [ "trcrad.f90", # Illegal address during kernel execution, unless the # dimensions are small - "tranxt.f90", # String comparison not allowed inside omp teams - # (this worked fine with omp loop) - "trazdf.f90", # String comparison not allowed inside omp teams - "crsdom.f90", # String comparison not allowed inside omp teams "zdftke.f90", # returned error 700 (CUDA_ERROR_ILLEGAL_ADDRESS): # Illegal address during kernel execution "traatf_qco.f90", # Runtime: Failed to find device function From 1eaf4c152f810e95d8fee4f987db862f5420ccf2 Mon Sep 17 00:00:00 2001 From: Aditya Sadawarte Date: Wed, 15 Jan 2025 14:01:53 +0000 Subject: [PATCH 17/32] added OMPTeamsLoop Directive --- examples/nemo/scripts/omp_gpu_trans.py | 2 +- src/psyclone/psyir/nodes/__init__.py | 3 ++- src/psyclone/psyir/nodes/omp_directives.py | 5 +++++ src/psyclone/psyir/transformations/omp_loop_trans.py | 4 +++- 4 files changed, 11 
insertions(+), 3 deletions(-) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index 8af7ee6083..701b9e2ed9 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -114,7 +114,7 @@ def trans(psyir): # return omp_target_trans = OMPTargetTrans() omp_gpu_loop_trans = OMPLoopTrans(omp_schedule="none") - omp_gpu_loop_trans.omp_directive = "teamsdistributeparalleldo" + omp_gpu_loop_trans.omp_directive = "teamsloop" omp_cpu_loop_trans = OMPLoopTrans(omp_schedule="static") omp_cpu_loop_trans.omp_directive = "paralleldo" diff --git a/src/psyclone/psyir/nodes/__init__.py b/src/psyclone/psyir/nodes/__init__.py index b43f98f751..b050cbc88c 100644 --- a/src/psyclone/psyir/nodes/__init__.py +++ b/src/psyclone/psyir/nodes/__init__.py @@ -92,7 +92,7 @@ OMPStandaloneDirective, OMPRegionDirective, OMPTargetDirective, OMPLoopDirective, OMPDeclareTargetDirective, OMPTeamsDistributeParallelDoDirective, OMPAtomicDirective, - OMPSimdDirective) + OMPSimdDirective, OMPTeamsLoopDirective) from psyclone.psyir.nodes.clause import Clause, OperandClause from psyclone.psyir.nodes.omp_clauses import ( OMPGrainsizeClause, OMPNogroupClause, OMPNowaitClause, OMPNumTasksClause, @@ -184,6 +184,7 @@ 'OMPDeclareTargetDirective', 'OMPSimdDirective', 'OMPTeamsDistributeParallelDoDirective', + 'OMPTeamsLoopDirective', # OMP Clause Nodes 'OMPGrainsizeClause', 'OMPNogroupClause', diff --git a/src/psyclone/psyir/nodes/omp_directives.py b/src/psyclone/psyir/nodes/omp_directives.py index c9311ef22f..804c9821f9 100644 --- a/src/psyclone/psyir/nodes/omp_directives.py +++ b/src/psyclone/psyir/nodes/omp_directives.py @@ -2416,6 +2416,11 @@ class OMPTeamsDistributeParallelDoDirective(OMPParallelDoDirective): _directive_string = "teams distribute parallel do" +class OMPTeamsLoopDirective(OMPParallelDoDirective): + ''' Class representing the OMP teams loop directive. 
''' + _directive_string = "teams loop" + + class OMPTargetDirective(OMPRegionDirective): ''' Class for the !$OMP TARGET directive that offloads the code contained in its region into an accelerator device. ''' diff --git a/src/psyclone/psyir/transformations/omp_loop_trans.py b/src/psyclone/psyir/transformations/omp_loop_trans.py index 04c31b0485..4dbd41106b 100644 --- a/src/psyclone/psyir/transformations/omp_loop_trans.py +++ b/src/psyclone/psyir/transformations/omp_loop_trans.py @@ -38,7 +38,8 @@ from psyclone.configuration import Config from psyclone.psyir.nodes import ( Routine, OMPDoDirective, OMPLoopDirective, OMPParallelDoDirective, - OMPTeamsDistributeParallelDoDirective, OMPScheduleClause) + OMPTeamsDistributeParallelDoDirective, OMPTeamsLoopDirective, + OMPScheduleClause) from psyclone.psyir.symbols import DataSymbol, INTEGER_TYPE from psyclone.psyir.transformations.parallel_loop_trans import \ ParallelLoopTrans @@ -48,6 +49,7 @@ "do": OMPDoDirective, "paralleldo": OMPParallelDoDirective, "teamsdistributeparalleldo": OMPTeamsDistributeParallelDoDirective, + "teamsloop": OMPTeamsLoopDirective, "loop": OMPLoopDirective } #: List containing the valid names for OMP directives. 
From 10a55d98bfc4201a1a99fb6bdaff1740cb615e70 Mon Sep 17 00:00:00 2001 From: Aditya Sadawarte Date: Wed, 15 Jan 2025 14:42:54 +0000 Subject: [PATCH 18/32] removed lib_mpp from exclusions --- examples/nemo/scripts/omp_gpu_trans.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index 701b9e2ed9..a5e80966c4 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -56,7 +56,6 @@ # List of all files that psyclone will skip processing FILES_TO_SKIP = PASSTHROUGH_ISSUES + [ - "lib_mpp.f90", # Compiler error: Illegal substring expression "sbcblk.f90", # Compiler error: Vector expression used where scalar # expression required "sbcflx.f90", # NEMOv4 sbc_dyc causes NVFORTRAN-S-0083-Vector expression From 39f84be59f707828e40be02564b84fd00d9d9eb1 Mon Sep 17 00:00:00 2001 From: Aditya Sadawarte Date: Wed, 15 Jan 2025 15:29:58 +0000 Subject: [PATCH 19/32] removed zdftke from exclusion list --- examples/nemo/scripts/omp_gpu_trans.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index a5e80966c4..f26b08b9c4 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -85,8 +85,6 @@ OFFLOADING_ISSUES = [ "trcrad.f90", # Illegal address during kernel execution, unless the # dimensions are small - "zdftke.f90", # returned error 700 (CUDA_ERROR_ILLEGAL_ADDRESS): - # Illegal address during kernel execution "traatf_qco.f90", # Runtime: Failed to find device function "lbclnk.f90", # Improve performance until #2751 "dynzdf.f90", # Wrong runtime results From 92c82ce344929547ced6b5b2d371e99512aa634c Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Wed, 15 Jan 2025 15:34:03 +0000 Subject: [PATCH 20/32] Add MERGE intrinsic as available for GPU offloading --- src/psyclone/psyir/nodes/intrinsic_call.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/src/psyclone/psyir/nodes/intrinsic_call.py b/src/psyclone/psyir/nodes/intrinsic_call.py index 22c68d966b..649617c951 100644 --- a/src/psyclone/psyir/nodes/intrinsic_call.py +++ b/src/psyclone/psyir/nodes/intrinsic_call.py @@ -800,6 +800,7 @@ def is_available_on_device(self): # The one below are not documented on nvidia compiler IntrinsicCall.Intrinsic.PRODUCT, IntrinsicCall.Intrinsic.SIZE, IntrinsicCall.Intrinsic.SUM, IntrinsicCall.Intrinsic.LBOUND, + IntrinsicCall.Intrinsic.MERGE, IntrinsicCall.Intrinsic.UBOUND) @classmethod From 2d12fa452e95132e4afc386ae8ead670d5db5c85 Mon Sep 17 00:00:00 2001 From: Aditya Sadawarte Date: Wed, 15 Jan 2025 16:02:05 +0000 Subject: [PATCH 21/32] added zdftke to exclusion list - Incorrect results --- examples/nemo/scripts/omp_gpu_trans.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index f26b08b9c4..97eda0efad 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -61,6 +61,7 @@ "sbcflx.f90", # NEMOv4 sbc_dyc causes NVFORTRAN-S-0083-Vector expression # used where scalar expression required "fldread.f90", # Wrong runtime results + "zdftke.f90", # Wrong results "zdfddm.f90", # Wrong results "zdfiwm.f90", # Wrong results "geo2ocean.f90", # Wrong results From 9e983b1cdb74790b122bb79d899f04d00219c6fe Mon Sep 17 00:00:00 2001 From: Aditya Sadawarte Date: Thu, 16 Jan 2025 11:40:01 +0000 Subject: [PATCH 22/32] removed dynzdf from offloading issues and moved to performance issues --- examples/nemo/scripts/omp_gpu_trans.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index 97eda0efad..1aef51f208 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -76,6 +76,7 @@ "timing.f90", "prtctl.f90", "trazdf.f90", + "dynzdf.f90", ] DONT_HOIST = [ @@ -86,9 +87,8 @@ 
OFFLOADING_ISSUES = [ "trcrad.f90", # Illegal address during kernel execution, unless the # dimensions are small - "traatf_qco.f90", # Runtime: Failed to find device function + "traatf_qco.f90", # Runtime: Failed to find device function (BENCH) "lbclnk.f90", # Improve performance until #2751 - "dynzdf.f90", # Wrong runtime results "traqsr.f90", "ldftra.f90", # Wrong runtime results ] From d8072d627201acea2d9feef016ac3e45d36e5777 Mon Sep 17 00:00:00 2001 From: Aditya Sadawarte Date: Thu, 16 Jan 2025 16:13:59 +0000 Subject: [PATCH 23/32] removed dynspg_ts.f90 and geo2ocean.f90 from exclude list --- examples/nemo/scripts/omp_gpu_trans.py | 2 +- examples/nemo/scripts/utils.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index 1aef51f208..da229b8cca 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -64,7 +64,6 @@ "zdftke.f90", # Wrong results "zdfddm.f90", # Wrong results "zdfiwm.f90", # Wrong results - "geo2ocean.f90", # Wrong results "zdfswm.f90", # fort2 terminated by signal 11 ] @@ -91,6 +90,7 @@ "lbclnk.f90", # Improve performance until #2751 "traqsr.f90", "ldftra.f90", # Wrong runtime results + "geo2ocean.f90", # Uses MATH function calls (ONLY EXCLUDE FOR TESTING) ] PRIVATISATION_ISSUES = [ diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index f9e4794ec4..83d143c240 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -164,7 +164,6 @@ # TODO #2787: May solve these issues. 
"icedyn_rhg_evp.f90", "domqco.f90", - "dynspg_ts.f90", ] From fa537243da4e587e80c95876edbea602e045205d Mon Sep 17 00:00:00 2001 From: Aditya Sadawarte Date: Fri, 17 Jan 2025 09:50:46 +0000 Subject: [PATCH 24/32] removed excluded files affected by no -Kieee flag --- examples/nemo/scripts/utils.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 83d143c240..9a0c37e447 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -159,11 +159,6 @@ PARALLELISATION_ISSUES = [ "ldfc1d_c2d.f90", "tramle.f90", - # These files get the same results when parallelised by: "nvfortran -O1 - # -Kieee -nofma -Mnovect" but had to be excluded by other compiler/flags - # TODO #2787: May solve these issues. - "icedyn_rhg_evp.f90", - "domqco.f90", ] From 5ffaea8f0da6ba50134de88acdd0fef9c6d704e9 Mon Sep 17 00:00:00 2001 From: Aditya Sadawarte Date: Fri, 17 Jan 2025 11:37:10 +0000 Subject: [PATCH 25/32] removed zdftke from direct exclusions (SQRT function) --- examples/nemo/scripts/omp_gpu_trans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index da229b8cca..1598660813 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -61,7 +61,6 @@ "sbcflx.f90", # NEMOv4 sbc_dyc causes NVFORTRAN-S-0083-Vector expression # used where scalar expression required "fldread.f90", # Wrong runtime results - "zdftke.f90", # Wrong results "zdfddm.f90", # Wrong results "zdfiwm.f90", # Wrong results "zdfswm.f90", # fort2 terminated by signal 11 @@ -91,6 +90,7 @@ "traqsr.f90", "ldftra.f90", # Wrong runtime results "geo2ocean.f90", # Uses MATH function calls (ONLY EXCLUDE FOR TESTING) + "zdftke.f90", # Uses MATH function calls (ONLY EXCLUDE FOR TESTING) ] PRIVATISATION_ISSUES = [ From 94b70c49c9f142f73e2e006741455dbb30beec36 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: 
Fri, 17 Jan 2025 12:29:00 +0000 Subject: [PATCH 26/32] Re-introduce dynspg_ts with math issues for full reproducibility --- examples/nemo/scripts/omp_gpu_trans.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index 1598660813..165320a8fb 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -89,8 +89,9 @@ "lbclnk.f90", # Improve performance until #2751 "traqsr.f90", "ldftra.f90", # Wrong runtime results - "geo2ocean.f90", # Uses MATH function calls (ONLY EXCLUDE FOR TESTING) - "zdftke.f90", # Uses MATH function calls (ONLY EXCLUDE FOR TESTING) + "geo2ocean.f90", # Uses MATH function calls (EXCLUDE FOR TESTING #2856) + "dynspg_ts.f90", # Uses MATH function calls (EXCLUDE FOR TESTING #2856) + "zdftke.f90", # Uses MATH function calls (EXCLUDE FOR TESTING #2856) ] PRIVATISATION_ISSUES = [ From 4a10d0fef8c548ef15ccfb51a4c007afc659c29b Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Mon, 20 Jan 2025 12:38:27 +0000 Subject: [PATCH 27/32] Mark the files that have parenthesis that matter for full reproducibility --- examples/nemo/scripts/omp_gpu_trans.py | 3 +-- examples/nemo/scripts/utils.py | 5 ++++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index 165320a8fb..ee61dc90db 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -63,7 +63,7 @@ "fldread.f90", # Wrong runtime results "zdfddm.f90", # Wrong results "zdfiwm.f90", # Wrong results - "zdfswm.f90", # fort2 terminated by signal 11 + "zdfswm.f90", # fort2 terminated by signal 11 ] SKIP_FOR_PERFORMANCE = [ @@ -90,7 +90,6 @@ "traqsr.f90", "ldftra.f90", # Wrong runtime results "geo2ocean.f90", # Uses MATH function calls (EXCLUDE FOR TESTING #2856) - "dynspg_ts.f90", # Uses MATH function calls (EXCLUDE FOR TESTING #2856) "zdftke.f90", 
# Uses MATH function calls (EXCLUDE FOR TESTING #2856) ] diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 9a0c37e447..374609ccea 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -151,7 +151,10 @@ # These files change the results from baseline when psyclone processes them PASSTHROUGH_ISSUES = [ - "ldfslp.f90", # It has a '!dir$ NOVECTOR' that gets deleted by fparser + # TODO #2858: These 3 have parenthesis that matter for full reporducibility + "dynspg_ts.f90", # Uses MATH function calls (EXCLUDE FOR TESTING #2856) + "dynvor.f90", + "ldfslp.f90", ] # These files change the results from the baseline when psyclone adds From ef2345d9d5af70fb5a99f3f7fbdf6ec288487518 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Fri, 24 Jan 2025 11:52:58 +0000 Subject: [PATCH 28/32] Add NEMOv5 ORCA1 test to the integration test --- .github/workflows/nemo_v5_tests.yml | 41 ++++++++++++++++++- .../scripts/KGOs/run.stat.orca1.nvhpc.10steps | 10 +++++ 2 files changed, 49 insertions(+), 2 deletions(-) create mode 100644 examples/nemo/scripts/KGOs/run.stat.orca1.nvhpc.10steps diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 3234d5ee97..54f4dd1d19 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -200,7 +200,7 @@ jobs: ci_test: "NEMOv5 OpenMP for CPU", nemo_version: "NEMOv5", system: "GlaDos", compiler:"gfortran-14" , date: new Date(), elapsed_time: '"${TIME_sec}"'})' - - name: NEMO 5.0 nvidia OpenMP for GPUs (managed memory) + - name: NEMO 5.0 nvidia OpenMP for GPUs (BENCH - managed memory) run: | # Set up environment source /apps/spack/psyclone-spack/spack-repo/share/spack/setup-env.sh @@ -236,5 +236,42 @@ jobs: --password ${{ secrets.MONGODB_PASSWORD }} \ --eval 'db.GitHub_CI.insertOne({branch_name: "'"$GITHUB_REF_NAME"'", commit: "'"$GITHUB_SHA"'", github_job: "'"$GITHUB_RUN_ID"'"-"'"$GITHUB_RUN_ATTEMPT"'", - ci_test: "NEMOv5 OpenMP 
for GPU", nemo_version: "NEMOv5", system: "GlaDos", + ci_test: "NEMOv5 OpenMP for GPU (BENCH)", nemo_version: "NEMOv5", system: "GlaDos", + compiler:"nvhpc-24.5" , date: new Date(), elapsed_time: '"${TIME_sec}"'})' + + - name: NEMO 5.0 nvidia OpenMP for GPUs (UKMO ORCA1 - managed memory) + run: | + # Set up environment + source /apps/spack/psyclone-spack/spack-repo/share/spack/setup-env.sh + spack unload && spack load nemo-build-environment%nvhpc@24.5 + source .runner_venv/bin/activate + export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts + export PSYCLONE_HOME=${PWD}/.runner_venv + export NEMO_DIR=${HOME}/NEMOv5 + export TEST_DIR=ORCA1_OMP_OFFLOAD_NVHPC + + # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS + # We compile at -O1 to permit comparison of the results. + cd $NEMO_DIR + cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm + export FCFLAGS="-i4 -Mr8 -O1 -Kieee -nofma -Mnovect -g -mp=gpu -gpu=managed" + + # Clean up and compile + # Without key_mpi_off it fails to compile (even without psyclone) + ./makenemo -r GOSI10p0.0_like_eORCA1 -m linux_spack -n ${TEST_DIR} clean -y + ./makenemo -r GOSI10p0.0_like_eORCA1 -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ + add_key "key_mpi_off key_nosignedzero" -j 4 -v 1 + + # Run test (disabled because it is currently too slow) + cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 + ./nemo + diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.orca1.nvhpc.10steps run.stat + export TIME_sec=$(grep "local proces" timing.output | head -n 1 | awk '{print $4}' | tr -d s) + ${HOME}/mongosh-2.1.1-linux-x64/bin/mongosh \ + "mongodb+srv://cluster0.x8ncpxi.mongodb.net/PerformanceMonitoring" \ + --quiet --apiVersion 1 --username ${{ secrets.MONGODB_USERNAME }} \ + --password ${{ secrets.MONGODB_PASSWORD }} \ + --eval 'db.GitHub_CI.insertOne({branch_name: "'"$GITHUB_REF_NAME"'", commit: "'"$GITHUB_SHA"'", + github_job: "'"$GITHUB_RUN_ID"'"-"'"$GITHUB_RUN_ATTEMPT"'", + 
ci_test: "NEMOv5 OpenMP for GPU (ORCA1)", nemo_version: "NEMOv5", system: "GlaDos", compiler:"nvhpc-24.5" , date: new Date(), elapsed_time: '"${TIME_sec}"'})' diff --git a/examples/nemo/scripts/KGOs/run.stat.orca1.nvhpc.10steps b/examples/nemo/scripts/KGOs/run.stat.orca1.nvhpc.10steps new file mode 100644 index 0000000000..f354e97b28 --- /dev/null +++ b/examples/nemo/scripts/KGOs/run.stat.orca1.nvhpc.10steps @@ -0,0 +1,10 @@ + it : 1 |ssh|_max: 0.2916211875218087D+01 |U|_max: 0.9981163300069769D+00 |V|_max: 0.1307725327848167D+01 S_min: 0.4662006718575353D+01 S_max: 0.4113683127178150D+02 + it : 2 |ssh|_max: 0.2993687776643656D+01 |U|_max: 0.2222546780479131D+01 |V|_max: 0.2356596334062002D+01 S_min: 0.4676370204123349D+01 S_max: 0.4113676679086335D+02 + it : 3 |ssh|_max: 0.2980914460894113D+01 |U|_max: 0.3209483567667491D+01 |V|_max: 0.2754499501397742D+01 S_min: 0.4689947532502438D+01 S_max: 0.4113668748287516D+02 + it : 4 |ssh|_max: 0.2931506876273280D+01 |U|_max: 0.4144909991457986D+01 |V|_max: 0.2975812579591785D+01 S_min: 0.4699047798840508D+01 S_max: 0.4113662256133599D+02 + it : 5 |ssh|_max: 0.2722453056197598D+01 |U|_max: 0.4767435607510221D+01 |V|_max: 0.3687030466427942D+01 S_min: 0.4706627021626249D+01 S_max: 0.4113654818608597D+02 + it : 6 |ssh|_max: 0.2646796570389801D+01 |U|_max: 0.5207612464372120D+01 |V|_max: 0.3717499845110545D+01 S_min: 0.4713059832415359D+01 S_max: 0.4113648388708570D+02 + it : 7 |ssh|_max: 0.2567591158968874D+01 |U|_max: 0.5134450014160691D+01 |V|_max: 0.3696139950895351D+01 S_min: 0.4718670148787913D+01 S_max: 0.4113641268478863D+02 + it : 8 |ssh|_max: 0.2405462273410292D+01 |U|_max: 0.5097657179968111D+01 |V|_max: 0.3337026635792947D+01 S_min: 0.4723834038866863D+01 S_max: 0.4113635084308528D+02 + it : 9 |ssh|_max: 0.2786981134450641D+01 |U|_max: 0.4730664948689733D+01 |V|_max: 0.3157690707425519D+01 S_min: 0.4727376010231868D+01 S_max: 0.4113628337011737D+02 + it : 10 |ssh|_max: 0.2788273218892431D+01 |U|_max: 
0.4406279755214914D+01 |V|_max: 0.3292925965365973D+01 S_min: 0.4730630269756457D+01 S_max: 0.4113622442765698D+02 From 953d115f16fdbb223b46fdeb63273b8b3af9cf11 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Fri, 24 Jan 2025 13:13:09 +0000 Subject: [PATCH 29/32] Fix some test issues --- .github/workflows/nemo_v5_tests.yml | 2 +- .../tests/psyir/transformations/transformations_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 54f4dd1d19..841f6f3fe7 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -187,7 +187,7 @@ jobs: # Run test cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small_10 namelist_cfg - OMP_NUM_THREADS=4 mpirun -np 1 ./nemo + OMP_NUM_THREADS=4 ./nemo tail run.stat diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.bench.gfortran.small.10steps run.stat export TIME_sec=$(grep "local proces" timing.output | head -n 1 | awk '{print $4}' | tr -d s) diff --git a/src/psyclone/tests/psyir/transformations/transformations_test.py b/src/psyclone/tests/psyir/transformations/transformations_test.py index 3fa4c33f89..ec4d66a42b 100644 --- a/src/psyclone/tests/psyir/transformations/transformations_test.py +++ b/src/psyclone/tests/psyir/transformations/transformations_test.py @@ -367,7 +367,7 @@ def test_omplooptrans_properties(): omplooptrans.omp_directive = "invalid" assert ("The OMPLoopTrans.omp_directive property must be a str with " "the value of ['do', 'paralleldo', 'teamsdistributeparalleldo', " - "'loop'] but found a 'str' with value 'invalid'." + "'teamsloop', 'loop'] but found a 'str' with value 'invalid'." 
in str(err.value)) with pytest.raises(TypeError) as err: From 68b117e09cc38adf82e38d974287f6e3bfd49e02 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 28 Jan 2025 11:48:57 +0000 Subject: [PATCH 30/32] Add more intrinsics to GPUs --- src/psyclone/psyir/nodes/intrinsic_call.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/psyclone/psyir/nodes/intrinsic_call.py b/src/psyclone/psyir/nodes/intrinsic_call.py index f76298a0bb..f63c1c07a5 100644 --- a/src/psyclone/psyir/nodes/intrinsic_call.py +++ b/src/psyclone/psyir/nodes/intrinsic_call.py @@ -798,11 +798,12 @@ def is_available_on_device(self): IntrinsicCall.Intrinsic.SIGN, IntrinsicCall.Intrinsic.SIN, IntrinsicCall.Intrinsic.SINH, IntrinsicCall.Intrinsic.SQRT, IntrinsicCall.Intrinsic.TAN, IntrinsicCall.Intrinsic.TANH, + IntrinsicCall.Intrinsic.UBOUND, IntrinsicCall.Intrinsic.MERGE, # The one below are not documented on nvidia compiler IntrinsicCall.Intrinsic.PRODUCT, IntrinsicCall.Intrinsic.SIZE, IntrinsicCall.Intrinsic.SUM, IntrinsicCall.Intrinsic.LBOUND, - IntrinsicCall.Intrinsic.MERGE, - IntrinsicCall.Intrinsic.UBOUND) + IntrinsicCall.Intrinsic.MAXVAL, IntrinsicCall.Intrinsic.MINVAL, + IntrinsicCall.Intrinsic.TINY, IntrinsicCall.Intrinsic.HUGE) @classmethod def create(cls, intrinsic, arguments=()): From 0bfa065c3b798e0d1d9a0740d04a9fce124e4613 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 28 Jan 2025 16:18:46 +0000 Subject: [PATCH 31/32] Update NEMO CI tests --- .github/workflows/nemo_v5_tests.yml | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 841f6f3fe7..847f32d62a 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -45,6 +45,8 @@ jobs: run_if_on_mirror: if: ${{ github.repository == 'stfc/PSyclone-mirror' }} runs-on: self-hosted + env: + NEMODIR_NAME: NEMOv5_Jan25 steps: - uses: actions/checkout@v3 @@ -80,7 
+82,7 @@ jobs: source .runner_venv/bin/activate export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PSYCLONE_HOME=${PWD}/.runner_venv - export NEMO_DIR=${HOME}/NEMOv5 + export NEMO_DIR=${HOME}/${NEMODIR_NAME} export TEST_DIR=BENCH_PASSTHROUGH_GCC # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS @@ -110,13 +112,13 @@ jobs: source .runner_venv/bin/activate export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PSYCLONE_HOME=${PWD}/.runner_venv - export NEMO_DIR=${HOME}/NEMOv5 + export NEMO_DIR=${HOME}/${NEMODIR_NAME} export TEST_DIR=BENCH_PASSTHROUGH_NVHPC # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm - export FCFLAGS="-i4 -Mr8 -O1 -Kieee -nofma -Mnovect" + export FCFLAGS="-i4 -Mr8 -O1 -nofma -Mnovect" # Clean up and compile # Without key_mpi_off it fails to compile (even without psyclone) @@ -142,7 +144,7 @@ jobs: source .runner_venv/bin/activate export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PSYCLONE_HOME=${PWD}/.runner_venv - export NEMO_DIR=${HOME}/NEMOv5 + export NEMO_DIR=${HOME}/${NEMODIR_NAME} export TEST_DIR=BENCH_PASSTHROUGH_ONEAPI # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS @@ -170,7 +172,7 @@ jobs: source .runner_venv/bin/activate export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PSYCLONE_HOME=${PWD}/.runner_venv - export NEMO_DIR=${HOME}/NEMOv5 + export NEMO_DIR=${HOME}/${NEMODIR_NAME} export TEST_DIR=BENCH_OMP_THREADING_GCC # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS @@ -187,7 +189,7 @@ jobs: # Run test cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small_10 namelist_cfg - OMP_NUM_THREADS=4 ./nemo + OMP_NUM_THREADS=4 mpirun -np 1 ./nemo tail run.stat diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.bench.gfortran.small.10steps run.stat 
export TIME_sec=$(grep "local proces" timing.output | head -n 1 | awk '{print $4}' | tr -d s) @@ -208,14 +210,14 @@ jobs: source .runner_venv/bin/activate export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PSYCLONE_HOME=${PWD}/.runner_venv - export NEMO_DIR=${HOME}/NEMOv5 + export NEMO_DIR=${HOME}/${NEMODIR_NAME} export TEST_DIR=BENCH_OMP_OFFLOAD_NVHPC # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS # We compile at -O1 to permit comparison of the results. cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm - export FCFLAGS="-i4 -Mr8 -O1 -Kieee -nofma -Mnovect -g -mp=gpu -gpu=managed" + export FCFLAGS="-i4 -Mr8 -O1 -nofma -Mnovect -g -mp=gpu -gpu=managed,math_uniform" # Clean up and compile # Without key_mpi_off it fails to compile (even without psyclone) @@ -247,14 +249,14 @@ jobs: source .runner_venv/bin/activate export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PSYCLONE_HOME=${PWD}/.runner_venv - export NEMO_DIR=${HOME}/NEMOv5 + export NEMO_DIR=${HOME}/${NEMODIR_NAME} export TEST_DIR=ORCA1_OMP_OFFLOAD_NVHPC # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS - # We compile at -O1 to permit comparison of the results. + # We compile at "-O1 -nofma -Mnovect -gpu=math_uniform" to permit comparison of the results. 
cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm - export FCFLAGS="-i4 -Mr8 -O1 -Kieee -nofma -Mnovect -g -mp=gpu -gpu=managed" + export FCFLAGS="-i4 -Mr8 -O1 -nofma -Mnovect -g -mp=gpu -gpu=managed,math_uniform" # Clean up and compile # Without key_mpi_off it fails to compile (even without psyclone) From cad20f3e2b880deeec1476d1874b1003e4899f04 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 28 Jan 2025 16:22:27 +0000 Subject: [PATCH 32/32] Fix test for intrinsics available on GPU --- src/psyclone/tests/psyir/nodes/intrinsic_call_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py b/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py index 4d1639902a..71b45adad2 100644 --- a/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py +++ b/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py @@ -118,7 +118,7 @@ def test_intrinsiccall_is_inquiry(): (IntrinsicCall.Intrinsic.ABS, True), (IntrinsicCall.Intrinsic.MIN, True), (IntrinsicCall.Intrinsic.MAX, True), - (IntrinsicCall.Intrinsic.MAXVAL, False), + (IntrinsicCall.Intrinsic.MAXVAL, True), (IntrinsicCall.Intrinsic.ALLOCATE, False), (IntrinsicCall.Intrinsic.MATMUL, False), (IntrinsicCall.Intrinsic.ACOS, True),