diff --git a/source/source_esolver/esolver_ks_lcao_tddft.cpp b/source/source_esolver/esolver_ks_lcao_tddft.cpp index b273304ffe..8a0035681b 100644 --- a/source/source_esolver/esolver_ks_lcao_tddft.cpp +++ b/source/source_esolver/esolver_ks_lcao_tddft.cpp @@ -1,9 +1,9 @@ #include "esolver_ks_lcao_tddft.h" //----------------IO----------------- -#include "source_io/module_dipole/dipole_io.h" #include "source_io/module_ctrl/ctrl_output_td.h" #include "source_io/module_current/td_current_io.h" +#include "source_io/module_dipole/dipole_io.h" #include "source_io/module_output/output_log.h" #include "source_io/module_wf/read_wfc_nao.h" //------LCAO HSolver ElecState------- @@ -425,32 +425,51 @@ void ESolver_KS_LCAO_TDDFT::store_h_s_psi(UnitCell& ucell, // Store H and S matrices to Hk_laststep and Sk_laststep if (use_tensor && use_lapack) { - // Gather H and S matrices to root process #ifdef __MPI int myid = 0; int num_procs = 1; MPI_Comm_rank(MPI_COMM_WORLD, &myid); MPI_Comm_size(MPI_COMM_WORLD, &num_procs); - // Global matrix structure + std::complex* h_ptr = nullptr; + std::complex* s_ptr = nullptr; + + // Define containers for gathered data (only needed for multi-process) module_rt::Matrix_g> h_mat_g; module_rt::Matrix_g> s_mat_g; - // Collect H matrix - module_rt::gatherMatrix(myid, 0, h_mat, h_mat_g); - BlasConnector::copy(len_HS_ik, - h_mat_g.p.get(), - 1, - this->Hk_laststep.template data>() + ik * len_HS_ik, - 1); + if (num_procs == 1) + { + // Single process: directly point to local data without gather + h_ptr = h_mat.p; + s_ptr = s_mat.p; + } + else + { + // Multiple processes: gather data to the root process (myid == 0) and point to the gathered data + module_rt::gatherMatrix(myid, 0, h_mat, h_mat_g); + module_rt::gatherMatrix(myid, 0, s_mat, s_mat_g); + if (myid == 0) + { + h_ptr = h_mat_g.p.get(); + s_ptr = s_mat_g.p.get(); + } + } - // Collect S matrix - module_rt::gatherMatrix(myid, 0, s_mat, s_mat_g); - BlasConnector::copy(len_HS_ik, - s_mat_g.p.get(), - 1, - this->Sk_laststep.template data>() + ik * len_HS_ik, - 1); + // Only the root process (myid == 0) performs the copy + if (myid == 0 && h_ptr != nullptr && s_ptr != nullptr) + { + BlasConnector::copy(len_HS_ik, + h_ptr, + 1, + this->Hk_laststep.template data>() + ik * len_HS_ik, + 1); + BlasConnector::copy(len_HS_ik, + s_ptr, + 1, + this->Sk_laststep.template data>() + ik * len_HS_ik, + 1); + } #endif } else diff --git a/source/source_estate/module_dm/cal_edm_tddft.cpp b/source/source_estate/module_dm/cal_edm_tddft.cpp index bd33477f97..3196283c6b 100644 --- a/source/source_estate/module_dm/cal_edm_tddft.cpp +++ b/source/source_estate/module_dm/cal_edm_tddft.cpp @@ -60,7 +60,7 @@ void cal_edm_tddft(Parallel_Orbitals& pv, hamilt::Hamilt>* p_hamilt) { ModuleBase::TITLE("elecstate", "cal_edm_tddft"); - ModuleBase::timer::tick("elecstate", "cal_edm_tddft"); + ModuleBase::timer::tick("TD_Efficiency", "cal_edm_tddft"); const int nlocal = PARAM.globalv.nlocal; assert(nlocal >= 0); @@ -303,7 +303,7 @@ void cal_edm_tddft(Parallel_Orbitals& pv, #endif } // end ik - ModuleBase::timer::tick("elecstate", "cal_edm_tddft"); + ModuleBase::timer::tick("TD_Efficiency", "cal_edm_tddft"); return; } // cal_edm_tddft @@ -313,7 +313,7 @@ void cal_edm_tddft_tensor(Parallel_Orbitals& pv, hamilt::Hamilt>* p_hamilt) { ModuleBase::TITLE("elecstate", "cal_edm_tddft_tensor"); - ModuleBase::timer::tick("elecstate", "cal_edm_tddft_tensor"); + ModuleBase::timer::tick("TD_Efficiency", "cal_edm_tddft"); const int nlocal = PARAM.globalv.nlocal; assert(nlocal >= 0); @@ -532,7 +532,7 @@ void cal_edm_tddft_tensor(Parallel_Orbitals& pv, ModuleBase::WARNING_QUIT("elecstate::cal_edm_tddft_tensor", "MPI is required for this function!"); #endif } // end ik - ModuleBase::timer::tick("elecstate", "cal_edm_tddft_tensor"); + ModuleBase::timer::tick("TD_Efficiency", "cal_edm_tddft"); return; } // cal_edm_tddft_tensor @@ -544,7 +544,7 @@ void cal_edm_tddft_tensor_lapack(Parallel_Orbitals& pv, hamilt::Hamilt>* p_hamilt) { ModuleBase::TITLE("elecstate", "cal_edm_tddft_tensor_lapack"); - ModuleBase::timer::tick("elecstate", "cal_edm_tddft_tensor_lapack"); + ModuleBase::timer::tick("TD_Efficiency", "cal_edm_tddft"); const int nlocal = PARAM.globalv.nlocal; assert(nlocal >= 0); @@ -555,6 +555,12 @@ void cal_edm_tddft_tensor_lapack(Parallel_Orbitals& pv, // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU using ct_Device = typename ct::PsiToContainer::type; + // Memory operations + using syncmem_complex_h2d_op + = base_device::memory::synchronize_memory_op, Device, base_device::DEVICE_CPU>; + using syncmem_complex_d2h_op + = base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, Device>; + #if ((defined __CUDA) /* || (defined __ROCM) */) if (ct_device_type == ct::DeviceType::GpuDevice) { @@ -573,209 +579,214 @@ void cal_edm_tddft_tensor_lapack(Parallel_Orbitals& pv, #ifdef __MPI int myid = 0; const int root_proc = 0; + int num_procs = 1; MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + // 1. Prepare Data Source Pointers (Host) + // If np = 1, point directly to local data to avoid copy + // If np > 1, gather data and point to the gathered buffer + std::complex* h_src = nullptr; + std::complex* s_src = nullptr; + std::complex* dmk_src = nullptr; - // Gather local H, S, and DMK matrices to global matrices on root process + // Global containers (Used only when num_procs > 1) + module_rt::Matrix_g> h_mat_global, s_mat_global, dmk_global, edm_global; + + // Get Local Matrices hamilt::MatrixBlock> h_mat_local, s_mat_local; p_hamilt->matrix(h_mat_local, s_mat_local); - module_rt::Matrix_g> h_mat_global, s_mat_global, dmk_global; - module_rt::gatherMatrix(myid, root_proc, h_mat_local, h_mat_global); - module_rt::gatherMatrix(myid, root_proc, s_mat_local, s_mat_global); - - // Create a temporary MatrixBlock for local DMK - hamilt::MatrixBlock> dmk_local_block; - dmk_local_block.p = tmp_dmk_local; - dmk_local_block.desc = pv.desc; - module_rt::gatherMatrix(myid, root_proc, dmk_local_block, dmk_global); - - // Declare and allocate global EDM matrix on ALL processes, prepare for distribution in the end - module_rt::Matrix_g> edm_global; - edm_global.p.reset(new std::complex[nlocal * nlocal]); - edm_global.row = nlocal; - edm_global.col = nlocal; - // Set the descriptor of the global EDM matrix - edm_global.desc.reset(new int[9]{1, pv.desc[1], nlocal, nlocal, nlocal, nlocal, 0, 0, nlocal}); - - // Only root process performs the global calculation + if (num_procs == 1) + { + // Optimization: Direct access for single process + h_src = h_mat_local.p; + s_src = s_mat_local.p; + dmk_src = tmp_dmk_local; + } + else + { + // Standard Gather Logic for multi-process + module_rt::gatherMatrix(myid, root_proc, h_mat_local, h_mat_global); + module_rt::gatherMatrix(myid, root_proc, s_mat_local, s_mat_global); + + hamilt::MatrixBlock> dmk_local_block; + dmk_local_block.p = tmp_dmk_local; + dmk_local_block.desc = pv.desc; + module_rt::gatherMatrix(myid, root_proc, dmk_local_block, dmk_global); + + if (myid == root_proc) + { + h_src = h_mat_global.p.get(); + s_src = s_mat_global.p.get(); + dmk_src = dmk_global.p.get(); + } + } + + // 2. GPU Calculation (on Rank 0) if (myid == root_proc) { - ct::Tensor Htmp_global(ct::DataType::DT_COMPLEX_DOUBLE, - ct::DeviceType::CpuDevice, - ct::TensorShape({nlocal, nlocal})); - ct::Tensor S_global(ct::DataType::DT_COMPLEX_DOUBLE, - ct::DeviceType::CpuDevice, - ct::TensorShape({nlocal, nlocal})); - ct::Tensor DMK_global(ct::DataType::DT_COMPLEX_DOUBLE, - ct::DeviceType::CpuDevice, - ct::TensorShape({nlocal, nlocal})); - - // Copy gathered data into tensors - BlasConnector::copy(nlocal * nlocal, - h_mat_global.p.get(), - 1, - Htmp_global.template data>(), - 1); - BlasConnector::copy(nlocal * nlocal, - s_mat_global.p.get(), - 1, - S_global.template data>(), - 1); - BlasConnector::copy(nlocal * nlocal, - dmk_global.p.get(), - 1, - DMK_global.template data>(), - 1); - - // Move tensors to the target device (CPU or GPU) - ct::Tensor Htmp_global_dev = Htmp_global.to_device(); - ct::Tensor S_global_dev = S_global.to_device(); - ct::Tensor DMK_global_dev = DMK_global.to_device(); + ct::Tensor H_dev, S_dev, DMK_dev, ipiv_dev; - // --- Calculate S^-1 using getrf + getrs --- - ct::Tensor ipiv(ct::DataType::DT_INT, ct_device_type, ct::TensorShape({nlocal})); - ipiv.zero(); + // Allocate and Copy (H2D) + H_dev = ct::Tensor(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal, nlocal})); + syncmem_complex_h2d_op()(H_dev.template data>(), h_src, nlocal * nlocal); + + S_dev = ct::Tensor(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal, nlocal})); + syncmem_complex_h2d_op()(S_dev.template data>(), s_src, nlocal * nlocal); + + DMK_dev = ct::Tensor(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal, nlocal})); + syncmem_complex_h2d_op()(DMK_dev.template data>(), dmk_src, nlocal * nlocal); + + ipiv_dev = ct::Tensor(ct::DataType::DT_INT, ct_device_type, ct::TensorShape({nlocal})); + ipiv_dev.zero(); + // --- Calculate S^-1 using getrf + getrs --- // 1. LU decomposition S = P * L * U - ct::kernels::lapack_getrf, ct_Device>()( - nlocal, - nlocal, - S_global_dev.template data>(), - nlocal, - ipiv.template data()); - - // 2. Solve S * Sinv = I for Sinv using the LU decomposition - // Create identity matrix as RHS - auto Sinv_global = module_rt::create_identity_matrix>(nlocal, ct_device_type); - - ct::kernels::lapack_getrs, ct_Device>()( - 'N', - nlocal, - nlocal, - S_global_dev.template data>(), - nlocal, - ipiv.template data(), - Sinv_global.template data>(), - nlocal); - - // --- EDM Calculation using BLAS on global tensors --- - // tmp1 = Htmp * Sinv - ct::Tensor tmp1_global_tensor(ct::DataType::DT_COMPLEX_DOUBLE, - ct_device_type, - ct::TensorShape({nlocal, nlocal})); - tmp1_global_tensor.zero(); - std::complex one_complex = {1.0, 0.0}; - std::complex zero_complex = {0.0, 0.0}; - ct::kernels::blas_gemm, ct_Device>()( - 'N', - 'N', - nlocal, - nlocal, - nlocal, - &one_complex, - Htmp_global_dev.template data>(), - nlocal, - Sinv_global.template data>(), - nlocal, - &zero_complex, - tmp1_global_tensor.template data>(), - nlocal); - - // tmp2 = tmp1^T * tmp_dmk - ct::Tensor tmp2_global_tensor(ct::DataType::DT_COMPLEX_DOUBLE, - ct_device_type, - ct::TensorShape({nlocal, nlocal})); - tmp2_global_tensor.zero(); - ct::kernels::blas_gemm, ct_Device>()( - 'T', - 'N', - nlocal, - nlocal, - nlocal, - &one_complex, - tmp1_global_tensor.template data>(), - nlocal, - DMK_global_dev.template data>(), - nlocal, - &zero_complex, - tmp2_global_tensor.template data>(), - nlocal); - - // tmp3 = Sinv * Htmp - ct::Tensor tmp3_global_tensor(ct::DataType::DT_COMPLEX_DOUBLE, - ct_device_type, - ct::TensorShape({nlocal, nlocal})); - tmp3_global_tensor.zero(); - ct::kernels::blas_gemm, ct_Device>()( - 'N', - 'N', - nlocal, - nlocal, - nlocal, - &one_complex, - Sinv_global.template data>(), - nlocal, - Htmp_global_dev.template data>(), - nlocal, - &zero_complex, - tmp3_global_tensor.template data>(), - nlocal); - - // tmp4 = tmp_dmk * tmp3^T - ct::Tensor tmp4_global_tensor(ct::DataType::DT_COMPLEX_DOUBLE, - ct_device_type, - ct::TensorShape({nlocal, nlocal})); - tmp4_global_tensor.zero(); - ct::kernels::blas_gemm, ct_Device>()( - 'N', - 'T', - nlocal, - nlocal, - nlocal, - &one_complex, - DMK_global_dev.template data>(), - nlocal, - tmp3_global_tensor.template data>(), - nlocal, - &zero_complex, - tmp4_global_tensor.template data>(), - nlocal); + ct::kernels::lapack_getrf, ct_Device>()(nlocal, + nlocal, + S_dev.template data>(), + nlocal, + ipiv_dev.template data()); + + // 2. Solve S * Sinv = I + auto Sinv_dev = module_rt::create_identity_matrix>(nlocal, ct_device_type); + + ct::kernels::lapack_getrs, ct_Device>()('N', + nlocal, + nlocal, + S_dev.template data>(), + nlocal, + ipiv_dev.template data(), + Sinv_dev.template data>(), + nlocal); + + // --- EDM Calculation --- + std::complex one = {1.0, 0.0}; + std::complex zero = {0.0, 0.0}; + + // tmp1 = H * Sinv + ct::Tensor tmp1_dev(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal, nlocal})); + ct::kernels::blas_gemm, ct_Device>()('N', + 'N', + nlocal, + nlocal, + nlocal, + &one, + H_dev.template data>(), + nlocal, + Sinv_dev.template data>(), + nlocal, + &zero, + tmp1_dev.template data>(), + nlocal); + + // tmp2 = tmp1^T * DMK + ct::Tensor tmp2_dev(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal, nlocal})); + ct::kernels::blas_gemm, ct_Device>()('T', + 'N', + nlocal, + nlocal, + nlocal, + &one, + tmp1_dev.template data>(), + nlocal, + DMK_dev.template data>(), + nlocal, + &zero, + tmp2_dev.template data>(), + nlocal); + + // tmp3 = Sinv * H + ct::Tensor tmp3_dev(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal, nlocal})); + ct::kernels::blas_gemm, ct_Device>()('N', + 'N', + nlocal, + nlocal, + nlocal, + &one, + Sinv_dev.template data>(), + nlocal, + H_dev.template data>(), + nlocal, + &zero, + tmp3_dev.template data>(), + nlocal); + + // tmp4 = DMK * tmp3^T + ct::Tensor tmp4_dev(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal, nlocal})); + ct::kernels::blas_gemm, ct_Device>()('N', + 'T', + nlocal, + nlocal, + nlocal, + &one, + DMK_dev.template data>(), + nlocal, + tmp3_dev.template data>(), + nlocal, + &zero, + tmp4_dev.template data>(), + nlocal); // tmp4 = tmp2 + tmp4 - ct::kernels::blas_axpy, ct_Device>()( - nlocal * nlocal, - &one_complex, - tmp2_global_tensor.template data>(), - 1, - tmp4_global_tensor.template data>(), - 1); + ct::kernels::blas_axpy, ct_Device>()(nlocal * nlocal, + &one, + tmp2_dev.template data>(), + 1, + tmp4_dev.template data>(), + 1); // tmp4 = 0.5 * tmp4 - std::complex half_complex = {0.5, 0.0}; - ct::kernels::blas_scal, ct_Device>()( - nlocal * nlocal, - &half_complex, - tmp4_global_tensor.template data>(), - 1); - - // Copy result from device tensor back to CPU buffer for distribution - ct::Tensor tmp4_global_tensor_cpu = tmp4_global_tensor.to_device(); - BlasConnector::copy(nlocal * nlocal, - tmp4_global_tensor_cpu.template data>(), - 1, - edm_global.p.get(), - 1); + std::complex half = {0.5, 0.0}; + ct::kernels::blas_scal, ct_Device>()(nlocal * nlocal, + &half, + tmp4_dev.template data>(), + 1); + + // 3. Retrieve Result (D2H) + std::complex* edm_dest = nullptr; + + if (num_procs == 1) + { + // Directly copy to target local matrix + tmp_edmk.create(pv.ncol, pv.nrow); + edm_dest = tmp_edmk.c; + } + else + { + // Wait to set up edm_dest after allocating global buffer + if (myid == root_proc && edm_global.p == nullptr) + { + edm_global.p.reset(new std::complex[nlocal * nlocal]); + } + edm_dest = edm_global.p.get(); + } + + if (num_procs == 1 || myid == root_proc) + { + syncmem_complex_d2h_op()(edm_dest, tmp4_dev.template data>(), nlocal * nlocal); + } } - // --- Distribute the globally computed EDM matrix back to distributed form --- - tmp_edmk.create(pv.ncol, pv.nrow); + // 4. Distribute (Only needed if num_procs > 1) + if (num_procs > 1) + { + if (edm_global.p == nullptr) + { + edm_global.p.reset(new std::complex[nlocal * nlocal]); + } - hamilt::MatrixBlock> edm_local_block; - edm_local_block.p = tmp_edmk.c; - edm_local_block.desc = pv.desc; + edm_global.row = nlocal; + edm_global.col = nlocal; + edm_global.desc.reset(new int[9]{1, pv.desc[1], nlocal, nlocal, nlocal, nlocal, 0, 0, nlocal}); - // Distribute edm_global to all processes' local blocks - module_rt::distributeMatrix(edm_local_block, edm_global); + tmp_edmk.create(pv.ncol, pv.nrow); + hamilt::MatrixBlock> edm_local_block; + edm_local_block.p = tmp_edmk.c; + edm_local_block.desc = pv.desc; + module_rt::distributeMatrix(edm_local_block, edm_global); + } #else ModuleBase::WARNING_QUIT("elecstate::cal_edm_tddft_tensor_lapack", "MPI is required for this function!"); #endif // __MPI @@ -790,7 +801,7 @@ void cal_edm_tddft_tensor_lapack(Parallel_Orbitals& pv, } #endif // __CUDA - ModuleBase::timer::tick("elecstate", "cal_edm_tddft_tensor_lapack"); + ModuleBase::timer::tick("TD_Efficiency", "cal_edm_tddft"); return; } // cal_edm_tddft_tensor_lapack diff --git a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp index 278b270410..90c1970120 100644 --- a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp +++ b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp @@ -169,6 +169,7 @@ void hamilt::TDNonlocal>::calculate_HR() if (use_gpu) { + ModuleBase::timer::tick("TD_Efficiency", "snap_psibeta"); #ifdef __CUDA // GPU path: Atom-level GPU batch processing module_rt::gpu::snap_psibeta_atom_batch_gpu(orb_, @@ -183,9 +184,11 @@ void hamilt::TDNonlocal>::calculate_HR() nlm_dim, nlm_tot); #endif + ModuleBase::timer::tick("TD_Efficiency", "snap_psibeta"); } else { + ModuleBase::timer::tick("TD_Efficiency", "snap_psibeta"); // CPU path: OpenMP parallel over neighbors to compute nlm_tot #pragma omp parallel for schedule(dynamic) for (int ad = 0; ad < adjs.adj_num + 1; ++ad) @@ -224,6 +227,7 @@ void hamilt::TDNonlocal>::calculate_HR() } } } + ModuleBase::timer::tick("TD_Efficiency", "snap_psibeta"); } // 2. calculate D for each pair of atoms diff --git a/source/source_lcao/module_rt/evolve_elec.cpp b/source/source_lcao/module_rt/evolve_elec.cpp index b957d193ea..6d6eebc2ff 100644 --- a/source/source_lcao/module_rt/evolve_elec.cpp +++ b/source/source_lcao/module_rt/evolve_elec.cpp @@ -94,27 +94,48 @@ void Evolve_elec::solve_psi(const int& istep, module_rt::Matrix_g> psi_g; module_rt::Matrix_g> psi_laststep_g; + // Prepare host pointers for psi and psi_laststep + std::complex* p_psi_host = nullptr; + std::complex* p_psi_last_host = nullptr; + if (use_lapack) { - // Need to gather the psi to the root process on CPU - // H_laststep and S_laststep are already gathered in esolver_ks_lcao_tddft.cpp #ifdef __MPI - // Access the rank of the calling process in the communicator int myid = 0; const int root_proc = 0; + int num_procs = 1; MPI_Comm_rank(MPI_COMM_WORLD, &myid); - - // Gather psi to the root process - gatherPsi(myid, root_proc, psi[0].get_pointer(), para_orb, psi_g); - gatherPsi(myid, root_proc, psi_laststep[0].get_pointer(), para_orb, psi_laststep_g); - - // Syncronize data from CPU to Device - syncmem_complex_h2d_op()(psi_k_tensor.data>(), - psi_g.p.get(), - len_psi_k_1 * len_psi_k_2); - syncmem_complex_h2d_op()(psi_k_laststep_tensor.data>(), - psi_laststep_g.p.get(), - len_psi_k_1 * len_psi_k_2); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + if (num_procs == 1) + { + // Single process: directly point to local data without gather + p_psi_host = psi[0].get_pointer(); + p_psi_last_host = psi_laststep[0].get_pointer(); + } + else + { + // Multiple processes: gather data to the root process (myid == 0) and point to the gathered data + gatherPsi(myid, root_proc, psi[0].get_pointer(), para_orb, psi_g); + gatherPsi(myid, root_proc, psi_laststep[0].get_pointer(), para_orb, psi_laststep_g); + + if (myid == root_proc) + { + p_psi_host = psi_g.p.get(); + p_psi_last_host = psi_laststep_g.p.get(); + } + } + + // Only the root process (myid == 0) performs the copy + if (myid == root_proc) + { + syncmem_complex_h2d_op()(psi_k_tensor.data>(), + p_psi_host, + len_psi_k_1 * len_psi_k_2); + syncmem_complex_h2d_op()(psi_k_laststep_tensor.data>(), + p_psi_last_host, + len_psi_k_1 * len_psi_k_2); + } #endif } else @@ -157,17 +178,27 @@ void Evolve_elec::solve_psi(const int& istep, if (use_lapack) { #ifdef __MPI - // Syncronize data from Device to CPU - syncmem_complex_d2h_op()(psi_g.p.get(), - psi_k_tensor.data>(), - len_psi_k_1 * len_psi_k_2); - syncmem_complex_d2h_op()(psi_laststep_g.p.get(), - psi_k_laststep_tensor.data>(), - len_psi_k_1 * len_psi_k_2); - - // Distribute psi to all processes - distributePsi(para_orb, psi[0].get_pointer(), psi_g); - distributePsi(para_orb, psi_laststep[0].get_pointer(), psi_laststep_g); + int myid = 0; + int num_procs = 1; + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + if (myid == 0) + { + syncmem_complex_d2h_op()(p_psi_host, + psi_k_tensor.data>(), + len_psi_k_1 * len_psi_k_2); + syncmem_complex_d2h_op()(p_psi_last_host, + psi_k_laststep_tensor.data>(), + len_psi_k_1 * len_psi_k_2); + } + + // If it's multi-process, distribute back; if it's single-process, the data is already in psi[0] + if (num_procs > 1) + { + distributePsi(para_orb, psi[0].get_pointer(), psi_g); + distributePsi(para_orb, psi_laststep[0].get_pointer(), psi_laststep_g); + } #endif } else diff --git a/source/source_lcao/module_rt/evolve_psi.cpp b/source/source_lcao/module_rt/evolve_psi.cpp index bd6eaef793..5a2f116fcd 100644 --- a/source/source_lcao/module_rt/evolve_psi.cpp +++ b/source/source_lcao/module_rt/evolve_psi.cpp @@ -155,83 +155,92 @@ void evolve_psi_tensor(const int nband, hamilt::MatrixBlock> h_mat, s_mat; p_hamilt->matrix(h_mat, s_mat); - ModuleBase::timer::tick("TD_Efficiency", "host_device_comm"); + int myid = 0; + int num_procs = 1; + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + const int root_proc = 0; - // Create Tensor objects for temporary data and sync from host to device - const int len_HS = use_lapack ? nlocal * nlocal : pv->nloc; - ct::Tensor Stmp(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS})); - ct::Tensor Htmp(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS})); - ct::Tensor Hold(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS})); + std::complex* h_src = nullptr; + std::complex* s_src = nullptr; + + module_rt::Matrix_g> h_mat_g, s_mat_g; if (use_lapack) { - // Need to gather H and S matrix to root process here - int myid = 0; - int num_procs = 1; - MPI_Comm_rank(MPI_COMM_WORLD, &myid); - MPI_Comm_size(MPI_COMM_WORLD, &num_procs); - - module_rt::Matrix_g> h_mat_g, s_mat_g; // Global matrix structure - - // Collect H matrix - module_rt::gatherMatrix(myid, 0, h_mat, h_mat_g); - syncmem_complex_h2d_op()(Htmp.data>(), h_mat_g.p.get(), len_HS); - syncmem_complex_h2d_op()(Hold.data>(), h_mat_g.p.get(), len_HS); - - // Collect S matrix - module_rt::gatherMatrix(myid, 0, s_mat, s_mat_g); - syncmem_complex_h2d_op()(Stmp.data>(), s_mat_g.p.get(), len_HS); + if (num_procs == 1) + { + h_src = h_mat.p; + s_src = s_mat.p; + } + else + { + module_rt::gatherMatrix(myid, 0, h_mat, h_mat_g); + module_rt::gatherMatrix(myid, 0, s_mat, s_mat_g); + if (myid == root_proc) + { + h_src = h_mat_g.p.get(); + s_src = s_mat_g.p.get(); + } + } } else { - // Original code - syncmem_complex_h2d_op()(Stmp.data>(), s_mat.p, len_HS); - syncmem_complex_h2d_op()(Htmp.data>(), h_mat.p, len_HS); - syncmem_complex_h2d_op()(Hold.data>(), h_mat.p, len_HS); + h_src = h_mat.p; + s_src = s_mat.p; } - ModuleBase::timer::tick("TD_Efficiency", "host_device_comm"); + const int len_HS = use_lapack ? nlocal * nlocal : pv->nloc; - ct::Tensor U_operator(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS})); - U_operator.zero(); + ct::Tensor Stmp(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS})); - int myid = 0; - int root_proc = 0; - MPI_Comm_rank(MPI_COMM_WORLD, &myid); + if (s_src != nullptr) + { + if (!use_lapack || myid == root_proc) + { + ModuleBase::timer::tick("TD_Efficiency", "host_device_comm"); + syncmem_complex_h2d_op()(Stmp.data>(), s_src, len_HS); + ModuleBase::timer::tick("TD_Efficiency", "host_device_comm"); + } + } - // (1)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + ct::Tensor Htmp(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS})); - /// @brief compute H(t+dt/2) - /// @input H_laststep, Htmp, print_matrix - /// @output Htmp + if (h_src != nullptr) + { + if (!use_lapack || myid == root_proc) + { + ModuleBase::timer::tick("TD_Efficiency", "host_device_comm"); + syncmem_complex_h2d_op()(Htmp.data>(), h_src, len_HS); + ModuleBase::timer::tick("TD_Efficiency", "host_device_comm"); + } + } + + // (1) Compute H(t+dt/2) if (propagator != 2) { if (!use_lapack) { half_Hmatrix_tensor(pv, nband, nlocal, Htmp, Stmp, H_laststep, S_laststep, ofs_running, print_matrix); } - else + else if (myid == root_proc) { - if (myid == root_proc) - { - half_Hmatrix_tensor_lapack(pv, - nband, - nlocal, - Htmp, - Stmp, - H_laststep, - S_laststep, - ofs_running, - print_matrix); - } + half_Hmatrix_tensor_lapack(pv, + nband, + nlocal, + Htmp, + Stmp, + H_laststep, + S_laststep, + ofs_running, + print_matrix); } } - // (2)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + ct::Tensor U_operator(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS})); + U_operator.zero(); - /// @brief compute U_operator - /// @input Stmp, Htmp, print_matrix - /// @output U_operator + // (2) Compute U_operator Propagator prop(propagator, pv, PARAM.inp.td_dt); prop.compute_propagator_tensor(nlocal, Stmp, @@ -242,55 +251,47 @@ void evolve_psi_tensor(const int nband, print_matrix, use_lapack); - // (3)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - - /// @brief apply U_operator to the wave function of the previous step for new wave function - /// @input U_operator, psi_k_laststep, print_matrix - /// @output psi_k + // (3) Apply U_operator (psi_k = U * psi_last) if (!use_lapack) { upsi_tensor(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, ofs_running, print_matrix); } - else + else if (myid == root_proc) { - if (myid == root_proc) - { - upsi_tensor_lapack(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, ofs_running, print_matrix); - } + upsi_tensor_lapack(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, ofs_running, print_matrix); } - // (4)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - - /// @brief normalize psi_k - /// @input Stmp, psi_not_norm, psi_k, print_matrix - /// @output psi_k + // (4) Normalize psi_k if (!use_lapack) { norm_psi_tensor(pv, nband, nlocal, Stmp, psi_k, ofs_running, print_matrix); } - else + else if (myid == root_proc) + { + norm_psi_tensor_lapack(pv, nband, nlocal, Stmp, psi_k, ofs_running, print_matrix); + } + + // (5) Compute ekb + ct::Tensor Hold(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS})); + + // Resync H matrix + if (h_src != nullptr) { - if (myid == root_proc) + if (!use_lapack || myid == root_proc) { - norm_psi_tensor_lapack(pv, nband, nlocal, Stmp, psi_k, ofs_running, print_matrix); + ModuleBase::timer::tick("TD_Efficiency", "host_device_comm"); + syncmem_complex_h2d_op()(Hold.data>(), h_src, len_HS); + ModuleBase::timer::tick("TD_Efficiency", "host_device_comm"); } } - // (5)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - - /// @brief compute ekb - /// @input Htmp, psi_k - /// @output ekb if (!use_lapack) { compute_ekb_tensor(pv, nband, nlocal, Hold, psi_k, ekb, ofs_running); } - else + else if (myid == root_proc) { - if (myid == root_proc) - { - compute_ekb_tensor_lapack(pv, nband, nlocal, Hold, psi_k, ekb, ofs_running); - } + compute_ekb_tensor_lapack(pv, nband, nlocal, Hold, psi_k, ekb, ofs_running); } #endif // __MPI diff --git a/source/source_lcao/module_rt/middle_hamilt.cpp b/source/source_lcao/module_rt/middle_hamilt.cpp index 9706afa558..10f432fbe9 100644 --- a/source/source_lcao/module_rt/middle_hamilt.cpp +++ b/source/source_lcao/module_rt/middle_hamilt.cpp @@ -180,40 +180,6 @@ void half_Hmatrix_tensor_lapack(const Parallel_Orbitals* pv, // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU using ct_Device = typename ct::PsiToContainer::type; - if (print_matrix) - { - ct::Tensor Htmp_cpu = Htmp.to_device(); - ct::Tensor H_laststep_cpu = H_laststep.to_device(); - - ofs_running << std::setprecision(10); - ofs_running << std::endl; - ofs_running << " H(t+dt) :" << std::endl; - for (int i = 0; i < nlocal; i++) - { - const int in = i * nlocal; - for (int j = 0; j < nlocal; j++) - { - ofs_running << Htmp_cpu.data>()[in + j].real() << "+" - << Htmp_cpu.data>()[in + j].imag() << "i "; - } - ofs_running << std::endl; - } - ofs_running << std::endl; - ofs_running << std::endl; - ofs_running << " H(t):" << std::endl; - for (int i = 0; i < nlocal; i++) - { - const int in = i * nlocal; - for (int j = 0; j < nlocal; j++) - { - ofs_running << H_laststep_cpu.data>()[in + j].real() << "+" - << H_laststep_cpu.data>()[in + j].imag() << "i "; - } - ofs_running << std::endl; - } - ofs_running << std::endl; - } - std::complex one_half = {0.5, 0.0}; // Perform the operation Htmp = one_half * H_laststep + one_half * Htmp @@ -243,25 +209,6 @@ void half_Hmatrix_tensor_lapack(const Parallel_Orbitals* pv, 1, Stmp.data>(), 1); - - if (print_matrix) - { - ct::Tensor Htmp_cpu = Htmp.to_device(); - - ofs_running << std::endl; - ofs_running << " H (t+dt/2) :" << std::endl; - for (int i = 0; i < nlocal; i++) - { - const int in = i * nlocal; - for (int j = 0; j < nlocal; j++) - { - ofs_running << Htmp_cpu.data>()[in + j].real() << "+" - << Htmp_cpu.data>()[in + j].imag() << "i "; - } - ofs_running << std::endl; - } - ofs_running << std::endl; - } } // Explicit instantiation of template functions diff --git a/source/source_lcao/module_rt/norm_psi.cpp b/source/source_lcao/module_rt/norm_psi.cpp index d29972dce3..f5a9c6c8b4 100644 --- a/source/source_lcao/module_rt/norm_psi.cpp +++ b/source/source_lcao/module_rt/norm_psi.cpp @@ -1,14 +1,13 @@ #include "norm_psi.h" +#include "source_base/global_function.h" // ModuleBase::GlobalFunc::ZEROS #include "source_base/module_container/ATen/kernels/blas.h" #include "source_base/module_external/blas_connector.h" #include "source_base/module_external/scalapack_connector.h" -#include "source_base/global_function.h" // ModuleBase::GlobalFunc::ZEROS - +#include #include #include -#include namespace module_rt { @@ -494,33 +493,6 @@ void norm_psi_tensor_lapack(const Parallel_Orbitals* pv, Cij.data>(), nlocal); // Leading dimension of Cij - if (print_matrix) - { - ct::Tensor Cij_print_cpu = Cij.to_device(); - - ofs_running << "original Cij :" << std::endl; - for (int i = 0; i < nlocal; i++) - { - const int in = i * nlocal; - for (int j = 0; j < nlocal; j++) - { - double aa = Cij_print_cpu.data>()[in + j].real(); - double bb = Cij_print_cpu.data>()[in + j].imag(); - if (std::abs(aa) < 1e-8) - { - aa = 0.0; - } - if (std::abs(bb) < 1e-8) - { - bb = 0.0; - } - ofs_running << aa << "+" << bb << "i "; - } - ofs_running << std::endl; - } - ofs_running << std::endl; - } - // Normalize Cij: set diagonal elements to 1/sqrt(Cij[i][i]), off-diagonal elements to 0 if (ct_device_type == ct::DeviceType::GpuDevice) { @@ -587,70 +559,6 @@ void norm_psi_tensor_lapack(const Parallel_Orbitals* pv, &beta, psi_k.data>(), nlocal); // Leading dimension of psi_k - - if (print_matrix) - { - ct::Tensor Cij_print_cpu = Cij.to_device(); - ct::Tensor psi_k_cpu = psi_k.to_device(); - ct::Tensor tmp1_cpu = tmp1.to_device(); - - ofs_running << " Cij:" << std::endl; - for (int i = 0; i < nlocal; i++) - { - const int in = i * nlocal; - for (int j = 0; j < nlocal; j++) - { - ofs_running << Cij_print_cpu.data>()[in + j].real() << "+" - << Cij_print_cpu.data>()[in + j].imag() << "i "; - } - ofs_running << std::endl; - } - ofs_running << std::endl; - ofs_running << std::endl; - ofs_running << " psi_k:" << std::endl; - for (int i = 0; i < nband; i++) - { - const int in = i * nlocal; - for (int j = 0; j < nlocal; j++) - { - double aa = psi_k_cpu.data>()[in + j].real(); - double bb = psi_k_cpu.data>()[in + j].imag(); - if (std::abs(aa) < 1e-8) - { - aa = 0.0; - } - if (std::abs(bb) < 1e-8) - { - bb = 0.0; - } - ofs_running << aa << "+" << bb << "i "; - } - ofs_running << std::endl; - } - ofs_running << std::endl; - ofs_running << " psi_k before normalization:" << std::endl; - for (int i = 0; i < nband; i++) - { - const int in = i * nlocal; - for (int j = 0; j < nlocal; j++) - { - double aa = tmp1_cpu.data>()[in + j].real(); - double bb = tmp1_cpu.data>()[in + j].imag(); - if (std::abs(aa) < 1e-8) - { - aa = 0.0; - } - if (std::abs(bb) < 1e-8) - { - bb = 0.0; - } - ofs_running << aa << "+" << bb << "i "; - } - ofs_running << std::endl; - } - ofs_running << std::endl; - ofs_running << std::endl; - } } // Explicit instantiation of template functions diff --git a/source/source_lcao/module_rt/propagator_cn2.cpp b/source/source_lcao/module_rt/propagator_cn2.cpp index 01614c9792..8ef07b0ebb 100644 --- a/source/source_lcao/module_rt/propagator_cn2.cpp +++ b/source/source_lcao/module_rt/propagator_cn2.cpp @@ -510,211 +510,73 @@ void Propagator::compute_propagator_cn2_tensor_lapack(const int nlocal, // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU using ct_Device = typename ct::PsiToContainer::type; - // (1) copy Htmp to Numerator & Denominator - ct::Tensor Numerator(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal * nlocal})); - Numerator.zero(); + // Define coefficients + // beta1 = -i * dt/4 (for Numerator) + // beta2 = +i * dt/4 (for Denominator) + std::complex beta1 = {0.0, -0.25 * this->dt}; + std::complex beta2 = {0.0, 0.25 * this->dt}; + + // ======================================================================== + // Numerator = Stmp + beta1 * Htmp + // ======================================================================== + + // 1. Copy Stmp to U_operator base_device::memory::synchronize_memory_op, Device, Device>()( - Numerator.data>(), - Htmp.data>(), + U_operator.data>(), + Stmp.data>(), nlocal * nlocal); + // 2. U_operator = beta1 * Htmp + U_operator + ct::kernels::blas_axpy, ct_Device>()(nlocal * nlocal, + &beta1, + Htmp.data>(), + 1, + U_operator.data>(), + 1); + + // ======================================================================== + // Denominator = Stmp + beta2 * Htmp + // ======================================================================== + ct::Tensor Denominator(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal * nlocal})); - Denominator.zero(); + + // 1. Copy Stmp to Denominator base_device::memory::synchronize_memory_op, Device, Device>()( Denominator.data>(), - Htmp.data>(), + Stmp.data>(), nlocal * nlocal); - if (print_matrix) - { - ct::Tensor Stmp_cpu = Stmp.to_device(); - ct::Tensor Numerator_cpu = Numerator.to_device(); - - ofs_running << std::endl; - ofs_running << " S matrix :" << std::endl; - for (int i = 0; i < nlocal; i++) - { - const int in = i * nlocal; - for (int j = 0; j < nlocal; j++) - { - ofs_running << Stmp_cpu.data>()[in + j].real() << "+" - << Stmp_cpu.data>()[in + j].imag() << "i "; - } - ofs_running << std::endl; - } - ofs_running << std::endl; - ofs_running << std::endl; - ofs_running << " H matrix :" << std::endl; - for (int i = 0; i < nlocal; i++) - { - const int in = i * nlocal; - for (int j = 0; j < nlocal; j++) - { - ofs_running << Numerator_cpu.data>()[in + j].real() << "+" - << Numerator_cpu.data>()[in + j].imag() << "i "; - } - ofs_running << std::endl; - } - ofs_running << std::endl; - } - - // ->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - // (2) compute Numerator & Denominator by GEADD - // Numerator = Stmp - i*para * Htmp; beta1 = - para = -0.25 * this->dt - // Denominator = Stmp + i*para * Htmp; beta2 = para = 0.25 * this->dt - std::complex one = {1.0, 0.0}; - std::complex beta1 = {0.0, -0.25 * this->dt}; - std::complex beta2 = {0.0, 0.25 * this->dt}; - - // Numerator = -i*para * Htmp - ct::kernels::blas_scal, ct_Device>()(nlocal * nlocal, - &beta1, - Numerator.data>(), - 1); - // Numerator = Stmp + (-i*para * Htmp) + // 2. Denominator = beta2 * Htmp + Denominator ct::kernels::blas_axpy, ct_Device>()(nlocal * nlocal, - &one, - Stmp.data>(), - 1, - Numerator.data>(), - 1); - // Denominator = i*para * Htmp - ct::kernels::blas_scal, ct_Device>()(nlocal * nlocal, &beta2, - Denominator.data>(), - 1); - // Denominator = Stmp + (i*para * Htmp) - ct::kernels::blas_axpy, ct_Device>()(nlocal * nlocal, - &one, - Stmp.data>(), + Htmp.data>(), 1, Denominator.data>(), 1); - if (print_matrix) - { - ct::Tensor Denominator_cpu = Denominator.to_device(); + // ======================================================================== + // Solve D * U = N, result overwrites N (which is U_operator) + // ======================================================================== - ofs_running << " beta=" << beta1 << std::endl; - ofs_running << " fenmu:" << std::endl; - for (int i = 0; i < nlocal; i++) - { - const int in = i * nlocal; - for (int j = 0; j < nlocal; j++) - { - ofs_running << Denominator_cpu.data>()[in + j].real() << "+" - << Denominator_cpu.data>()[in + j].imag() << "i "; - } - ofs_running << std::endl; - } - ofs_running << std::endl; - } - - //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - // (3) Next, invert Denominator ct::Tensor ipiv(ct::DataType::DT_INT, ct_device_type, ct::TensorShape({nlocal})); - ipiv.zero(); - // (3.1) compute ipiv + // No need to zero ipiv, it is output only. + + // 1. LU Factorization of Denominator (In-place) ct::kernels::lapack_getrf, ct_Device>()(nlocal, nlocal, Denominator.data>(), nlocal, ipiv.data()); - // Print ipiv - if (print_matrix) - { - ct::Tensor ipiv_cpu = ipiv.to_device(); - - ofs_running << " ipiv:" << std::endl; - for (int i = 0; i < nlocal; i++) - { - ofs_running << ipiv_cpu.data()[i] << " "; - } - ofs_running << std::endl; - } - - // (3.2) compute inverse matrix of Denominator - ct::Tensor Denominator_inv = create_identity_matrix>(nlocal, ct_device_type); + // 2. Solve D * X = B ct::kernels::lapack_getrs, ct_Device>()('N', nlocal, nlocal, Denominator.data>(), nlocal, ipiv.data(), - Denominator_inv.data>(), + U_operator.data>(), nlocal); - - //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - - // (4) U_operator = Denominator_inv * Numerator; - std::complex one_gemm = {1.0, 0.0}; - std::complex zero_gemm = {0.0, 0.0}; - ct::kernels::blas_gemm, ct_Device>()('N', - 'N', - nlocal, - nlocal, - nlocal, - &one_gemm, - Denominator_inv.data>(), - nlocal, - Numerator.data>(), - nlocal, - &zero_gemm, - U_operator.data>(), - nlocal); - - if (print_matrix) - { - ct::Tensor Denominator_inv_cpu = Denominator_inv.to_device(); - ct::Tensor Numerator_cpu = Numerator.to_device(); - ct::Tensor U_operator_cpu = U_operator.to_device(); - - ofs_running << " fenmu^-1:" << std::endl; - for (int i = 0; i < nlocal; i++) - { - const int in = i * nlocal; - for (int j = 0; j < nlocal; j++) - { - ofs_running << Denominator_inv_cpu.data>()[in + j].real() << "+" - << Denominator_inv_cpu.data>()[in + j].imag() << "i "; - } - ofs_running << std::endl; - } - ofs_running << std::endl; - ofs_running << " fenzi:" << std::endl; - for (int i = 0; i < nlocal; i++) - { - const int in = i * nlocal; - for (int j = 0; j < nlocal; j++) - { - ofs_running << Numerator_cpu.data>()[in + j].real() << "+" - << Numerator_cpu.data>()[in + j].imag() << "i "; - } - ofs_running << std::endl; - } - ofs_running << std::endl; - ofs_running << " U operator:" << std::endl; - for (int i = 0; i < nlocal; i++) - { - const int in = i * nlocal; - for (int j = 0; j < nlocal; j++) - { - double aa = U_operator_cpu.data>()[in + j].real(); - double bb = U_operator_cpu.data>()[in + j].imag(); - if (std::abs(aa) < 1e-8) - { - aa = 0.0; - } - if (std::abs(bb) < 1e-8) - { - bb = 0.0; - } - ofs_running << aa << "+" << bb << "i "; - } - ofs_running << std::endl; - } - } } // Explicit instantiation of template functions diff --git a/source/source_lcao/module_rt/upsi.cpp b/source/source_lcao/module_rt/upsi.cpp index 5892ae2a57..0982a77426 100644 --- a/source/source_lcao/module_rt/upsi.cpp +++ b/source/source_lcao/module_rt/upsi.cpp @@ -195,56 +195,6 @@ void upsi_tensor_lapack(const Parallel_Orbitals* pv, &beta, psi_k.data>(), nlocal); - - if (print_matrix) - { - ct::Tensor psi_k_cpu = psi_k.to_device(); - ct::Tensor psi_k_laststep_cpu = psi_k_laststep.to_device(); - - ofs_running << std::endl; - ofs_running << " psi_k:" << std::endl; - for (int i = 0; i < nband; i++) - { - const int in = i * nlocal; - for (int j = 0; j < nlocal; j++) - { - double aa = psi_k_cpu.data>()[in + j].real(); - double bb = psi_k_cpu.data>()[in + j].imag(); - if (std::abs(aa) < 1e-8) - { - aa = 0.0; - } - if (std::abs(bb) < 1e-8) - { - bb = 0.0; - } - ofs_running << aa << "+" << bb << "i "; - } - ofs_running << std::endl; - } - ofs_running << std::endl; - ofs_running << " psi_k_laststep:" << std::endl; - for (int i = 0; i < nband; i++) - { - const int in = i * nlocal; - for (int j = 0; j < nlocal; j++) - { - double aa = psi_k_laststep_cpu.data>()[in + j].real(); - double bb = psi_k_laststep_cpu.data>()[in + j].imag(); - if (std::abs(aa) < 1e-8) - { - aa = 0.0; - } - if (std::abs(bb) < 1e-8) - { - bb = 0.0; - } - ofs_running << aa << "+" << bb << "i "; - } - ofs_running << std::endl; - } - ofs_running << std::endl; - } } // Explicit instantiation of template functions