diff --git a/source/source_esolver/esolver_ks_lcao_tddft.cpp b/source/source_esolver/esolver_ks_lcao_tddft.cpp
index b273304ffe..8a0035681b 100644
--- a/source/source_esolver/esolver_ks_lcao_tddft.cpp
+++ b/source/source_esolver/esolver_ks_lcao_tddft.cpp
@@ -1,9 +1,9 @@
#include "esolver_ks_lcao_tddft.h"
//----------------IO-----------------
-#include "source_io/module_dipole/dipole_io.h"
#include "source_io/module_ctrl/ctrl_output_td.h"
#include "source_io/module_current/td_current_io.h"
+#include "source_io/module_dipole/dipole_io.h"
#include "source_io/module_output/output_log.h"
#include "source_io/module_wf/read_wfc_nao.h"
//------LCAO HSolver ElecState-------
@@ -425,32 +425,51 @@ void ESolver_KS_LCAO_TDDFT
::store_h_s_psi(UnitCell& ucell,
// Store H and S matrices to Hk_laststep and Sk_laststep
if (use_tensor && use_lapack)
{
- // Gather H and S matrices to root process
#ifdef __MPI
int myid = 0;
int num_procs = 1;
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
- // Global matrix structure
+ std::complex* h_ptr = nullptr;
+ std::complex* s_ptr = nullptr;
+
+ // Define containers for gathered data (only needed for multi-process)
module_rt::Matrix_g<std::complex<double>> h_mat_g;
module_rt::Matrix_g<std::complex<double>> s_mat_g;
- // Collect H matrix
- module_rt::gatherMatrix(myid, 0, h_mat, h_mat_g);
- BlasConnector::copy(len_HS_ik,
- h_mat_g.p.get(),
- 1,
- this->Hk_laststep.template data>() + ik * len_HS_ik,
- 1);
+ if (num_procs == 1)
+ {
+ // Single process: directly point to local data without gather
+ h_ptr = h_mat.p;
+ s_ptr = s_mat.p;
+ }
+ else
+ {
+ // Multiple processes: gather data to the root process (myid == 0) and point to the gathered data
+ module_rt::gatherMatrix(myid, 0, h_mat, h_mat_g);
+ module_rt::gatherMatrix(myid, 0, s_mat, s_mat_g);
+ if (myid == 0)
+ {
+ h_ptr = h_mat_g.p.get();
+ s_ptr = s_mat_g.p.get();
+ }
+ }
- // Collect S matrix
- module_rt::gatherMatrix(myid, 0, s_mat, s_mat_g);
- BlasConnector::copy(len_HS_ik,
- s_mat_g.p.get(),
- 1,
- this->Sk_laststep.template data>() + ik * len_HS_ik,
- 1);
+ // Only the root process (myid == 0) performs the copy
+ if (myid == 0 && h_ptr != nullptr && s_ptr != nullptr)
+ {
+ BlasConnector::copy(len_HS_ik,
+ h_ptr,
+ 1,
+ this->Hk_laststep.template data>() + ik * len_HS_ik,
+ 1);
+ BlasConnector::copy(len_HS_ik,
+ s_ptr,
+ 1,
+ this->Sk_laststep.template data>() + ik * len_HS_ik,
+ 1);
+ }
#endif
}
else
diff --git a/source/source_estate/module_dm/cal_edm_tddft.cpp b/source/source_estate/module_dm/cal_edm_tddft.cpp
index bd33477f97..3196283c6b 100644
--- a/source/source_estate/module_dm/cal_edm_tddft.cpp
+++ b/source/source_estate/module_dm/cal_edm_tddft.cpp
@@ -60,7 +60,7 @@ void cal_edm_tddft(Parallel_Orbitals& pv,
hamilt::Hamilt>* p_hamilt)
{
ModuleBase::TITLE("elecstate", "cal_edm_tddft");
- ModuleBase::timer::tick("elecstate", "cal_edm_tddft");
+ ModuleBase::timer::tick("TD_Efficiency", "cal_edm_tddft");
const int nlocal = PARAM.globalv.nlocal;
assert(nlocal >= 0);
@@ -303,7 +303,7 @@ void cal_edm_tddft(Parallel_Orbitals& pv,
#endif
} // end ik
- ModuleBase::timer::tick("elecstate", "cal_edm_tddft");
+ ModuleBase::timer::tick("TD_Efficiency", "cal_edm_tddft");
return;
} // cal_edm_tddft
@@ -313,7 +313,7 @@ void cal_edm_tddft_tensor(Parallel_Orbitals& pv,
hamilt::Hamilt>* p_hamilt)
{
ModuleBase::TITLE("elecstate", "cal_edm_tddft_tensor");
- ModuleBase::timer::tick("elecstate", "cal_edm_tddft_tensor");
+ ModuleBase::timer::tick("TD_Efficiency", "cal_edm_tddft");
const int nlocal = PARAM.globalv.nlocal;
assert(nlocal >= 0);
@@ -532,7 +532,7 @@ void cal_edm_tddft_tensor(Parallel_Orbitals& pv,
ModuleBase::WARNING_QUIT("elecstate::cal_edm_tddft_tensor", "MPI is required for this function!");
#endif
} // end ik
- ModuleBase::timer::tick("elecstate", "cal_edm_tddft_tensor");
+ ModuleBase::timer::tick("TD_Efficiency", "cal_edm_tddft");
return;
} // cal_edm_tddft_tensor
@@ -544,7 +544,7 @@ void cal_edm_tddft_tensor_lapack(Parallel_Orbitals& pv,
hamilt::Hamilt>* p_hamilt)
{
ModuleBase::TITLE("elecstate", "cal_edm_tddft_tensor_lapack");
- ModuleBase::timer::tick("elecstate", "cal_edm_tddft_tensor_lapack");
+ ModuleBase::timer::tick("TD_Efficiency", "cal_edm_tddft");
const int nlocal = PARAM.globalv.nlocal;
assert(nlocal >= 0);
@@ -555,6 +555,12 @@ void cal_edm_tddft_tensor_lapack(Parallel_Orbitals& pv,
// ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU
using ct_Device = typename ct::PsiToContainer::type;
+ // Memory operations
+ using syncmem_complex_h2d_op
+ = base_device::memory::synchronize_memory_op<std::complex<double>, Device, base_device::DEVICE_CPU>;
+ using syncmem_complex_d2h_op
+ = base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, Device>;
+
#if ((defined __CUDA) /* || (defined __ROCM) */)
if (ct_device_type == ct::DeviceType::GpuDevice)
{
@@ -573,209 +579,214 @@ void cal_edm_tddft_tensor_lapack(Parallel_Orbitals& pv,
#ifdef __MPI
int myid = 0;
const int root_proc = 0;
+ int num_procs = 1;
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
+ MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
+
+ // 1. Prepare Data Source Pointers (Host)
+ // If np = 1, point directly to local data to avoid copy
+ // If np > 1, gather data and point to the gathered buffer
+ std::complex<double>* h_src = nullptr;
+ std::complex<double>* s_src = nullptr;
+ std::complex<double>* dmk_src = nullptr;
- // Gather local H, S, and DMK matrices to global matrices on root process
+ // Global containers (Used only when num_procs > 1)
+ module_rt::Matrix_g<std::complex<double>> h_mat_global, s_mat_global, dmk_global, edm_global;
+
+ // Get Local Matrices
hamilt::MatrixBlock<std::complex<double>> h_mat_local, s_mat_local;
p_hamilt->matrix(h_mat_local, s_mat_local);
- module_rt::Matrix_g> h_mat_global, s_mat_global, dmk_global;
- module_rt::gatherMatrix(myid, root_proc, h_mat_local, h_mat_global);
- module_rt::gatherMatrix(myid, root_proc, s_mat_local, s_mat_global);
-
- // Create a temporary MatrixBlock for local DMK
- hamilt::MatrixBlock> dmk_local_block;
- dmk_local_block.p = tmp_dmk_local;
- dmk_local_block.desc = pv.desc;
- module_rt::gatherMatrix(myid, root_proc, dmk_local_block, dmk_global);
-
- // Declare and allocate global EDM matrix on ALL processes, prepare for distribution in the end
- module_rt::Matrix_g> edm_global;
- edm_global.p.reset(new std::complex[nlocal * nlocal]);
- edm_global.row = nlocal;
- edm_global.col = nlocal;
- // Set the descriptor of the global EDM matrix
- edm_global.desc.reset(new int[9]{1, pv.desc[1], nlocal, nlocal, nlocal, nlocal, 0, 0, nlocal});
-
- // Only root process performs the global calculation
+ if (num_procs == 1)
+ {
+ // Optimization: Direct access for single process
+ h_src = h_mat_local.p;
+ s_src = s_mat_local.p;
+ dmk_src = tmp_dmk_local;
+ }
+ else
+ {
+ // Standard Gather Logic for multi-process
+ module_rt::gatherMatrix(myid, root_proc, h_mat_local, h_mat_global);
+ module_rt::gatherMatrix(myid, root_proc, s_mat_local, s_mat_global);
+
+ hamilt::MatrixBlock> dmk_local_block;
+ dmk_local_block.p = tmp_dmk_local;
+ dmk_local_block.desc = pv.desc;
+ module_rt::gatherMatrix(myid, root_proc, dmk_local_block, dmk_global);
+
+ if (myid == root_proc)
+ {
+ h_src = h_mat_global.p.get();
+ s_src = s_mat_global.p.get();
+ dmk_src = dmk_global.p.get();
+ }
+ }
+
+ // 2. GPU Calculation (on Rank 0)
if (myid == root_proc)
{
- ct::Tensor Htmp_global(ct::DataType::DT_COMPLEX_DOUBLE,
- ct::DeviceType::CpuDevice,
- ct::TensorShape({nlocal, nlocal}));
- ct::Tensor S_global(ct::DataType::DT_COMPLEX_DOUBLE,
- ct::DeviceType::CpuDevice,
- ct::TensorShape({nlocal, nlocal}));
- ct::Tensor DMK_global(ct::DataType::DT_COMPLEX_DOUBLE,
- ct::DeviceType::CpuDevice,
- ct::TensorShape({nlocal, nlocal}));
-
- // Copy gathered data into tensors
- BlasConnector::copy(nlocal * nlocal,
- h_mat_global.p.get(),
- 1,
- Htmp_global.template data>(),
- 1);
- BlasConnector::copy(nlocal * nlocal,
- s_mat_global.p.get(),
- 1,
- S_global.template data>(),
- 1);
- BlasConnector::copy(nlocal * nlocal,
- dmk_global.p.get(),
- 1,
- DMK_global.template data>(),
- 1);
-
- // Move tensors to the target device (CPU or GPU)
- ct::Tensor Htmp_global_dev = Htmp_global.to_device();
- ct::Tensor S_global_dev = S_global.to_device();
- ct::Tensor DMK_global_dev = DMK_global.to_device();
+ ct::Tensor H_dev, S_dev, DMK_dev, ipiv_dev;
- // --- Calculate S^-1 using getrf + getrs ---
- ct::Tensor ipiv(ct::DataType::DT_INT, ct_device_type, ct::TensorShape({nlocal}));
- ipiv.zero();
+ // Allocate and Copy (H2D)
+ H_dev = ct::Tensor(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal, nlocal}));
+ syncmem_complex_h2d_op()(H_dev.template data>(), h_src, nlocal * nlocal);
+
+ S_dev = ct::Tensor(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal, nlocal}));
+ syncmem_complex_h2d_op()(S_dev.template data>(), s_src, nlocal * nlocal);
+
+ DMK_dev = ct::Tensor(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal, nlocal}));
+ syncmem_complex_h2d_op()(DMK_dev.template data>(), dmk_src, nlocal * nlocal);
+
+ ipiv_dev = ct::Tensor(ct::DataType::DT_INT, ct_device_type, ct::TensorShape({nlocal}));
+ ipiv_dev.zero();
+ // --- Calculate S^-1 using getrf + getrs ---
// 1. LU decomposition S = P * L * U
- ct::kernels::lapack_getrf, ct_Device>()(
- nlocal,
- nlocal,
- S_global_dev.template data>(),
- nlocal,
- ipiv.template data());
-
- // 2. Solve S * Sinv = I for Sinv using the LU decomposition
- // Create identity matrix as RHS
- auto Sinv_global = module_rt::create_identity_matrix>(nlocal, ct_device_type);
-
- ct::kernels::lapack_getrs, ct_Device>()(
- 'N',
- nlocal,
- nlocal,
- S_global_dev.template data>(),
- nlocal,
- ipiv.template data(),
- Sinv_global.template data>(),
- nlocal);
-
- // --- EDM Calculation using BLAS on global tensors ---
- // tmp1 = Htmp * Sinv
- ct::Tensor tmp1_global_tensor(ct::DataType::DT_COMPLEX_DOUBLE,
- ct_device_type,
- ct::TensorShape({nlocal, nlocal}));
- tmp1_global_tensor.zero();
- std::complex one_complex = {1.0, 0.0};
- std::complex zero_complex = {0.0, 0.0};
- ct::kernels::blas_gemm, ct_Device>()(
- 'N',
- 'N',
- nlocal,
- nlocal,
- nlocal,
- &one_complex,
- Htmp_global_dev.template data>(),
- nlocal,
- Sinv_global.template data>(),
- nlocal,
- &zero_complex,
- tmp1_global_tensor.template data>(),
- nlocal);
-
- // tmp2 = tmp1^T * tmp_dmk
- ct::Tensor tmp2_global_tensor(ct::DataType::DT_COMPLEX_DOUBLE,
- ct_device_type,
- ct::TensorShape({nlocal, nlocal}));
- tmp2_global_tensor.zero();
- ct::kernels::blas_gemm, ct_Device>()(
- 'T',
- 'N',
- nlocal,
- nlocal,
- nlocal,
- &one_complex,
- tmp1_global_tensor.template data>(),
- nlocal,
- DMK_global_dev.template data>(),
- nlocal,
- &zero_complex,
- tmp2_global_tensor.template data>(),
- nlocal);
-
- // tmp3 = Sinv * Htmp
- ct::Tensor tmp3_global_tensor(ct::DataType::DT_COMPLEX_DOUBLE,
- ct_device_type,
- ct::TensorShape({nlocal, nlocal}));
- tmp3_global_tensor.zero();
- ct::kernels::blas_gemm, ct_Device>()(
- 'N',
- 'N',
- nlocal,
- nlocal,
- nlocal,
- &one_complex,
- Sinv_global.template data>(),
- nlocal,
- Htmp_global_dev.template data>(),
- nlocal,
- &zero_complex,
- tmp3_global_tensor.template data>(),
- nlocal);
-
- // tmp4 = tmp_dmk * tmp3^T
- ct::Tensor tmp4_global_tensor(ct::DataType::DT_COMPLEX_DOUBLE,
- ct_device_type,
- ct::TensorShape({nlocal, nlocal}));
- tmp4_global_tensor.zero();
- ct::kernels::blas_gemm, ct_Device>()(
- 'N',
- 'T',
- nlocal,
- nlocal,
- nlocal,
- &one_complex,
- DMK_global_dev.template data>(),
- nlocal,
- tmp3_global_tensor.template data>(),
- nlocal,
- &zero_complex,
- tmp4_global_tensor.template data>(),
- nlocal);
+ ct::kernels::lapack_getrf, ct_Device>()(nlocal,
+ nlocal,
+ S_dev.template data>(),
+ nlocal,
+ ipiv_dev.template data());
+
+ // 2. Solve S * Sinv = I
+ auto Sinv_dev = module_rt::create_identity_matrix>(nlocal, ct_device_type);
+
+ ct::kernels::lapack_getrs, ct_Device>()('N',
+ nlocal,
+ nlocal,
+ S_dev.template data>(),
+ nlocal,
+ ipiv_dev.template data(),
+ Sinv_dev.template data>(),
+ nlocal);
+
+ // --- EDM Calculation ---
+ std::complex<double> one = {1.0, 0.0};
+ std::complex<double> zero = {0.0, 0.0};
+
+ // tmp1 = H * Sinv
+ ct::Tensor tmp1_dev(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal, nlocal}));
+ ct::kernels::blas_gemm, ct_Device>()('N',
+ 'N',
+ nlocal,
+ nlocal,
+ nlocal,
+ &one,
+ H_dev.template data>(),
+ nlocal,
+ Sinv_dev.template data>(),
+ nlocal,
+ &zero,
+ tmp1_dev.template data>(),
+ nlocal);
+
+ // tmp2 = tmp1^T * DMK
+ ct::Tensor tmp2_dev(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal, nlocal}));
+ ct::kernels::blas_gemm, ct_Device>()('T',
+ 'N',
+ nlocal,
+ nlocal,
+ nlocal,
+ &one,
+ tmp1_dev.template data>(),
+ nlocal,
+ DMK_dev.template data>(),
+ nlocal,
+ &zero,
+ tmp2_dev.template data>(),
+ nlocal);
+
+ // tmp3 = Sinv * H
+ ct::Tensor tmp3_dev(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal, nlocal}));
+ ct::kernels::blas_gemm, ct_Device>()('N',
+ 'N',
+ nlocal,
+ nlocal,
+ nlocal,
+ &one,
+ Sinv_dev.template data>(),
+ nlocal,
+ H_dev.template data>(),
+ nlocal,
+ &zero,
+ tmp3_dev.template data>(),
+ nlocal);
+
+ // tmp4 = DMK * tmp3^T
+ ct::Tensor tmp4_dev(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal, nlocal}));
+ ct::kernels::blas_gemm, ct_Device>()('N',
+ 'T',
+ nlocal,
+ nlocal,
+ nlocal,
+ &one,
+ DMK_dev.template data>(),
+ nlocal,
+ tmp3_dev.template data>(),
+ nlocal,
+ &zero,
+ tmp4_dev.template data>(),
+ nlocal);
// tmp4 = tmp2 + tmp4
- ct::kernels::blas_axpy, ct_Device>()(
- nlocal * nlocal,
- &one_complex,
- tmp2_global_tensor.template data>(),
- 1,
- tmp4_global_tensor.template data>(),
- 1);
+ ct::kernels::blas_axpy, ct_Device>()(nlocal * nlocal,
+ &one,
+ tmp2_dev.template data>(),
+ 1,
+ tmp4_dev.template data>(),
+ 1);
// tmp4 = 0.5 * tmp4
- std::complex half_complex = {0.5, 0.0};
- ct::kernels::blas_scal, ct_Device>()(
- nlocal * nlocal,
- &half_complex,
- tmp4_global_tensor.template data>(),
- 1);
-
- // Copy result from device tensor back to CPU buffer for distribution
- ct::Tensor tmp4_global_tensor_cpu = tmp4_global_tensor.to_device();
- BlasConnector::copy(nlocal * nlocal,
- tmp4_global_tensor_cpu.template data>(),
- 1,
- edm_global.p.get(),
- 1);
+ std::complex<double> half = {0.5, 0.0};
+ ct::kernels::blas_scal, ct_Device>()(nlocal * nlocal,
+ &half,
+ tmp4_dev.template data>(),
+ 1);
+
+ // 3. Retrieve Result (D2H)
+ std::complex<double>* edm_dest = nullptr;
+
+ if (num_procs == 1)
+ {
+ // Directly copy to target local matrix
+ tmp_edmk.create(pv.ncol, pv.nrow);
+ edm_dest = tmp_edmk.c;
+ }
+ else
+ {
+ // Wait to set up edm_dest after allocating global buffer
+ if (myid == root_proc && edm_global.p == nullptr)
+ {
+ edm_global.p.reset(new std::complex<double>[nlocal * nlocal]);
+ }
+ edm_dest = edm_global.p.get();
+ }
+
+ if (num_procs == 1 || myid == root_proc)
+ {
+ syncmem_complex_d2h_op()(edm_dest, tmp4_dev.template data>(), nlocal * nlocal);
+ }
}
- // --- Distribute the globally computed EDM matrix back to distributed form ---
- tmp_edmk.create(pv.ncol, pv.nrow);
+ // 4. Distribute (Only needed if num_procs > 1)
+ if (num_procs > 1)
+ {
+ if (edm_global.p == nullptr)
+ {
+ edm_global.p.reset(new std::complex<double>[nlocal * nlocal]);
+ }
- hamilt::MatrixBlock> edm_local_block;
- edm_local_block.p = tmp_edmk.c;
- edm_local_block.desc = pv.desc;
+ edm_global.row = nlocal;
+ edm_global.col = nlocal;
+ edm_global.desc.reset(new int[9]{1, pv.desc[1], nlocal, nlocal, nlocal, nlocal, 0, 0, nlocal});
- // Distribute edm_global to all processes' local blocks
- module_rt::distributeMatrix(edm_local_block, edm_global);
+ tmp_edmk.create(pv.ncol, pv.nrow);
+ hamilt::MatrixBlock> edm_local_block;
+ edm_local_block.p = tmp_edmk.c;
+ edm_local_block.desc = pv.desc;
+ module_rt::distributeMatrix(edm_local_block, edm_global);
+ }
#else
ModuleBase::WARNING_QUIT("elecstate::cal_edm_tddft_tensor_lapack", "MPI is required for this function!");
#endif // __MPI
@@ -790,7 +801,7 @@ void cal_edm_tddft_tensor_lapack(Parallel_Orbitals& pv,
}
#endif // __CUDA
- ModuleBase::timer::tick("elecstate", "cal_edm_tddft_tensor_lapack");
+ ModuleBase::timer::tick("TD_Efficiency", "cal_edm_tddft");
return;
} // cal_edm_tddft_tensor_lapack
diff --git a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp
index 278b270410..90c1970120 100644
--- a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp
+++ b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp
@@ -169,6 +169,7 @@ void hamilt::TDNonlocal>::calculate_HR()
if (use_gpu)
{
+ ModuleBase::timer::tick("TD_Efficiency", "snap_psibeta");
#ifdef __CUDA
// GPU path: Atom-level GPU batch processing
module_rt::gpu::snap_psibeta_atom_batch_gpu(orb_,
@@ -183,9 +184,11 @@ void hamilt::TDNonlocal>::calculate_HR()
nlm_dim,
nlm_tot);
#endif
+ ModuleBase::timer::tick("TD_Efficiency", "snap_psibeta");
}
else
{
+ ModuleBase::timer::tick("TD_Efficiency", "snap_psibeta");
// CPU path: OpenMP parallel over neighbors to compute nlm_tot
#pragma omp parallel for schedule(dynamic)
for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
@@ -224,6 +227,7 @@ void hamilt::TDNonlocal>::calculate_HR()
}
}
}
+ ModuleBase::timer::tick("TD_Efficiency", "snap_psibeta");
}
// 2. calculate D for each pair of atoms
diff --git a/source/source_lcao/module_rt/evolve_elec.cpp b/source/source_lcao/module_rt/evolve_elec.cpp
index b957d193ea..6d6eebc2ff 100644
--- a/source/source_lcao/module_rt/evolve_elec.cpp
+++ b/source/source_lcao/module_rt/evolve_elec.cpp
@@ -94,27 +94,48 @@ void Evolve_elec::solve_psi(const int& istep,
module_rt::Matrix_g> psi_g;
module_rt::Matrix_g> psi_laststep_g;
+ // Prepare host pointers for psi and psi_laststep
+ std::complex<double>* p_psi_host = nullptr;
+ std::complex<double>* p_psi_last_host = nullptr;
+
if (use_lapack)
{
- // Need to gather the psi to the root process on CPU
- // H_laststep and S_laststep are already gathered in esolver_ks_lcao_tddft.cpp
#ifdef __MPI
- // Access the rank of the calling process in the communicator
int myid = 0;
const int root_proc = 0;
+ int num_procs = 1;
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
-
- // Gather psi to the root process
- gatherPsi(myid, root_proc, psi[0].get_pointer(), para_orb, psi_g);
- gatherPsi(myid, root_proc, psi_laststep[0].get_pointer(), para_orb, psi_laststep_g);
-
- // Syncronize data from CPU to Device
- syncmem_complex_h2d_op()(psi_k_tensor.data>(),
- psi_g.p.get(),
- len_psi_k_1 * len_psi_k_2);
- syncmem_complex_h2d_op()(psi_k_laststep_tensor.data>(),
- psi_laststep_g.p.get(),
- len_psi_k_1 * len_psi_k_2);
+ MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
+
+ if (num_procs == 1)
+ {
+ // Single process: directly point to local data without gather
+ p_psi_host = psi[0].get_pointer();
+ p_psi_last_host = psi_laststep[0].get_pointer();
+ }
+ else
+ {
+ // Multiple processes: gather data to the root process (myid == 0) and point to the gathered data
+ gatherPsi(myid, root_proc, psi[0].get_pointer(), para_orb, psi_g);
+ gatherPsi(myid, root_proc, psi_laststep[0].get_pointer(), para_orb, psi_laststep_g);
+
+ if (myid == root_proc)
+ {
+ p_psi_host = psi_g.p.get();
+ p_psi_last_host = psi_laststep_g.p.get();
+ }
+ }
+
+ // Only the root process (myid == 0) performs the copy
+ if (myid == root_proc)
+ {
+ syncmem_complex_h2d_op()(psi_k_tensor.data>(),
+ p_psi_host,
+ len_psi_k_1 * len_psi_k_2);
+ syncmem_complex_h2d_op()(psi_k_laststep_tensor.data>(),
+ p_psi_last_host,
+ len_psi_k_1 * len_psi_k_2);
+ }
#endif
}
else
@@ -157,17 +178,27 @@ void Evolve_elec::solve_psi(const int& istep,
if (use_lapack)
{
#ifdef __MPI
- // Syncronize data from Device to CPU
- syncmem_complex_d2h_op()(psi_g.p.get(),
- psi_k_tensor.data>(),
- len_psi_k_1 * len_psi_k_2);
- syncmem_complex_d2h_op()(psi_laststep_g.p.get(),
- psi_k_laststep_tensor.data>(),
- len_psi_k_1 * len_psi_k_2);
-
- // Distribute psi to all processes
- distributePsi(para_orb, psi[0].get_pointer(), psi_g);
- distributePsi(para_orb, psi_laststep[0].get_pointer(), psi_laststep_g);
+ int myid = 0;
+ int num_procs = 1;
+ MPI_Comm_rank(MPI_COMM_WORLD, &myid);
+ MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
+
+ if (myid == 0)
+ {
+ syncmem_complex_d2h_op()(p_psi_host,
+ psi_k_tensor.data>(),
+ len_psi_k_1 * len_psi_k_2);
+ syncmem_complex_d2h_op()(p_psi_last_host,
+ psi_k_laststep_tensor.data>(),
+ len_psi_k_1 * len_psi_k_2);
+ }
+
+ // If it's multi-process, distribute back; if it's single-process, the data is already in psi[0]
+ if (num_procs > 1)
+ {
+ distributePsi(para_orb, psi[0].get_pointer(), psi_g);
+ distributePsi(para_orb, psi_laststep[0].get_pointer(), psi_laststep_g);
+ }
#endif
}
else
diff --git a/source/source_lcao/module_rt/evolve_psi.cpp b/source/source_lcao/module_rt/evolve_psi.cpp
index bd6eaef793..5a2f116fcd 100644
--- a/source/source_lcao/module_rt/evolve_psi.cpp
+++ b/source/source_lcao/module_rt/evolve_psi.cpp
@@ -155,83 +155,92 @@ void evolve_psi_tensor(const int nband,
hamilt::MatrixBlock> h_mat, s_mat;
p_hamilt->matrix(h_mat, s_mat);
- ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
+ int myid = 0;
+ int num_procs = 1;
+ MPI_Comm_rank(MPI_COMM_WORLD, &myid);
+ MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
+ const int root_proc = 0;
- // Create Tensor objects for temporary data and sync from host to device
- const int len_HS = use_lapack ? nlocal * nlocal : pv->nloc;
- ct::Tensor Stmp(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS}));
- ct::Tensor Htmp(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS}));
- ct::Tensor Hold(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS}));
+ std::complex<double>* h_src = nullptr;
+ std::complex<double>* s_src = nullptr;
+
+ module_rt::Matrix_g<std::complex<double>> h_mat_g, s_mat_g;
if (use_lapack)
{
- // Need to gather H and S matrix to root process here
- int myid = 0;
- int num_procs = 1;
- MPI_Comm_rank(MPI_COMM_WORLD, &myid);
- MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
-
- module_rt::Matrix_g> h_mat_g, s_mat_g; // Global matrix structure
-
- // Collect H matrix
- module_rt::gatherMatrix(myid, 0, h_mat, h_mat_g);
- syncmem_complex_h2d_op()(Htmp.data>(), h_mat_g.p.get(), len_HS);
- syncmem_complex_h2d_op()(Hold.data>(), h_mat_g.p.get(), len_HS);
-
- // Collect S matrix
- module_rt::gatherMatrix(myid, 0, s_mat, s_mat_g);
- syncmem_complex_h2d_op()(Stmp.data>(), s_mat_g.p.get(), len_HS);
+ if (num_procs == 1)
+ {
+ h_src = h_mat.p;
+ s_src = s_mat.p;
+ }
+ else
+ {
+ module_rt::gatherMatrix(myid, 0, h_mat, h_mat_g);
+ module_rt::gatherMatrix(myid, 0, s_mat, s_mat_g);
+ if (myid == root_proc)
+ {
+ h_src = h_mat_g.p.get();
+ s_src = s_mat_g.p.get();
+ }
+ }
}
else
{
- // Original code
- syncmem_complex_h2d_op()(Stmp.data>(), s_mat.p, len_HS);
- syncmem_complex_h2d_op()(Htmp.data>(), h_mat.p, len_HS);
- syncmem_complex_h2d_op()(Hold.data>(), h_mat.p, len_HS);
+ h_src = h_mat.p;
+ s_src = s_mat.p;
}
- ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
+ const int len_HS = use_lapack ? nlocal * nlocal : pv->nloc;
- ct::Tensor U_operator(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS}));
- U_operator.zero();
+ ct::Tensor Stmp(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS}));
- int myid = 0;
- int root_proc = 0;
- MPI_Comm_rank(MPI_COMM_WORLD, &myid);
+ if (s_src != nullptr)
+ {
+ if (!use_lapack || myid == root_proc)
+ {
+ ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
+ syncmem_complex_h2d_op()(Stmp.data>(), s_src, len_HS);
+ ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
+ }
+ }
- // (1)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+ ct::Tensor Htmp(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS}));
- /// @brief compute H(t+dt/2)
- /// @input H_laststep, Htmp, print_matrix
- /// @output Htmp
+ if (h_src != nullptr)
+ {
+ if (!use_lapack || myid == root_proc)
+ {
+ ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
+ syncmem_complex_h2d_op()(Htmp.data>(), h_src, len_HS);
+ ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
+ }
+ }
+
+ // (1) Compute H(t+dt/2)
if (propagator != 2)
{
if (!use_lapack)
{
half_Hmatrix_tensor(pv, nband, nlocal, Htmp, Stmp, H_laststep, S_laststep, ofs_running, print_matrix);
}
- else
+ else if (myid == root_proc)
{
- if (myid == root_proc)
- {
- half_Hmatrix_tensor_lapack(pv,
- nband,
- nlocal,
- Htmp,
- Stmp,
- H_laststep,
- S_laststep,
- ofs_running,
- print_matrix);
- }
+ half_Hmatrix_tensor_lapack(pv,
+ nband,
+ nlocal,
+ Htmp,
+ Stmp,
+ H_laststep,
+ S_laststep,
+ ofs_running,
+ print_matrix);
}
}
- // (2)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+ ct::Tensor U_operator(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS}));
+ U_operator.zero();
- /// @brief compute U_operator
- /// @input Stmp, Htmp, print_matrix
- /// @output U_operator
+ // (2) Compute U_operator
Propagator prop(propagator, pv, PARAM.inp.td_dt);
prop.compute_propagator_tensor(nlocal,
Stmp,
@@ -242,55 +251,47 @@ void evolve_psi_tensor(const int nband,
print_matrix,
use_lapack);
- // (3)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-
- /// @brief apply U_operator to the wave function of the previous step for new wave function
- /// @input U_operator, psi_k_laststep, print_matrix
- /// @output psi_k
+ // (3) Apply U_operator (psi_k = U * psi_last)
if (!use_lapack)
{
upsi_tensor(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, ofs_running, print_matrix);
}
- else
+ else if (myid == root_proc)
{
- if (myid == root_proc)
- {
- upsi_tensor_lapack(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, ofs_running, print_matrix);
- }
+ upsi_tensor_lapack(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, ofs_running, print_matrix);
}
- // (4)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-
- /// @brief normalize psi_k
- /// @input Stmp, psi_not_norm, psi_k, print_matrix
- /// @output psi_k
+ // (4) Normalize psi_k
if (!use_lapack)
{
norm_psi_tensor(pv, nband, nlocal, Stmp, psi_k, ofs_running, print_matrix);
}
- else
+ else if (myid == root_proc)
+ {
+ norm_psi_tensor_lapack(pv, nband, nlocal, Stmp, psi_k, ofs_running, print_matrix);
+ }
+
+ // (5) Compute ekb
+ ct::Tensor Hold(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS}));
+
+ // Resync H matrix
+ if (h_src != nullptr)
{
- if (myid == root_proc)
+ if (!use_lapack || myid == root_proc)
{
- norm_psi_tensor_lapack(pv, nband, nlocal, Stmp, psi_k, ofs_running, print_matrix);
+ ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
+ syncmem_complex_h2d_op()(Hold.data>(), h_src, len_HS);
+ ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
}
}
- // (5)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-
- /// @brief compute ekb
- /// @input Htmp, psi_k
- /// @output ekb
if (!use_lapack)
{
compute_ekb_tensor(pv, nband, nlocal, Hold, psi_k, ekb, ofs_running);
}
- else
+ else if (myid == root_proc)
{
- if (myid == root_proc)
- {
- compute_ekb_tensor_lapack(pv, nband, nlocal, Hold, psi_k, ekb, ofs_running);
- }
+ compute_ekb_tensor_lapack(pv, nband, nlocal, Hold, psi_k, ekb, ofs_running);
}
#endif // __MPI
diff --git a/source/source_lcao/module_rt/middle_hamilt.cpp b/source/source_lcao/module_rt/middle_hamilt.cpp
index 9706afa558..10f432fbe9 100644
--- a/source/source_lcao/module_rt/middle_hamilt.cpp
+++ b/source/source_lcao/module_rt/middle_hamilt.cpp
@@ -180,40 +180,6 @@ void half_Hmatrix_tensor_lapack(const Parallel_Orbitals* pv,
// ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU
using ct_Device = typename ct::PsiToContainer::type;
- if (print_matrix)
- {
- ct::Tensor Htmp_cpu = Htmp.to_device();
- ct::Tensor H_laststep_cpu = H_laststep.to_device();
-
- ofs_running << std::setprecision(10);
- ofs_running << std::endl;
- ofs_running << " H(t+dt) :" << std::endl;
- for (int i = 0; i < nlocal; i++)
- {
- const int in = i * nlocal;
- for (int j = 0; j < nlocal; j++)
- {
- ofs_running << Htmp_cpu.data>()[in + j].real() << "+"
- << Htmp_cpu.data>()[in + j].imag() << "i ";
- }
- ofs_running << std::endl;
- }
- ofs_running << std::endl;
- ofs_running << std::endl;
- ofs_running << " H(t):" << std::endl;
- for (int i = 0; i < nlocal; i++)
- {
- const int in = i * nlocal;
- for (int j = 0; j < nlocal; j++)
- {
- ofs_running << H_laststep_cpu.data>()[in + j].real() << "+"
- << H_laststep_cpu.data>()[in + j].imag() << "i ";
- }
- ofs_running << std::endl;
- }
- ofs_running << std::endl;
- }
-
std::complex one_half = {0.5, 0.0};
// Perform the operation Htmp = one_half * H_laststep + one_half * Htmp
@@ -243,25 +209,6 @@ void half_Hmatrix_tensor_lapack(const Parallel_Orbitals* pv,
1,
Stmp.data>(),
1);
-
- if (print_matrix)
- {
- ct::Tensor Htmp_cpu = Htmp.to_device();
-
- ofs_running << std::endl;
- ofs_running << " H (t+dt/2) :" << std::endl;
- for (int i = 0; i < nlocal; i++)
- {
- const int in = i * nlocal;
- for (int j = 0; j < nlocal; j++)
- {
- ofs_running << Htmp_cpu.data>()[in + j].real() << "+"
- << Htmp_cpu.data>()[in + j].imag() << "i ";
- }
- ofs_running << std::endl;
- }
- ofs_running << std::endl;
- }
}
// Explicit instantiation of template functions
diff --git a/source/source_lcao/module_rt/norm_psi.cpp b/source/source_lcao/module_rt/norm_psi.cpp
index d29972dce3..f5a9c6c8b4 100644
--- a/source/source_lcao/module_rt/norm_psi.cpp
+++ b/source/source_lcao/module_rt/norm_psi.cpp
@@ -1,14 +1,13 @@
#include "norm_psi.h"
+#include "source_base/global_function.h" // ModuleBase::GlobalFunc::ZEROS
#include "source_base/module_container/ATen/kernels/blas.h"
#include "source_base/module_external/blas_connector.h"
#include "source_base/module_external/scalapack_connector.h"
-#include "source_base/global_function.h" // ModuleBase::GlobalFunc::ZEROS
-
+#include
#include
#include
-#include
namespace module_rt
{
@@ -494,33 +493,6 @@ void norm_psi_tensor_lapack(const Parallel_Orbitals* pv,
Cij.data>(),
nlocal); // Leading dimension of Cij
- if (print_matrix)
- {
- ct::Tensor Cij_print_cpu = Cij.to_device();
-
- ofs_running << "original Cij :" << std::endl;
- for (int i = 0; i < nlocal; i++)
- {
- const int in = i * nlocal;
- for (int j = 0; j < nlocal; j++)
- {
- double aa = Cij_print_cpu.data>()[in + j].real();
- double bb = Cij_print_cpu.data>()[in + j].imag();
- if (std::abs(aa) < 1e-8)
- {
- aa = 0.0;
- }
- if (std::abs(bb) < 1e-8)
- {
- bb = 0.0;
- }
- ofs_running << aa << "+" << bb << "i ";
- }
- ofs_running << std::endl;
- }
- ofs_running << std::endl;
- }
-
// Normalize Cij: set diagonal elements to 1/sqrt(Cij[i][i]), off-diagonal elements to 0
if (ct_device_type == ct::DeviceType::GpuDevice)
{
@@ -587,70 +559,6 @@ void norm_psi_tensor_lapack(const Parallel_Orbitals* pv,
&beta,
psi_k.data>(),
nlocal); // Leading dimension of psi_k
-
- if (print_matrix)
- {
- ct::Tensor Cij_print_cpu = Cij.to_device();
- ct::Tensor psi_k_cpu = psi_k.to_device();
- ct::Tensor tmp1_cpu = tmp1.to_device();
-
- ofs_running << " Cij:" << std::endl;
- for (int i = 0; i < nlocal; i++)
- {
- const int in = i * nlocal;
- for (int j = 0; j < nlocal; j++)
- {
- ofs_running << Cij_print_cpu.data>()[in + j].real() << "+"
- << Cij_print_cpu.data>()[in + j].imag() << "i ";
- }
- ofs_running << std::endl;
- }
- ofs_running << std::endl;
- ofs_running << std::endl;
- ofs_running << " psi_k:" << std::endl;
- for (int i = 0; i < nband; i++)
- {
- const int in = i * nlocal;
- for (int j = 0; j < nlocal; j++)
- {
- double aa = psi_k_cpu.data>()[in + j].real();
- double bb = psi_k_cpu.data>()[in + j].imag();
- if (std::abs(aa) < 1e-8)
- {
- aa = 0.0;
- }
- if (std::abs(bb) < 1e-8)
- {
- bb = 0.0;
- }
- ofs_running << aa << "+" << bb << "i ";
- }
- ofs_running << std::endl;
- }
- ofs_running << std::endl;
- ofs_running << " psi_k before normalization:" << std::endl;
- for (int i = 0; i < nband; i++)
- {
- const int in = i * nlocal;
- for (int j = 0; j < nlocal; j++)
- {
- double aa = tmp1_cpu.data>()[in + j].real();
- double bb = tmp1_cpu.data>()[in + j].imag();
- if (std::abs(aa) < 1e-8)
- {
- aa = 0.0;
- }
- if (std::abs(bb) < 1e-8)
- {
- bb = 0.0;
- }
- ofs_running << aa << "+" << bb << "i ";
- }
- ofs_running << std::endl;
- }
- ofs_running << std::endl;
- ofs_running << std::endl;
- }
}
// Explicit instantiation of template functions
diff --git a/source/source_lcao/module_rt/propagator_cn2.cpp b/source/source_lcao/module_rt/propagator_cn2.cpp
index 01614c9792..8ef07b0ebb 100644
--- a/source/source_lcao/module_rt/propagator_cn2.cpp
+++ b/source/source_lcao/module_rt/propagator_cn2.cpp
@@ -510,211 +510,73 @@ void Propagator::compute_propagator_cn2_tensor_lapack(const int nlocal,
// ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU
using ct_Device = typename ct::PsiToContainer::type;
- // (1) copy Htmp to Numerator & Denominator
- ct::Tensor Numerator(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal * nlocal}));
- Numerator.zero();
+ // Define coefficients
+ // beta1 = -i * dt/4 (for Numerator)
+ // beta2 = +i * dt/4 (for Denominator)
+ std::complex beta1 = {0.0, -0.25 * this->dt};
+ std::complex beta2 = {0.0, 0.25 * this->dt};
+
+ // ========================================================================
+ // Numerator = Stmp + beta1 * Htmp
+ // ========================================================================
+
+ // 1. Copy Stmp to U_operator
base_device::memory::synchronize_memory_op, Device, Device>()(
- Numerator.data>(),
- Htmp.data>(),
+ U_operator.data>(),
+ Stmp.data>(),
nlocal * nlocal);
+ // 2. U_operator = beta1 * Htmp + U_operator
+ ct::kernels::blas_axpy, ct_Device>()(nlocal * nlocal,
+ &beta1,
+ Htmp.data>(),
+ 1,
+ U_operator.data>(),
+ 1);
+
+ // ========================================================================
+ // Denominator = Stmp + beta2 * Htmp
+ // ========================================================================
+
ct::Tensor Denominator(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal * nlocal}));
- Denominator.zero();
+
+ // 1. Copy Stmp to Denominator
base_device::memory::synchronize_memory_op, Device, Device>()(
Denominator.data>(),
- Htmp.data>(),
+ Stmp.data>(),
nlocal * nlocal);
- if (print_matrix)
- {
- ct::Tensor Stmp_cpu = Stmp.to_device();
- ct::Tensor Numerator_cpu = Numerator.to_device();
-
- ofs_running << std::endl;
- ofs_running << " S matrix :" << std::endl;
- for (int i = 0; i < nlocal; i++)
- {
- const int in = i * nlocal;
- for (int j = 0; j < nlocal; j++)
- {
- ofs_running << Stmp_cpu.data>()[in + j].real() << "+"
- << Stmp_cpu.data>()[in + j].imag() << "i ";
- }
- ofs_running << std::endl;
- }
- ofs_running << std::endl;
- ofs_running << std::endl;
- ofs_running << " H matrix :" << std::endl;
- for (int i = 0; i < nlocal; i++)
- {
- const int in = i * nlocal;
- for (int j = 0; j < nlocal; j++)
- {
- ofs_running << Numerator_cpu.data>()[in + j].real() << "+"
- << Numerator_cpu.data>()[in + j].imag() << "i ";
- }
- ofs_running << std::endl;
- }
- ofs_running << std::endl;
- }
-
- // ->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
- // (2) compute Numerator & Denominator by GEADD
- // Numerator = Stmp - i*para * Htmp; beta1 = - para = -0.25 * this->dt
- // Denominator = Stmp + i*para * Htmp; beta2 = para = 0.25 * this->dt
- std::complex one = {1.0, 0.0};
- std::complex beta1 = {0.0, -0.25 * this->dt};
- std::complex beta2 = {0.0, 0.25 * this->dt};
-
- // Numerator = -i*para * Htmp
- ct::kernels::blas_scal, ct_Device>()(nlocal * nlocal,
- &beta1,
- Numerator.data>(),
- 1);
- // Numerator = Stmp + (-i*para * Htmp)
+ // 2. Denominator = beta2 * Htmp + Denominator
ct::kernels::blas_axpy, ct_Device>()(nlocal * nlocal,
- &one,
- Stmp.data>(),
- 1,
- Numerator.data>(),
- 1);
- // Denominator = i*para * Htmp
- ct::kernels::blas_scal, ct_Device>()(nlocal * nlocal,
&beta2,
- Denominator.data>(),
- 1);
- // Denominator = Stmp + (i*para * Htmp)
- ct::kernels::blas_axpy, ct_Device>()(nlocal * nlocal,
- &one,
- Stmp.data>(),
+ Htmp.data>(),
1,
Denominator.data>(),
1);
- if (print_matrix)
- {
- ct::Tensor Denominator_cpu = Denominator.to_device();
+ // ========================================================================
+ // Solve D * U = N, result overwrites N (which is U_operator)
+ // ========================================================================
- ofs_running << " beta=" << beta1 << std::endl;
- ofs_running << " fenmu:" << std::endl;
- for (int i = 0; i < nlocal; i++)
- {
- const int in = i * nlocal;
- for (int j = 0; j < nlocal; j++)
- {
- ofs_running << Denominator_cpu.data>()[in + j].real() << "+"
- << Denominator_cpu.data>()[in + j].imag() << "i ";
- }
- ofs_running << std::endl;
- }
- ofs_running << std::endl;
- }
-
- //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
- // (3) Next, invert Denominator
ct::Tensor ipiv(ct::DataType::DT_INT, ct_device_type, ct::TensorShape({nlocal}));
- ipiv.zero();
- // (3.1) compute ipiv
+ // No need to zero ipiv, it is output only.
+
+ // 1. LU Factorization of Denominator (In-place)
ct::kernels::lapack_getrf, ct_Device>()(nlocal,
nlocal,
Denominator.data>(),
nlocal,
ipiv.data());
- // Print ipiv
- if (print_matrix)
- {
- ct::Tensor ipiv_cpu = ipiv.to_device();
-
- ofs_running << " ipiv:" << std::endl;
- for (int i = 0; i < nlocal; i++)
- {
- ofs_running << ipiv_cpu.data()[i] << " ";
- }
- ofs_running << std::endl;
- }
-
- // (3.2) compute inverse matrix of Denominator
- ct::Tensor Denominator_inv = create_identity_matrix>(nlocal, ct_device_type);
+ // 2. Solve D * X = B
ct::kernels::lapack_getrs, ct_Device>()('N',
nlocal,
nlocal,
Denominator.data>(),
nlocal,
ipiv.data(),
- Denominator_inv.data>(),
+ U_operator.data>(),
nlocal);
-
- //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-
- // (4) U_operator = Denominator_inv * Numerator;
- std::complex one_gemm = {1.0, 0.0};
- std::complex zero_gemm = {0.0, 0.0};
- ct::kernels::blas_gemm, ct_Device>()('N',
- 'N',
- nlocal,
- nlocal,
- nlocal,
- &one_gemm,
- Denominator_inv.data>(),
- nlocal,
- Numerator.data>(),
- nlocal,
- &zero_gemm,
- U_operator.data>(),
- nlocal);
-
- if (print_matrix)
- {
- ct::Tensor Denominator_inv_cpu = Denominator_inv.to_device();
- ct::Tensor Numerator_cpu = Numerator.to_device();
- ct::Tensor U_operator_cpu = U_operator.to_device();
-
- ofs_running << " fenmu^-1:" << std::endl;
- for (int i = 0; i < nlocal; i++)
- {
- const int in = i * nlocal;
- for (int j = 0; j < nlocal; j++)
- {
- ofs_running << Denominator_inv_cpu.data>()[in + j].real() << "+"
- << Denominator_inv_cpu.data>()[in + j].imag() << "i ";
- }
- ofs_running << std::endl;
- }
- ofs_running << std::endl;
- ofs_running << " fenzi:" << std::endl;
- for (int i = 0; i < nlocal; i++)
- {
- const int in = i * nlocal;
- for (int j = 0; j < nlocal; j++)
- {
- ofs_running << Numerator_cpu.data>()[in + j].real() << "+"
- << Numerator_cpu.data>()[in + j].imag() << "i ";
- }
- ofs_running << std::endl;
- }
- ofs_running << std::endl;
- ofs_running << " U operator:" << std::endl;
- for (int i = 0; i < nlocal; i++)
- {
- const int in = i * nlocal;
- for (int j = 0; j < nlocal; j++)
- {
- double aa = U_operator_cpu.data>()[in + j].real();
- double bb = U_operator_cpu.data>()[in + j].imag();
- if (std::abs(aa) < 1e-8)
- {
- aa = 0.0;
- }
- if (std::abs(bb) < 1e-8)
- {
- bb = 0.0;
- }
- ofs_running << aa << "+" << bb << "i ";
- }
- ofs_running << std::endl;
- }
- }
}
// Explicit instantiation of template functions
diff --git a/source/source_lcao/module_rt/upsi.cpp b/source/source_lcao/module_rt/upsi.cpp
index 5892ae2a57..0982a77426 100644
--- a/source/source_lcao/module_rt/upsi.cpp
+++ b/source/source_lcao/module_rt/upsi.cpp
@@ -195,56 +195,6 @@ void upsi_tensor_lapack(const Parallel_Orbitals* pv,
&beta,
psi_k.data>(),
nlocal);
-
- if (print_matrix)
- {
- ct::Tensor psi_k_cpu = psi_k.to_device();
- ct::Tensor psi_k_laststep_cpu = psi_k_laststep.to_device();
-
- ofs_running << std::endl;
- ofs_running << " psi_k:" << std::endl;
- for (int i = 0; i < nband; i++)
- {
- const int in = i * nlocal;
- for (int j = 0; j < nlocal; j++)
- {
- double aa = psi_k_cpu.data>()[in + j].real();
- double bb = psi_k_cpu.data>()[in + j].imag();
- if (std::abs(aa) < 1e-8)
- {
- aa = 0.0;
- }
- if (std::abs(bb) < 1e-8)
- {
- bb = 0.0;
- }
- ofs_running << aa << "+" << bb << "i ";
- }
- ofs_running << std::endl;
- }
- ofs_running << std::endl;
- ofs_running << " psi_k_laststep:" << std::endl;
- for (int i = 0; i < nband; i++)
- {
- const int in = i * nlocal;
- for (int j = 0; j < nlocal; j++)
- {
- double aa = psi_k_laststep_cpu.data>()[in + j].real();
- double bb = psi_k_laststep_cpu.data>()[in + j].imag();
- if (std::abs(aa) < 1e-8)
- {
- aa = 0.0;
- }
- if (std::abs(bb) < 1e-8)
- {
- bb = 0.0;
- }
- ofs_running << aa << "+" << bb << "i ";
- }
- ofs_running << std::endl;
- }
- ofs_running << std::endl;
- }
}
// Explicit instantiation of template functions