#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP #define VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP /* ========================================================================= Copyright (c) 2010-2011, Institute for Microelectronics, Institute for Analysis and Scientific Computing, TU Wien. ----------------- ViennaCL - The Vienna Computing Library ----------------- Project Head: Karl Rupp rupp@iue.tuwien.ac.at (A list of authors and contributors can be found in the PDF manual) License: MIT (X11), see file LICENSE in the base directory ============================================================================= */ /** @file viennacl/linalg/detail/spai/spai-dynamic.hpp @brief Implementation of a dynamic SPAI. Provides the routines for automatic pattern updates Experimental in 1.2.x. SPAI code contributed by Nikolay Lukash */ #include #include #include #include #include #include #include #include //#include "block_matrix.hpp" //#include "block_vector.hpp" //#include "benchmark-utils.hpp" #include "boost/numeric/ublas/vector.hpp" #include "boost/numeric/ublas/matrix.hpp" #include "boost/numeric/ublas/matrix_proxy.hpp" #include "boost/numeric/ublas/vector_proxy.hpp" #include "boost/numeric/ublas/storage.hpp" #include "boost/numeric/ublas/io.hpp" #include "boost/numeric/ublas/lu.hpp" #include "boost/numeric/ublas/triangular.hpp" #include "boost/numeric/ublas/matrix_expression.hpp" // ViennaCL includes #include "viennacl/linalg/prod.hpp" #include "viennacl/matrix.hpp" #include "viennacl/compressed_matrix.hpp" #include "viennacl/linalg/compressed_matrix_operations.hpp" #include "viennacl/linalg/matrix_operations.hpp" #include "viennacl/scalar.hpp" #include "viennacl/linalg/cg.hpp" #include "viennacl/linalg/inner_prod.hpp" #include "viennacl/linalg/ilu.hpp" #include "viennacl/ocl/backend.hpp" #include "viennacl/linalg/detail/spai/block_matrix.hpp" #include "viennacl/linalg/detail/spai/block_vector.hpp" #include "viennacl/linalg/detail/spai/qr.hpp" #include "viennacl/linalg/detail/spai/spai_tag.hpp" #include "viennacl/linalg/kernels/spai_source.h" #include "viennacl/linalg/kernels/spai_kernels.h" namespace viennacl { namespace linalg { namespace detail { namespace spai { typedef std::pair PairT; struct CompareSecond{ bool operator()(const PairT& left, const PairT& right) { return static_cast(left.second) > static_cast(right.second); } }; /** @brief Initializes Dense matrix from sparse one * @param A_in Riginal sparse matrix * @param J Set of column indices * @param I Set of row indices * @param A_out dense matrix output */ template void initProjectSubMatrix(const SparseMatrixType& A_in, const std::vector& J, std::vector& I, DenseMatrixType& A_out){ typedef typename DenseMatrixType::value_type ScalarType; A_out.resize(I.size(), J.size(), false); for(size_t j = 0; j < J.size(); ++j){ for(size_t i = 0; i < I.size(); ++i){ A_out(i,j) = A_in(I[i],J[j]); } } } /** @brief Determines if element ind is in set {J} * @param J current set * @param ind current element */ bool isInIndexSet(const std::vector& J, const unsigned int& ind){ return (std::find(J.begin(), J.end(), ind) != J.end()); } /** @brief Composition of new matrix R, that is going to be used in Least Square problem solving * @param A matrix Q'*A(I, \\tilde J), where \\tilde J - set of new column indices * @param R_n matrix A_Iu_J_u after QR factorization * @param R previously composed matrix R */ template void composeNewR(const MatrixType& A, const MatrixType& R_n, MatrixType& R){ typedef typename MatrixType::value_type ScalarType; size_t row_n = R_n.size1() - (A.size1() - R.size2()); MatrixType C = boost::numeric::ublas::zero_matrix(R.size1() + row_n, R.size2() + A.size2()); //write original R to new Composite R boost::numeric::ublas::project(C, boost::numeric::ublas::range(0,R.size1()), boost::numeric::ublas::range(0, R.size2())) += R; //write upper part of Q'*A_I_\hatJ, all columns and number of rows that equals to R.size2() boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(R.size2(), R.size2() + A.size2())) += boost::numeric::ublas::project(A, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(0, A.size2())); //adding decomposed(QR) block to Composite R if(R_n.size1() > 0 && R_n.size2() > 0) boost::numeric::ublas::project(C, boost::numeric::ublas::range(R.size2(), R.size1() + row_n), boost::numeric::ublas::range(R.size2(), R.size2() + A.size2())) += R_n; R = C; } /** @brief Composition of new vector of coefficients beta from QR factorizations(necessary for Q recovery) * @param v_n new vector from last QR factorization * @param v composition of previous vectors from QR factorizations */ template void composeNewVector(const VectorType& v_n, VectorType& v){ typedef typename VectorType::value_type ScalarType; VectorType w = boost::numeric::ublas::zero_vector(v.size() + v_n.size()); boost::numeric::ublas::project(w, boost::numeric::ublas::range(0, v.size())) += v; boost::numeric::ublas::project(w, boost::numeric::ublas::range(v.size(), v.size() + v_n.size())) += v_n; v = w; } /** @brief Computation of Euclidean norm for sparse vector * @param v initial sparse vector * @param norm scalar that represents Euclidean norm */ template void sparse_norm_2(const SparseVectorType& v, ScalarType& norm){ for(typename SparseVectorType::const_iterator vec_it = v.begin(); vec_it != v.end(); ++vec_it){ norm += (vec_it->second)*(vec_it->second); } norm = std::sqrt(norm); } /** @brief Dot product of two sparse vectors * @param v1 initial sparse vector * @param v2 initial sparse vector * @param res_v scalar that represents dot product result */ template void sparse_inner_prod(const SparseVectorType& v1, const SparseVectorType& v2, ScalarType& res_v){ typename SparseVectorType::const_iterator v_it1 = v1.begin(); typename SparseVectorType::const_iterator v_it2 = v2.begin(); while((v_it1 != v1.end())&&(v_it2 != v2.end())){ if(v_it1->first == v_it2->first){ res_v += (v_it1->second)*(v_it2->second); ++v_it1; ++v_it2; } else if(v_it1->first < v_it2->first){ ++v_it1; } else ++v_it2; } } /** @brief Building a new set of column indices J_u, cf. Kallischko dissertation p.31 * @param A_v_c vectorized column-wise initial matrix * @param res residual vector * @param J set of column indices * @param J_u set of new column indices * @param tag SPAI tag with parameters */ template bool buildAugmentedIndexSet(const std::vector& A_v_c, const SparseVectorType& res, std::vector& J, std::vector& J_u, const spai_tag& tag){ std::vector > p; size_t cur_size = 0; ScalarType inprod, norm2; //print_sparse_vector(res); for(typename SparseVectorType::const_iterator res_it = res.begin(); res_it != res.end(); ++res_it){ if(!isInIndexSet(J, res_it->first) && (std::abs(res_it->second) > tag.getResidualThreshold())){ inprod = norm2 = 0; sparse_inner_prod(res, A_v_c[res_it->first], inprod); sparse_norm_2(A_v_c[res_it->first], norm2); p.push_back(std::pair(res_it->first, (inprod*inprod)/(norm2*norm2))); } } std::sort(p.begin(), p.end(), CompareSecond()); while ((cur_size < J.size())&&(p.size() > 0)) { J_u.push_back(p[0].first); p.erase(p.begin()); cur_size++; } p.clear(); return (cur_size > 0); } /** @brief Building a new indices to current set of row indices I_n, cf. Kallischko dissertation p.32 * @param A_v_c vectorized column-wise initial matrix * @param I set of previous determined row indices * @param J_n set of new column indices * @param I_n set of new indices */ template void buildNewRowSet(const std::vector& A_v_c, const std::vector& I, const std::vector& J_n, std::vector& I_n){ for(size_t i = 0; i < J_n.size(); ++i){ for(typename SparseVectorType::const_iterator col_it = A_v_c[J_n[i]].begin(); col_it!=A_v_c[J_n[i]].end(); ++col_it){ if(!isInIndexSet(I, col_it->first)&&!isInIndexSet(I_n, col_it->first)){ I_n.push_back(col_it->first); } } } } /** @brief Composition of new block for QR factorization cf. Kallischko dissertation p.82, figure 4.7 * @param A_I_J previously composed block * @param A_I_J_u matrix Q'*A(I, \\tilde J), where \\tilde J - set of new column indices * @param A_I_u_J_u is composition of lower part A(I, \\tilde J) and A(\\tilde I, \\tilde J) - new block for QR decomposition */ template void QRBlockComposition(const MatrixType& A_I_J, const MatrixType& A_I_J_u, MatrixType& A_I_u_J_u){ typedef typename MatrixType::value_type ScalarType; size_t row_n1 = A_I_J_u.size1() - A_I_J.size2(); size_t row_n2 = A_I_u_J_u.size1(); size_t row_n = row_n1 + row_n2; size_t col_n = A_I_J_u.size2(); MatrixType C = boost::numeric::ublas::zero_matrix(row_n, col_n); boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, row_n1), boost::numeric::ublas::range(0, col_n)) += boost::numeric::ublas::project(A_I_J_u, boost::numeric::ublas::range(A_I_J.size2(), A_I_J_u.size1()), boost::numeric::ublas::range(0, col_n)); boost::numeric::ublas::project(C, boost::numeric::ublas::range(row_n1, row_n1 + row_n2), boost::numeric::ublas::range(0, col_n)) += A_I_u_J_u; A_I_u_J_u = C; } /** @brief CPU-based dynamic update for SPAI preconditioner * @param A initial sparse matrix * @param A_v_c vectorized column-wise initial matrix * @param g_res container of residuals for all columns * @param g_is_update container with identificators that shows which block should be modified * @param g_I container of row index sets for all columns * @param g_J container of column index sets for all columns * @param g_b_v container of vectors of beta for Q recovery(cf. Golub Van Loan "Matrix Computations", 3rd edition p.211) * @param g_A_I_J container of block matrices from previous update * @param tag SPAI configuration tag */ template void block_update(const SparseMatrixType& A, const std::vector& A_v_c, std::vector& g_res, std::vector& g_is_update, std::vector >& g_I, std::vector >& g_J, std::vector& g_b_v, std::vector& g_A_I_J, spai_tag const & tag){ typedef typename DenseMatrixType::value_type ScalarType; //set of new column indices std::vector > g_J_u(g_J.size()); //set of new row indices std::vector > g_I_u(g_J.size()); //matrix A(I, \tilde J), cf. Kallischko p.31-32 std::vector g_A_I_J_u(g_J.size()); //matrix A(\tilde I, \tilde J), cf. Kallischko std::vector g_A_I_u_J_u(g_J.size()); //new vector of beta coefficients from QR factorization std::vector g_b_v_u(g_J.size()); #ifdef _OPENMP #pragma omp parallel for #endif for(std::size_t i = 0; i < g_J.size(); ++i){ if(g_is_update[i]){ if(buildAugmentedIndexSet(A_v_c, g_res[i], g_J[i], g_J_u[i], tag)){ //initialize matrix A_I_\hatJ initProjectSubMatrix(A, g_J_u[i], g_I[i], g_A_I_J_u[i]); //multiplication of Q'*A_I_\hatJ apply_q_trans_mat(g_A_I_J[i], g_b_v[i], g_A_I_J_u[i]); //building new rows index set \hatI buildNewRowSet(A_v_c, g_I[i], g_J_u[i], g_I_u[i]); initProjectSubMatrix(A, g_J_u[i], g_I_u[i], g_A_I_u_J_u[i]); //composition of block for new QR factorization QRBlockComposition(g_A_I_J[i], g_A_I_J_u[i], g_A_I_u_J_u[i]); //QR factorization single_qr(g_A_I_u_J_u[i], g_b_v_u[i]); //composition of new R and new vector b_v composeNewR(g_A_I_J_u[i], g_A_I_u_J_u[i], g_A_I_J[i]); composeNewVector(g_b_v_u[i], g_b_v[i]); //composition of new sets: I and J g_J[i].insert(g_J[i].end(), g_J_u[i].begin(), g_J_u[i].end()); g_I[i].insert(g_I[i].end(), g_I_u[i].begin(), g_I_u[i].end()); }else{ g_is_update[i] = false; } } } } /**************************************************** GPU SPAI Update ****************************************************************/ //performs Q'*A(I, \tilde J) on GPU /** @brief Performs multiplication Q'*A(I, \\tilde J) on GPU * @param g_J_u container of sets of new column indices * @param g_I container of row indices * @param g_A_I_J_vcl block matrix composed from previous blocks, they are blocks of R * @param g_bv_vcl block of beta vectors * @param g_A_I_J_u_vcl block of matrices A(I, \\tilde J) * @param g_is_update indicators, that show if a certain block should be processed * @param cur_iter current iteration, used to make sure that kernel compiles just once */ template void block_q_multiplication(const std::vector >& g_J_u, const std::vector >& g_I, block_matrix& g_A_I_J_vcl, block_vector& g_bv_vcl, block_matrix& g_A_I_J_u_vcl, std::vector& g_is_update, const unsigned int cur_iter){ unsigned int local_r_n, local_c_n, sz_blocks; get_max_block_size(g_I, local_r_n); get_max_block_size(g_J_u, local_c_n); //for debug std::vector matrix_dims(g_I.size()*2, static_cast(0)); std::vector blocks_ind(g_I.size() + 1, static_cast(0)); compute_blocks_size(g_I, g_J_u, sz_blocks, blocks_ind, matrix_dims); std::vector con_A_I_J(sz_blocks, static_cast(0)); /*if(cur_iter == 1){ //if first run - compile the program std::string block_q_kernel_file_name = "kernels/spai/block_q.cl"; std::string block_q_kernel_source; read_kernel_from_file(block_q_kernel_file_name, block_q_kernel_source); viennacl::ocl::program & block_q_prog = viennacl::ocl::current_context().add_program(block_q_kernel_source.c_str(), "block_q_kernel_source"); block_q_prog.add_kernel("block_q_mult"); // }*/ viennacl::ocl::handle g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, static_cast(sizeof(cl_uint)*(g_is_update.size())), &(g_is_update[0])); viennacl::ocl::kernel& block_q_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai::program_name(), "block_q_mult"); block_q_kernel.local_work_size(0, local_c_n); block_q_kernel.global_work_size(0, 256); viennacl::ocl::enqueue(block_q_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), g_bv_vcl.handle(), g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_A_I_J_u_vcl.handle1(), g_is_update_vcl, viennacl::ocl::local_mem(static_cast(sizeof(ScalarType)*(local_r_n*local_c_n))), static_cast(g_I.size()))); } /** @brief Assembly of container of index row sets: I_q, row indices for new "QR block" * @param g_I container of row indices * @param g_J container of column indices * @param g_I_u container of new row indices * @param g_I_q container of row indices for new QR blocks */ void assemble_qr_row_inds(const std::vector >& g_I, const std::vector > g_J, const std::vector >& g_I_u, std::vector >& g_I_q){ #ifdef _OPENMP #pragma omp parallel for #endif for(std::size_t i = 0; i < g_I.size(); ++i){ for(std::size_t j = g_J[i].size(); j < g_I[i].size(); ++j){ g_I_q[i].push_back(g_I[i][j]); } for(std::size_t j = 0; j < g_I_u[i].size(); ++j){ g_I_q[i].push_back(g_I_u[i][j]); } } } /** @brief Performs assembly for new QR block * @param g_J container of column indices * @param g_I container of row indices * @param g_J_u container of new column indices * @param g_I_u container of new row indices * @param g_I_q container of row indices for new QR blocks * @param g_A_I_J_u_vcl blocks of Q'*A(I, \\tilde J) * @param matrix_dimensions array with matrix dimensions for all blocks * @param g_A_I_u_J_u_vcl blocks A(\\tilde I, \\tilde J) * @param g_is_update container with update indicators * @param is_empty_block indicator if all previous blocks A(\\tilde I, \\tilde J) - are empty, in case if they are empty kernel with smaller number of arguments is used * @param cur_iter current iteration, used to make sure that kernel compiles just once */ template void assemble_qr_block( const std::vector >& g_J, const std::vector >& g_I, const std::vector >& g_J_u, const std::vector >& g_I_u, std::vector >& g_I_q, block_matrix& g_A_I_J_u_vcl, viennacl::ocl::handle& matrix_dimensions, block_matrix& g_A_I_u_J_u_vcl, std::vector& g_is_update, const bool is_empty_block, const unsigned int cur_iter){ //std::vector > g_I_q(g_I.size()); assemble_qr_row_inds(g_I, g_J, g_I_u, g_I_q); unsigned int sz_blocks; std::vector matrix_dims(g_I.size()*2, static_cast(0)); std::vector blocks_ind(g_I.size() + 1, static_cast(0)); compute_blocks_size(g_I_q, g_J_u, sz_blocks, blocks_ind, matrix_dims); std::vector con_A_I_J_q(sz_blocks, static_cast(0)); /*if(cur_iter == 1){ std::string qr_block_asm_file_name = "kernels/spai/qr_block_assembly_g.cl"; std::string qr_block_asm_source; read_kernel_from_file(qr_block_asm_file_name, qr_block_asm_source); viennacl::ocl::program & qr_block_asm_prog = viennacl::ocl::current_context().add_program(qr_block_asm_source.c_str(), "qr_block_assembly_kernel_source"); qr_block_asm_prog.add_kernel("block_qr_assembly"); //extra kernel in case of empty block A_I_u_J_u std::string qr_block_asm_file_name_1 = "kernels/spai/qr_block_assembly_1_g.cl"; std::string qr_block_asm_source_1; read_kernel_from_file(qr_block_asm_file_name_1, qr_block_asm_source_1); viennacl::ocl::program & qr_block_asm_prog_1 = viennacl::ocl::current_context().add_program(qr_block_asm_source_1.c_str(), "qr_block_assembly_kernel_source_1"); qr_block_asm_prog_1.add_kernel("block_qr_assembly_1"); }*/ block_matrix g_A_I_J_q_vcl; //need to allocate memory for QR block g_A_I_J_q_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, static_cast(sizeof(ScalarType)*sz_blocks), &(con_A_I_J_q[0])); g_A_I_J_q_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, static_cast(sizeof(cl_uint)*2*static_cast(g_I.size())), &(matrix_dims[0])); g_A_I_J_q_vcl.handle2() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, static_cast(sizeof(cl_uint)*2*static_cast(g_I.size() + 1)), &(blocks_ind[0])); viennacl::ocl::handle g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, static_cast(sizeof(cl_uint)*(g_is_update.size())), &(g_is_update[0])); if(!is_empty_block){ viennacl::ocl::kernel& qr_assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai::program_name(), "block_qr_assembly"); qr_assembly_kernel.local_work_size(0, 1); qr_assembly_kernel.global_work_size(0, 256); viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions, g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), g_A_I_J_u_vcl.handle1(), g_A_I_u_J_u_vcl.handle(), g_A_I_u_J_u_vcl.handle2(), g_A_I_u_J_u_vcl.handle1(), g_A_I_J_q_vcl.handle(), g_A_I_J_q_vcl.handle2(), g_A_I_J_q_vcl.handle1(), g_is_update_vcl, static_cast(g_I.size()))); }else{ viennacl::ocl::kernel& qr_assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai::program_name(), "block_qr_assembly_1"); qr_assembly_kernel.local_work_size(0, 1); qr_assembly_kernel.global_work_size(0, 256); viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions, g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), g_A_I_J_u_vcl.handle1(), g_A_I_J_q_vcl.handle(), g_A_I_J_q_vcl.handle2(), g_A_I_J_q_vcl.handle1(), g_is_update_vcl, static_cast(g_I.size()))); } g_A_I_u_J_u_vcl.handle() = g_A_I_J_q_vcl.handle(); g_A_I_u_J_u_vcl.handle1() = g_A_I_J_q_vcl.handle1(); g_A_I_u_J_u_vcl.handle2() = g_A_I_J_q_vcl.handle2(); } /** @brief Performs assembly for new R matrix on GPU * @param g_I container of row indices * @param g_J container of column indices * @param g_A_I_J_vcl container of block matrices from previous update * @param g_A_I_J_u_vcl container of block matrices Q'*A(I, \\tilde J) * @param g_A_I_u_J_u_vcl container of block matrices QR factored on current iteration * @param g_bv_vcl block of beta vectors from previous iteration * @param g_bv_vcl_u block of updated beta vectors got after recent QR factorization * @param g_is_update container with identificators that shows which block should be modified * @param cur_iter current iteration, used to make sure that kernel compiles just once */ template void assemble_r(std::vector >& g_I, std::vector >& g_J, block_matrix& g_A_I_J_vcl, block_matrix& g_A_I_J_u_vcl, block_matrix& g_A_I_u_J_u_vcl, block_vector& g_bv_vcl, block_vector& g_bv_vcl_u, std::vector& g_is_update, const unsigned int cur_iter){ std::vector matrix_dims(g_I.size()*2, static_cast(0)); std::vector blocks_ind(g_I.size() + 1, static_cast(0)); std::vector start_bv_r_inds(g_I.size() + 1, 0); unsigned int sz_blocks, bv_size; compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims); get_size(g_J, bv_size); init_start_inds(g_J, start_bv_r_inds); std::vector con_A_I_J_r(sz_blocks, static_cast(0)); std::vector b_v_r(bv_size, static_cast(0)); /*if(cur_iter == 1){ std::string r_block_asm_file_name = "kernels/spai/r_block_assembly_g.cl"; std::string r_block_asm_source; read_kernel_from_file(r_block_asm_file_name, r_block_asm_source); viennacl::ocl::program & r_block_asm_prog = viennacl::ocl::current_context().add_program(r_block_asm_source.c_str(), "r_block_assembly_kernel_source"); r_block_asm_prog.add_kernel("block_r_assembly"); std::string bv_block_asm_file_name = "kernels/spai/bv_block_assembly_g.cl"; std::string bv_block_asm_source; read_kernel_from_file(bv_block_asm_file_name, bv_block_asm_source); viennacl::ocl::program & bv_block_asm_prog = viennacl::ocl::current_context().add_program(bv_block_asm_source.c_str(), "bv_block_assembly_kernel_source"); bv_block_asm_prog.add_kernel("block_bv_assembly"); }*/ block_matrix g_A_I_J_r_vcl; block_vector g_bv_r_vcl; g_A_I_J_r_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, static_cast(sizeof(ScalarType)*sz_blocks), &(con_A_I_J_r[0])); g_A_I_J_r_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, static_cast(sizeof(cl_uint)*2*static_cast(g_I.size())), &(matrix_dims[0])); g_A_I_J_r_vcl.handle2() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, static_cast(sizeof(cl_uint)*2*static_cast(g_I.size() + 1)), &(blocks_ind[0])); g_bv_r_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, static_cast(sizeof(ScalarType)*bv_size), &(b_v_r[0])); g_bv_r_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, static_cast(sizeof(cl_uint)*(g_I.size() + 1)), &(start_bv_r_inds[0])); viennacl::ocl::handle g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, static_cast(sizeof(cl_uint)*(g_is_update.size())), &(g_is_update[0])); viennacl::ocl::kernel& r_assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai::program_name(), "block_r_assembly"); r_assembly_kernel.local_work_size(0, 1); r_assembly_kernel.global_work_size(0, 256); viennacl::ocl::enqueue(r_assembly_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(), g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), g_A_I_J_u_vcl.handle1(), g_A_I_u_J_u_vcl.handle(), g_A_I_u_J_u_vcl.handle2(), g_A_I_u_J_u_vcl.handle1(), g_A_I_J_r_vcl.handle(), g_A_I_J_r_vcl.handle2(), g_A_I_J_r_vcl.handle1(), g_is_update_vcl, static_cast(g_I.size()))); viennacl::ocl::kernel & bv_assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai::program_name(), "block_bv_assembly"); bv_assembly_kernel.local_work_size(0, 1); bv_assembly_kernel.global_work_size(0, 256); viennacl::ocl::enqueue(bv_assembly_kernel(g_bv_vcl.handle(), g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_bv_vcl_u.handle(), g_bv_vcl_u.handle1(), g_A_I_J_u_vcl.handle1(), g_bv_r_vcl.handle(), g_bv_r_vcl.handle1(), g_A_I_J_r_vcl.handle1(), g_is_update_vcl, static_cast(g_I.size()))); g_bv_vcl.handle() = g_bv_r_vcl.handle(); g_bv_vcl.handle1() = g_bv_r_vcl.handle1(); g_A_I_J_vcl.handle() = g_A_I_J_r_vcl.handle(); g_A_I_J_vcl.handle2() = g_A_I_J_r_vcl.handle2(); g_A_I_J_vcl.handle1() = g_A_I_J_r_vcl.handle1(); } /** @brief GPU-based block update * @param A sparse matrix * @param A_v_c vectorized column-wise initial matrix * @param g_is_update container with identificators that shows which block should be modified * @param g_res container of residuals for all columns * @param g_J container of column index sets for all columns * @param g_I container of row index sets for all columns * @param g_A_I_J_vcl container of block matrices from previous update * @param g_bv_vcl block of beta vectors from previous iteration * @param tag SPAI configuration tag * @param cur_iter current iteration, used to make sure that kernel compiles just once */ template void block_update(const viennacl::compressed_matrix& A, const std::vector& A_v_c, std::vector& g_is_update, std::vector& g_res, std::vector >& g_J, std::vector >& g_I, block_matrix& g_A_I_J_vcl, block_vector& g_bv_vcl, spai_tag const & tag, const unsigned int cur_iter){ //updated index set for columns std::vector > g_J_u(g_J.size()); //updated index set for rows std::vector > g_I_u(g_J.size()); //mixed index set of old and updated indices for rows std::vector > g_I_q(g_J.size()); //GPU memory for A_I_\hatJ block_matrix g_A_I_J_u_vcl; //GPU memory for A_\hatI_\hatJ block_matrix g_A_I_u_J_u_vcl; bool is_empty_block; //GPU memory for new b_v block_vector g_bv_u_vcl; #ifdef _OPENMP #pragma omp parallel for #endif for(std::size_t i = 0; i < g_J.size(); ++i){ if(g_is_update[i]){ if(buildAugmentedIndexSet(A_v_c, g_res[i], g_J[i], g_J_u[i], tag)){ buildNewRowSet(A_v_c, g_I[i], g_J_u[i], g_I_u[i]); } } } //assemble new A_I_J_u blocks on GPU and multiply them with Q' block_assembly(A, g_J_u, g_I, g_A_I_J_u_vcl, g_is_update, is_empty_block, cur_iter); //I have matrix A_I_J_u ready.. block_q_multiplication(g_J_u, g_I, g_A_I_J_vcl, g_bv_vcl, g_A_I_J_u_vcl, g_is_update, cur_iter); //assemble A_\hatI_\hatJ block_assembly(A, g_J_u, g_I_u, g_A_I_u_J_u_vcl, g_is_update, is_empty_block, cur_iter); assemble_qr_block(g_J, g_I, g_J_u, g_I_u, g_I_q, g_A_I_J_u_vcl, g_A_I_J_vcl.handle1(), g_A_I_u_J_u_vcl, g_is_update, is_empty_block, cur_iter); block_qr(g_I_q, g_J_u, g_A_I_u_J_u_vcl, g_bv_u_vcl, g_is_update, cur_iter); //concatanation of new and old indices #ifdef _OPENMP #pragma omp parallel for #endif for(std::size_t i = 0; i < g_J.size(); ++i){ g_J[i].insert(g_J[i].end(), g_J_u[i].begin(), g_J_u[i].end()); g_I[i].insert(g_I[i].end(), g_I_u[i].begin(), g_I_u[i].end()); } assemble_r(g_I, g_J, g_A_I_J_vcl, g_A_I_J_u_vcl, g_A_I_u_J_u_vcl, g_bv_vcl, g_bv_u_vcl, g_is_update, cur_iter); } } } } } #endif