#ifndef VIENNACL_LINALG_KERNELS_COMPRESSED_MATRIX_SOURCE_HPP_
#define VIENNACL_LINALG_KERNELS_COMPRESSED_MATRIX_SOURCE_HPP_
//Automatically generated file from auxiliary-directory, do not edit manually!
namespace viennacl
{
 namespace linalg
 {
  namespace kernels
  {
const char * const compressed_matrix_align4_vec_mul = 
"__kernel void vec_mul(\n"
"          __global const unsigned int * row_indices,\n"
"          __global const uint4 * column_indices, \n"
"          __global const float4 * elements,\n"
"          __global const float * vector,  \n"
"          __global float * result,\n"
"          unsigned int size)\n"
"{ \n"
"  float dot_prod;\n"
"  unsigned int start, next_stop;\n"
"  uint4 col_idx;\n"
"  float4 tmp_vec;\n"
"  float4 tmp_entries;\n"
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
"  {\n"
"    dot_prod = 0.0f;\n"
"    start = row_indices[row] / 4;\n"
"    next_stop = row_indices[row+1] / 4;\n"
"    for (unsigned int i = start; i < next_stop; ++i)\n"
"    {\n"
"      col_idx = column_indices[i];\n"
"      tmp_entries = elements[i];\n"
"      tmp_vec.x = vector[col_idx.x];\n"
"      tmp_vec.y = vector[col_idx.y];\n"
"      tmp_vec.z = vector[col_idx.z];\n"
"      tmp_vec.w = vector[col_idx.w];\n"
"      dot_prod += dot(tmp_entries, tmp_vec);\n"
"    }\n"
"    result[row] = dot_prod;\n"
"  }\n"
"}\n"
; //compressed_matrix_align4_vec_mul

const char * const compressed_matrix_align1_row_scaling_1 = 
"__kernel void row_scaling_1(\n"
"          __global const unsigned int * row_indices,\n"
"          __global const unsigned int * column_indices, \n"
"          __global const float * elements,\n"
"          __global float * diag_M_inv,\n"
"          unsigned int size) \n"
"{ \n"
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
"  {\n"
"    float dot_prod = 0.0f;\n"
"    unsigned int row_end = row_indices[row+1];\n"
"    for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
"      dot_prod += fabs(elements[i]);\n"
"    diag_M_inv[row] = 1.0f / dot_prod;\n"
"  }\n"
"}\n"
; //compressed_matrix_align1_row_scaling_1

const char * const compressed_matrix_align1_lu_forward = 
" \n"
"// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n"
"__kernel void lu_forward(\n"
"          __global const unsigned int * row_indices,\n"
"          __global const unsigned int * column_indices, \n"
"          __global const float * elements,\n"
"          __local  int * buffer,                              \n"
"          __local  float * vec_entries,   //a memory block from vector\n"
"          __global float * vector,\n"
"          unsigned int size) \n"
"{\n"
"  int waiting_for; //block index that must be finished before the current thread can start\n"
"  unsigned int waiting_for_index;\n"
"  int block_offset;\n"
"  unsigned int col;\n"
"  unsigned int row;\n"
"  unsigned int row_index_end;\n"
"  \n"
"  //backward substitution: one thread per row in blocks of get_global_size(0)\n"
"  for (unsigned int block_num = 0; block_num <= size / get_global_size(0); ++block_num)\n"
"  {\n"
"    block_offset = block_num * get_global_size(0);\n"
"    row = block_offset + get_global_id(0);\n"
"    buffer[get_global_id(0)] = 0; //set flag to 'undone'\n"
"    waiting_for = -1;\n"
"    if (row < size)\n"
"    {\n"
"      vec_entries[get_global_id(0)] = vector[row];\n"
"      waiting_for_index = row_indices[row];\n"
"      row_index_end = row_indices[row+1];\n"
"    }\n"
"    \n"
"    if (get_global_id(0) == 0)\n"
"      buffer[get_global_size(0)] = 1;\n"
"    //try to eliminate all lines in the block. \n"
"    //in worst case scenarios, in each step only one line can be substituted, thus loop\n"
"    for (unsigned int k = 0; k<get_global_size(0); ++k)\n"
"    {\n"
"      barrier(CLK_LOCAL_MEM_FENCE);\n"
"      if (row < size) //valid index?\n"
"      {\n"
"        if (waiting_for >= 0)\n"
"        {\n"
"          if (buffer[waiting_for] == 1)\n"
"            waiting_for = -1;\n"
"        }\n"
"        \n"
"        if (waiting_for == -1) //substitution not yet done, check whether possible\n"
"        {\n"
"          //check whether reduction is possible:\n"
"          for (unsigned int j = waiting_for_index; j < row_index_end; ++j)\n"
"          {\n"
"            col = column_indices[j];\n"
"            if (col < block_offset) //index valid, but not from current block\n"
"              vec_entries[get_global_id(0)] -= elements[j] * vector[col];\n"
"            else if (col < row)  //index is from current block\n"
"            {\n"
"              if (buffer[col - block_offset] == 0) //entry is not yet calculated\n"
"              {\n"
"                waiting_for = col - block_offset;\n"
"                waiting_for_index = j;\n"
"                break;\n"
"              }\n"
"              else  //updated entry is available in shared memory:\n"
"                vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];\n"
"            }\n"
"          }\n"
"          \n"
"          if (waiting_for == -1)  //this row is done\n"
"          {\n"
"            buffer[get_global_id(0)] = 1;\n"
"            waiting_for = -2; //magic number: thread is finished\n"
"          }\n"
"        } \n"
"      } //row < size\n"
"      else\n"
"        buffer[get_global_id(0)] = 1; //work done (because there is no work to be done at all...)\n"
"      ///////// check whether all threads are done. If yes, exit loop /////////////\n"
"      \n"
"      if (buffer[get_global_id(0)] == 0)\n"
"        buffer[get_global_size(0)] = 0;\n"
"      barrier(CLK_LOCAL_MEM_FENCE);\n"
"      \n"
"      if (buffer[get_global_size(0)] > 0)  //all threads break this loop simultaneously\n"
"        break;\n"
"      if (get_global_id(0) == 0)\n"
"        buffer[get_global_size(0)] = 1;\n"
"    } //for k\n"
"    \n"
"    //write to vector:\n"
"    if (row < size)\n"
"      vector[row] = vec_entries[get_global_id(0)];\n"
"    \n"
"    barrier(CLK_GLOBAL_MEM_FENCE);\n"
"  } //for block_num\n"
"}\n"
; //compressed_matrix_align1_lu_forward

const char * const compressed_matrix_align1_bicgstab_kernel2 = 
"void helper_bicgstab_kernel2_parallel_reduction( __local float * tmp_buffer )\n"
"{\n"
"  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
"  {\n"
"    barrier(CLK_LOCAL_MEM_FENCE);\n"
"    if (get_local_id(0) < stride)\n"
"      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
"  }\n"
"}\n"
"//////// inner products:\n"
"float bicgstab_kernel2_inner_prod(\n"
"          __global const float * vec1,\n"
"          __global const float * vec2,\n"
"          unsigned int size,\n"
"          __local float * tmp_buffer)\n"
"{\n"
"  float tmp = 0;\n"
"  unsigned int i_end = ((size - 1) / get_local_size(0) + 1) * get_local_size(0);\n"
"  for (unsigned int i = get_local_id(0); i < i_end; i += get_local_size(0))\n"
"  {\n"
"    if (i < size)\n"
"      tmp += vec1[i] * vec2[i];\n"
"  }\n"
"  tmp_buffer[get_local_id(0)] = tmp;\n"
"  \n"
"  helper_bicgstab_kernel2_parallel_reduction(tmp_buffer);\n"
"  barrier(CLK_LOCAL_MEM_FENCE);\n"
"  return tmp_buffer[0];\n"
"}\n"
"__kernel void bicgstab_kernel2(\n"
"          __global const float * tmp0,\n"
"          __global const float * tmp1,\n"
"          __global const float * r0star, \n"
"          __global const float * s, \n"
"          __global float * p, \n"
"          __global float * result,\n"
"          __global float * residual,\n"
"          __global const float * alpha,\n"
"          __global float * ip_rr0star,\n"
"          __global float * error_estimate,\n"
"          __local float * tmp_buffer,\n"
"          unsigned int size) \n"
"{ \n"
"  float omega_local = bicgstab_kernel2_inner_prod(tmp1, s, size, tmp_buffer) / bicgstab_kernel2_inner_prod(tmp1, tmp1, size, tmp_buffer);\n"
"  float alpha_local = alpha[0];\n"
"  \n"
"  //result += alpha * p + omega * s;\n"
"  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
"    result[i] += alpha_local * p[i] + omega_local * s[i];\n"
"  //residual = s - omega * tmp1;\n"
"  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
"    residual[i] = s[i] - omega_local * tmp1[i];\n"
"  //new_ip_rr0star = viennacl::linalg::inner_prod(residual, r0star);\n"
"  float new_ip_rr0star = bicgstab_kernel2_inner_prod(residual, r0star, size, tmp_buffer);\n"
"  float beta = (new_ip_rr0star / ip_rr0star[0]) * (alpha_local / omega_local);\n"
"  \n"
"  //p = residual + beta * (p - omega*tmp0);\n"
"  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
"    p[i] = residual[i] + beta * (p[i] - omega_local * tmp0[i]);\n"
"  //compute norm of residual:\n"
"  float new_error_estimate = bicgstab_kernel2_inner_prod(residual, residual, size, tmp_buffer);\n"
"  barrier(CLK_GLOBAL_MEM_FENCE);\n"
"  //update values:\n"
"  if (get_global_id(0) == 0)\n"
"  {\n"
"    error_estimate[0] = new_error_estimate;\n"
"    ip_rr0star[0] = new_ip_rr0star;\n"
"  }\n"
"}\n"
; //compressed_matrix_align1_bicgstab_kernel2

const char * const compressed_matrix_align1_lu_backward = 
"// compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format\n"
"__kernel void lu_backward(\n"
"          __global const unsigned int * row_indices,\n"
"          __global const unsigned int * column_indices, \n"
"          __global const float * elements,\n"
"          __local  int * buffer,                              \n"
"          __local  float * vec_entries,   //a memory block from vector\n"
"          __global float * vector,\n"
"          unsigned int size) \n"
"{\n"
"  int waiting_for; //block index that must be finished before the current thread can start\n"
"  unsigned int waiting_for_index;\n"
"  unsigned int block_offset;\n"
"  unsigned int col;\n"
"  unsigned int row;\n"
"  unsigned int row_index_end;\n"
"  float diagonal_entry = 42;\n"
"  \n"
"  //forward substitution: one thread per row in blocks of get_global_size(0)\n"
"  for (int block_num = size / get_global_size(0); block_num > -1; --block_num)\n"
"  {\n"
"    block_offset = block_num * get_global_size(0);\n"
"    row = block_offset + get_global_id(0);\n"
"    buffer[get_global_id(0)] = 0; //set flag to 'undone'\n"
"    waiting_for = -1;\n"
"    \n"
"    if (row < size)\n"
"    {\n"
"      vec_entries[get_global_id(0)] = vector[row];\n"
"      waiting_for_index = row_indices[row];\n"
"      row_index_end = row_indices[row+1];\n"
"      diagonal_entry = column_indices[waiting_for_index];\n"
"    }\n"
"    \n"
"    if (get_global_id(0) == 0)\n"
"       buffer[get_global_size(0)] = 1;\n"
"    //try to eliminate all lines in the block. \n"
"    //in worst case scenarios, in each step only one line can be substituted, thus loop\n"
"    for (unsigned int k = 0; k<get_global_size(0); ++k)\n"
"    {\n"
"      barrier(CLK_LOCAL_MEM_FENCE);\n"
"      if (row < size) //valid index?\n"
"      {\n"
"        if (waiting_for >= 0)\n"
"        {\n"
"          if (buffer[waiting_for] == 1)\n"
"            waiting_for = -1;\n"
"        }\n"
"        \n"
"        if (waiting_for == -1) //substitution not yet done, check whether possible\n"
"        {\n"
"          //check whether reduction is possible:\n"
"          for (unsigned int j = waiting_for_index; j < row_index_end; ++j)\n"
"          {\n"
"            col = column_indices[j];\n"
"            barrier(CLK_LOCAL_MEM_FENCE);\n"
"            if (col >= block_offset + get_global_size(0))  //index valid, but not from current block\n"
"              vec_entries[get_global_id(0)] -= elements[j] * vector[col];\n"
"            else if (col > row)  //index is from current block\n"
"            {\n"
"              if (buffer[col - block_offset] == 0) //entry is not yet calculated\n"
"              {\n"
"                waiting_for = col - block_offset;\n"
"                waiting_for_index = j;\n"
"                break;\n"
"              }\n"
"              else  //updated entry is available in shared memory:\n"
"                vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];\n"
"            }\n"
"            else if (col == row)\n"
"              diagonal_entry = elements[j];\n"
"          }\n"
"          \n"
"          if (waiting_for == -1)  //this row is done\n"
"          {\n"
"            if (row == 0)\n"
"              vec_entries[get_global_id(0)] /= elements[0];\n"
"            else\n"
"              vec_entries[get_global_id(0)] /= diagonal_entry;\n"
"            buffer[get_global_id(0)] = 1;\n"
"            waiting_for = -2; //magic number: thread is finished\n"
"          }\n"
"        } \n"
"      } //row < size\n"
"      else\n"
"        buffer[get_global_id(0)] = 1; //work done (because there is no work to be done at all...)\n"
"      \n"
"      ///////// check whether all threads are done. If yes, exit loop /////////////\n"
"      if (buffer[get_global_id(0)] == 0)\n"
"        buffer[get_global_size(0)] = 0;\n"
"      barrier(CLK_LOCAL_MEM_FENCE);\n"
"      \n"
"      if (buffer[get_global_size(0)] > 0)  //all threads break the loop simultaneously\n"
"        break;\n"
"      if (get_global_id(0) == 0)\n"
"        buffer[get_global_size(0)] = 1;\n"
"    } //for k\n"
"    if (row < size)\n"
"      vector[row] = vec_entries[get_global_id(0)];\n"
"      //vector[row] = diagonal_entry;\n"
"    \n"
"    //if (row == 0)\n"
"      //vector[0] = diagonal_entry;\n"
"      //vector[0] = elements[0];\n"
"    barrier(CLK_GLOBAL_MEM_FENCE);\n"
"  } //for block_num\n"
"}\n"
; //compressed_matrix_align1_lu_backward

const char * const compressed_matrix_align1_vec_mul = 
"__kernel void vec_mul(\n"
"          __global const unsigned int * row_indices,\n"
"          __global const unsigned int * column_indices, \n"
"          __global const float * elements,\n"
"          __global const float * vector,  \n"
"          __global float * result,\n"
"          unsigned int size) \n"
"{ \n"
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
"  {\n"
"    float dot_prod = 0.0f;\n"
"    unsigned int row_end = row_indices[row+1];\n"
"    for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
"      dot_prod += elements[i] * vector[column_indices[i]];\n"
"    result[row] = dot_prod;\n"
"  }\n"
"}\n"
; //compressed_matrix_align1_vec_mul

const char * const compressed_matrix_align1_jacobi = 
"__kernel void jacobi(\n"
" __global const unsigned int * row_indices,\n"
" __global const unsigned int * column_indices,\n"
" __global const float * elements,\n"
" float weight,\n"
" __global const float * old_result,\n"
" __global float * new_result,\n"
" __global const float * rhs,\n"
" unsigned int size)\n"
" {\n"
"  float sum, diag=1;\n"
"  int col;\n"
"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
"  {\n"
"    sum = 0;\n"
"    for (unsigned int j = row_indices[i]; j<row_indices[i+1]; j++)\n"
"    {\n"
"      col = column_indices[j];\n"
"      if (i == col)\n"
"	diag = elements[j];\n"
"      else \n"
"	sum += elements[j] * old_result[col]; \n"
"    } \n"
"      new_result[i] = weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]; \n"
"   } \n"
" } \n"
; //compressed_matrix_align1_jacobi

const char * const compressed_matrix_align1_jacobi_precond = 
"__kernel void jacobi_precond(\n"
"          __global const unsigned int * row_indices,\n"
"          __global const unsigned int * column_indices, \n"
"          __global const float * elements,\n"
"          __global float * diag_M_inv,\n"
"          unsigned int size) \n"
"{ \n"
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
"  {\n"
"    float diag = 1.0f;\n"
"    unsigned int row_end = row_indices[row+1];\n"
"    for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
"    {\n"
"      if (row == column_indices[i])\n"
"      {\n"
"        diag = elements[i];\n"
"        break;\n"
"      }\n"
"    }\n"
"    diag_M_inv[row] = 1.0f / diag;\n"
"  }\n"
"}\n"
; //compressed_matrix_align1_jacobi_precond

const char * const compressed_matrix_align1_bicgstab_kernel1 = 
"void helper_bicgstab_kernel1_parallel_reduction( __local float * tmp_buffer )\n"
"{\n"
"  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
"  {\n"
"    barrier(CLK_LOCAL_MEM_FENCE);\n"
"    if (get_local_id(0) < stride)\n"
"      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
"  }\n"
"}\n"
"//////// inner products:\n"
"float bicgstab_kernel1_inner_prod(\n"
"          __global const float * vec1,\n"
"          __global const float * vec2,\n"
"          unsigned int size,\n"
"          __local float * tmp_buffer)\n"
"{\n"
"  float tmp = 0;\n"
"  unsigned int i_end = ((size - 1) / get_local_size(0) + 1) * get_local_size(0);\n"
"  for (unsigned int i = get_local_id(0); i < i_end; i += get_local_size(0))\n"
"  {\n"
"    if (i < size)\n"
"      tmp += vec1[i] * vec2[i];\n"
"  }\n"
"  tmp_buffer[get_local_id(0)] = tmp;\n"
"  \n"
"  helper_bicgstab_kernel1_parallel_reduction(tmp_buffer);\n"
"  barrier(CLK_LOCAL_MEM_FENCE);\n"
"  return tmp_buffer[0];\n"
"}\n"
"__kernel void bicgstab_kernel1(\n"
"          __global const float * tmp0,\n"
"          __global const float * r0star, \n"
"          __global const float * residual,\n"
"          __global float * s,\n"
"          __global float * alpha,\n"
"          __global const float * ip_rr0star,\n"
"          __local float * tmp_buffer,\n"
"          unsigned int size) \n"
"{ \n"
"  float alpha_local = ip_rr0star[0] / bicgstab_kernel1_inner_prod(tmp0, r0star, size, tmp_buffer);\n"
"  \n"
"  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
"    s[i] = residual[i] - alpha_local * tmp0[i];\n"
"  \n"
"  if (get_global_id(0) == 0)\n"
"    alpha[0] = alpha_local;\n"
"}\n"
; //compressed_matrix_align1_bicgstab_kernel1

const char * const compressed_matrix_align1_row_scaling_2 = 
"__kernel void row_scaling_2(\n"
"          __global const unsigned int * row_indices,\n"
"          __global const unsigned int * column_indices, \n"
"          __global const float * elements,\n"
"          __global float * diag_M_inv,\n"
"          unsigned int size) \n"
"{ \n"
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
"  {\n"
"    float dot_prod = 0.0f;\n"
"    float temp = 0.0f;\n"
"    unsigned int row_end = row_indices[row+1];\n"
"    for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
"    {\n"
"      temp = elements[i];\n"
"      dot_prod += temp * temp;\n"
"    }\n"
"    diag_M_inv[row] = 1.0f / sqrt(dot_prod);\n"
"  }\n"
"}\n"
; //compressed_matrix_align1_row_scaling_2

const char * const compressed_matrix_align8_vec_mul = 
"__kernel void vec_mul(\n"
"          __global const unsigned int * row_indices,\n"
"          __global const uint8 * column_indices, \n"
"          __global const float8 * elements,\n"
"          __global const float * vector,  \n"
"          __global float * result,\n"
"          unsigned int size)\n"
"{ \n"
"  float dot_prod;\n"
"  unsigned int start, next_stop;\n"
"  uint8 col_idx;\n"
"  float8 tmp_vec;\n"
"  float8 tmp_entries;\n"
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
"  {\n"
"    dot_prod = 0.0f;\n"
"    start = row_indices[row] / 8;\n"
"    next_stop = row_indices[row+1] / 8;\n"
"    for (unsigned int i = start; i < next_stop; ++i)\n"
"    {\n"
"      col_idx = column_indices[i];\n"
"      tmp_entries = elements[i];\n"
"      tmp_vec.s0 = vector[col_idx.s0];\n"
"      tmp_vec.s1 = vector[col_idx.s1];\n"
"      tmp_vec.s2 = vector[col_idx.s2];\n"
"      tmp_vec.s3 = vector[col_idx.s3];\n"
"      tmp_vec.s4 = vector[col_idx.s4];\n"
"      tmp_vec.s5 = vector[col_idx.s5];\n"
"      tmp_vec.s6 = vector[col_idx.s6];\n"
"      tmp_vec.s7 = vector[col_idx.s7];\n"
"      dot_prod += dot(tmp_entries.lo, tmp_vec.lo);\n"
"      dot_prod += dot(tmp_entries.hi, tmp_vec.hi);\n"
"    }\n"
"    result[row] = dot_prod;\n"
"  }\n"
"}\n"
; //compressed_matrix_align8_vec_mul

  }  //namespace kernels
 }  //namespace linalg
}  //namespace viennacl
#endif
