#include "libscl.h"
#include <omp.h>

using namespace std;
using namespace scl;

int main(int argc, char** argp, char** envp)
{
  realmat a, b;
  if(!vecread("a.dat",a) || !vecread("b.dat",b)) error("Read failed");
  if (a.ncol() != b.nrow()) error("Not conformable");

  const INTEGER arows = a.nrow();
  const INTEGER acols = b.nrow();
  const INTEGER bcols = b.ncol();

  realmat r(arows,bcols,0.0);

  INTEGER chunk = 2;
  INTEGER tid;

  string msg;

  #pragma omp parallel shared(chunk) private(tid, msg)
  {
    tid = omp_get_thread_num();
    #pragma omp for schedule(static, chunk)
    for (INTEGER j=1; j<=bcols; ++j) {
      msg = "Thread" + fmt('d',3,tid)();
      msg += " did col" + fmt('d',3,j)() + "\n";
      cout << msg;
      for (INTEGER k=1; k<=acols; ++k) {
        for (INTEGER i=1; i<=arows; ++i) {
          r(i,j) += a(i,k)*b(k,j);
        }
      }
    }
  }

  std::cout << a << b << r << '\n';

  return 0;
}
