/* Anu Rao 11/1/94 CSC 652
   LU decomposition using the incremental method; row parallelized
     Assumptions: global variables: matrix a */

#include "pt.h"

#include <stdio.h>
#include <stdlib.h>

#define max(a,b) ((a) > (b) ? (a) : (b))

double **a;

int asize,nworkers;

pt_gate_t gate;

void LU(pt_arg_t *info)
{
  int myID=pt_myid(info);
  int i, j, k;
  int startrow, endrow, goof;

  /* define starting & ending points for processor myID */
  startrow = (myID * asize)/nworkers;
  endrow = startrow + (asize/nworkers) - 1;
  
  for ( k = 0; k <= asize; k++ ) {
    goof = max(k+1,startrow);
    for ( i = goof; i <= endrow; i++ ) 
      a[i][k] /= a[k][k];  
    pt_gate_sync(&gate);

    /* update l's and u's by one term */
    for ( i = goof; i <= endrow; i++ ) {
      for ( j = k+1; j < asize; j++ ) 
        a[i][j] -= a[i][k] * a[k][j];
    }  
    pt_gate_sync(&gate);
  }
}

int main(int argc,char *argv[])
{
  int i, j;

  if (argc!=3) {
    asize=8;
    nworkers=4;
  } else {
    /* read in size of matrix, number of processors */
    asize = atoi(argv[1]);
    nworkers = atoi(argv[2]);
  }

  /* allocate a to be an n by n matrix */
  a = (double **)malloc(asize*sizeof(double *));
  for (i = 0; i < asize; i++) {
    a[i] = (double *)malloc(asize*sizeof(double));
  } 

  /* set up the gate */
  pt_gate_init(&gate,nworkers);

  /* initialize a */
  for (i = 0; i < asize; i++) {
    for (j = 0; j < asize; j++) {
      a[i][j] = 1.0;
      if (i == j) a[i][j] = 2.0;
    }
  }

  /* do the decomposition */
  pt_fork(nworkers,LU,NULL,NULL);

  /* print results */
  for (i = 0; i < asize; i++) {
    for (j = 0; j < asize; j++) {
      printf("%8.3lf ",a[i][j]);
    }
    printf("\n");
  }

  return(0);
}

/* EOF ludcmp.c */

