/* ************************** 
   cnv2.c

   Created on: June 11, 2011
   Author: Michael Hirsch
   ************************** */
#ifndef CNV2_C
#define CNV2_C

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <complex.h>
#include <fftw3.h>
#include <sys/time.h>
#include <string.h>

#include "cnv2.h"
#include "utils.h"


cnvmat * cnv2mat(matrix * k, size sz, char cnv_type[5]){
  
  // Allocate parameters of convolution
  cnvmat * cnvobj = (cnvmat*) malloc(sizeof(cnvmat));
  size sfft;
  size sy;
  size sf;
  size sx;
  size offset;
 
 // Check what is f and what is x assuming size(f) < size(x)
  if ( (sz.m >= k->size.m) && (sz.n >= k->size.n) ) {
    cnvobj->id = "F";  
    sf = k->size;
    sx = sz;
  }
  else if ( (sz.m < k->size.m) && (sz.n < k->size.n) ) {
    cnvobj->id = "X";
    sx = k->size;
    sf = sz;
  }
  else {
    fprintf(stderr, "Size missmatch!\n");
    exit(EXIT_FAILURE);
  }

  // Check type of convolution
  if (strncmp(cnv_type, "valid", 5) == 0) {
    sy.m     = sx.m - sf.m + 1;
    sy.n     = sx.n - sf.n + 1;
    sfft.m   = sx.m;
    sfft.n   = sx.n;
    offset.m = sf.m - 1;
    offset.n = sf.n - 1;
    }
  else if (strncmp(cnv_type, "same", 4) == 0) {
    sy.m   = sx.m;
    sy.n   = sx.n;
    sfft.m = sx.m + sf.m - 1;
    sfft.n = sx.n + sf.n - 1;
    offset.m = floor(0.5 * sf.m);
    offset.n = floor(0.5 * sf.n);
  }
  else if (strncmp(cnv_type, "full", 4) == 0) {
    sy.m   = sx.m + sf.m - 1;
    sy.n   = sx.n + sf.n - 1;
    sfft.m = sx.m + sf.m - 1;
    sfft.n = sx.n + sf.n - 1;
    offset.m = 0;
    offset.n = 0;

    fprintf(stderr, "Not yet implemented!\n");
    exit(EXIT_FAILURE);
  }
  else if (strncmp(cnv_type, "circ", 4) == 0) {
    sy.m   = sx.m;
    sy.n   = sx.n;
    sfft.m = sx.m;
    sfft.n = sx.m;
    offset.m = floor(0.5 * sf.m);
    offset.n = floor(0.5 * sf.n);

    fprintf(stderr, "Not yet implemented!\n");
    exit(EXIT_FAILURE);
  }
  else {
    fprintf(stderr, "Not a valid convolution type!\n");
    exit(EXIT_FAILURE);
  }

  // Pad f and x to be sized a power of 2 for efficient comp. of FFT
  // Not sure if this is particularly clever, padding the array 
  // might take as long as for FFTW3 to compute the FFT 
  // for the time being I keep it as it is as it's essential for CUDA
  // see http://software.intel.com/en-us/articles/fft-length-and-layout-advisor/
  //sfft.m = pow(2, ceil( log(sfft.m) / log(2) ));
  //sfft.n = pow(2, ceil( log(sfft.n) / log(2) ));

  // Do in place padding, TODO: set offsets in switch above
  size offset_tmp;
  offset_tmp.m = 0;
  offset_tmp.n = 0;

  // add additional padding to ensure that FFT(fp) fits in mem space of fp
  // note that size(FFT(fp)) = (sp.n/2 + 1) * 2 (factor 2 since complex)
  size sp;
  sp.m = sfft.m;
  sp.n = sfft.n + 2;

  matrix * fp = pad(k, sp, offset_tmp);

  // Compute FFT, use FFT for real-valued signals
  // we do in-place transform hence in- and ouput pointer must be equal
  // need void pointer since input and output are of different type 
  cmatrix * fft = (cmatrix*) fftw_malloc(sizeof(cmatrix));
  // for out of place transforms, seems to need the same runtime
  // y->mat = (cmplx*) fftw_malloc(sfft.m * (sfft.n/2+1)*2 * sizeof(y->mat));
  // for in place transforms
  void * pt_tmp = fp->mat;  
  fft->mat = pt_tmp;

  const fftw_plan plan_fwd = fftw_plan_dft_r2c_2d(sfft.m, sfft.n,
						  fp->mat, fft->mat, 
						  FFTW_ESTIMATE);


  fftw_execute(plan_fwd);

  const fftw_plan plan_bwd = fftw_plan_dft_c2r_2d(sfft.m, sfft.n,
						  fft->mat, fp->mat, 
						  FFTW_ESTIMATE);
    
  // Save fft of padded kernel in cnvmat as well as all other relevant stuff
  // If possible save bwd plan also in cnvmat
  /*
  matrix * ft = (matrix*) malloc(sizeof(matrix));
  ft->mat = (real*) malloc(sfft.m * 2*(sfft.n/2 + 1) * sizeof(real));
  
  fftw_plan plan_bwd = fftw_plan_dft_c2r_2d(ft->size.m, ft->size.n, 
					    y->mat, ft->mat, FFTW_ESTIMATE);
  fftw_execute(plan_bwd);
  fftw_destroy_plan(plan_bwd);
  */

  // Save everything in cnvmat
  //cnvobj->cnv_type = cnv_type;
  cnvobj->offset   = offset;
  cnvobj->sx	   = sx;
  cnvobj->sy	   = sy;
  cnvobj->sf	   = sf;
  cnvobj->sz	   = sz;
  cnvobj->sfft	   = sfft;
  cnvobj->sp       = sp;
  cnvobj->fft	   = fft;
  cnvobj->plan_fwd = plan_fwd;
  cnvobj->plan_bwd = plan_bwd;

  //printf("ID of cnvmat: %s \n", cnvobj->id);
  //printf("Type of cnvmat: %s \n", cnvobj->cnv_type);

  return cnvobj;
}


matrix * cnv(cnvmat * cnvobj, matrix * z){
  // Input is either PSF or image, the output is the convolved image
  // Do padding and compute fft
  size offset_tmp;
  offset_tmp.m = 0;
  offset_tmp.n = 0;

  matrix * zp = pad(z, cnvobj->sp, offset_tmp);
 
  cmatrix * zp_fft = (cmatrix*) fftw_malloc(sizeof(cmatrix));
  void * pt_tmp1 = zp->mat;  
  zp_fft->mat = pt_tmp1;

  // compute Fourier transform
  fftw_execute_dft_r2c(cnvobj->plan_fwd, zp->mat, zp_fft->mat);
  
  // do modulation in Fourier domain in-place! 
  // Divide by number of FFT elements which is not done by FFTW
  for (int i=0; i < (cnvobj->sp.m * cnvobj->sp.n/2 ); i++){
    zp_fft->mat[i] *= (cnvobj->fft->mat[i] / (cnvobj->sfft.m * cnvobj->sfft.n ));
  }

  // compute inverse Fourier transform
  matrix * yp = (matrix*) fftw_malloc(sizeof(matrix));
  void * pt_tmp2 = zp_fft->mat;  
  yp->mat = pt_tmp2;
  yp->size = cnvobj->sp;

  fftw_execute_dft_c2r(cnvobj->plan_bwd, zp_fft->mat, yp->mat);

  // Do cropping
  matrix * y = crop(yp, cnvobj->sy, cnvobj->offset);

  return y;
}


matrix * cnvtp(cnvmat * cnvobj, matrix * z){
  // Input is either PSF or image, the output is the convolved image
  // Do padding and compute fft

  matrix * zp = pad(z, cnvobj->sp, cnvobj->offset);
 
  cmatrix * zp_fft = (cmatrix*) fftw_malloc(sizeof(cmatrix));
  void * pt_tmp1 = zp->mat;  
  zp_fft->mat = pt_tmp1;

  // compute Fourier transform
  fftw_execute_dft_r2c(cnvobj->plan_fwd, zp->mat, zp_fft->mat);  
  
  // do modulation in Fourier domain in-place!
  for (int i=0; i < (cnvobj->sp.m * cnvobj->sp.n/2 ); i++){
    zp_fft->mat[i] *= (conj(cnvobj->fft->mat[i]) / (cnvobj->sfft.m*cnvobj->sfft.n ));
  }
  
  // compute inverse Fourier transform
  matrix * yp = (matrix*) fftw_malloc(sizeof(matrix));
  void * pt_tmp2 = zp_fft->mat;  
  yp->mat = pt_tmp2;
  yp->size = cnvobj->sp;

  fftw_execute_dft_c2r(cnvobj->plan_bwd, zp_fft->mat, yp->mat);

  // Do cropping
  size offset_tmp;
  offset_tmp.m = 0;
  offset_tmp.n = 0;

  matrix * y = crop(yp, cnvobj->sz, offset_tmp);

  return y;
}


matrix * pad(matrix * in, size sfft, size offset){
  // Allocate memory for padded matrix 
  matrix * out = (matrix*) fftw_malloc(sizeof(matrix));
  // Set new size
  out->size = sfft;
  // Do copying and padding
  out->mat = (real*) calloc(sfft.m * sfft.n, sizeof(out->mat));

  for (int i=0; i < in->size.m; i++){
    for (int j=0; j < in->size.n; j++){
      out->mat[IDX(i+offset.m,j+offset.n, out->size.n)] = in->mat[IDX(i,j,in->size.n)];
    }
  }
  return out;
}


matrix * crop(matrix * in, size sfft, size offset){
  // Allocate memory for padded matrix 
  matrix * out = (matrix*) malloc(sizeof(matrix));
  // Set new size
  out->size = sfft;
  // Do cropping
  out->mat = (real*) malloc(sfft.m * sfft.n * sizeof(out->mat));
  for (int i=0; i < sfft.m; i++){
    for (int j=0; j < sfft.n; j++){
      out->mat[IDX(i,j,out->size.n)] = in->mat[IDX(i+offset.m,j+offset.n,in->size.n)];
    }
  }
  return out;
}

#endif