/*
FSM_GA code: FSM_GA is a GPU-accelerated implementation of a genetic algorithm
(GA) for finding well-performing finite-state machines (FSM) for predicting
binary sequences.

Copyright (c) 2013, Texas State University. All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted for academic, research, experimental, or personal use provided
that the following conditions are met:

   * Redistributions of source code must retain the above copyright notice,
     this list of conditions, and the following disclaimer.
   * Redistributions in binary form must reproduce the above copyright notice,
     this list of conditions, and the following disclaimer in the documentation
     and/or other materials provided with the distribution.
   * Neither the name of Texas State University nor the names of its
     contributors may be used to endorse or promote products derived from this
     software without specific prior written permission.

For all other uses, please contact the Office for Commercialization and Industry
Relations at Texas State University <http://www.txstate.edu/ocir/>.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Authors: Martin Burtscher
*/


#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <sys/time.h>
#include <cuda.h>
#include <curand_kernel.h>

#define FSMSIZE 8
#define TABSIZE 32768
#define POPCNT 128
#define POPSIZE 512
#define CUTOFF 1

__device__ unsigned char bfsm[POPCNT][FSMSIZE * 2], same[POPCNT];
__device__ int smax[POPCNT], sbest[POPCNT], oldmax[POPCNT];

__global__
__launch_bounds__(POPSIZE, 4)
void FSMKernel(int length, unsigned short *data, int *best, curandState *rndstate)
{
  register int i, d, pc, s, bit, id, misses, rnd;
  unsigned long long myresult, current;
  unsigned char *fsm, state[TABSIZE];
  __shared__ unsigned char next[FSMSIZE * 2 * POPSIZE];

  fsm = &next[threadIdx.x * (FSMSIZE * 2)];

  if (threadIdx.x == 0) {
    oldmax[blockIdx.x] = 0;
    same[blockIdx.x] = 0;
  }
  __syncthreads();

  id = threadIdx.x + blockIdx.x * blockDim.x;
  curand_init(1234, id, 0, &rndstate[id]);

  // initial population
  for (i = 0; i < FSMSIZE * 2; i++) {
    fsm[i] = curand(&rndstate[id]) & (FSMSIZE - 1);
  }

  // run generations until cutoff times no improvement
  do {
    // reset miss counter and initial state
    memset(state, 0, TABSIZE);
    misses = 0;

    // evaluate FSM
    #pragma unroll
    for (i = 0; i < length & ~31; i++) {
      d = (int)data[i];
      pc = (d >> 1) & (TABSIZE - 1);
      bit = d & 1;
      s = (int)state[pc];
      misses += bit ^ (s & 1);
      state[pc] = fsm[s + s + bit];
    }
    for (; i < length; i++) {
      d = (int)data[i];
      pc = (d >> 1) & (TABSIZE - 1);
      bit = d & 1;
      s = (int)state[pc];
      misses += bit ^ (s & 1);
      state[pc] = fsm[s + s + bit];
    }

    // determine best FSM
    if (threadIdx.x == 0) {
      atomicAdd(&best[2], 1);  // increment generation count
      smax[blockIdx.x] = 0;
      sbest[blockIdx.x] = 0;
    }
    __syncthreads();
    atomicMax(&smax[blockIdx.x], length - misses);
    __syncthreads();
    if (length - misses == smax[blockIdx.x]) atomicMax(&sbest[blockIdx.x], threadIdx.x);
    __syncthreads();
    bit = 0;
    if (sbest[blockIdx.x] == threadIdx.x) {
      // check if there was an improvement
      same[blockIdx.x]++;
      if (oldmax[blockIdx.x] < smax[blockIdx.x]) {
        oldmax[blockIdx.x] = smax[blockIdx.x];
        same[blockIdx.x] = 0;
      }
    } else {
      // select 1/8 of threads for mutation (best FSM does crossover)
      if ((curand(&rndstate[id]) & 7) == 0) bit = 1;
    }
    __syncthreads();

    if (bit) {
      // mutate best FSM by flipping random bits with 1/4th probability
      for (i = 0; i < FSMSIZE * 2; i++) {
        rnd = curand(&rndstate[id]) & curand(&rndstate[id]);
        fsm[i] = (next[i + sbest[blockIdx.x] * FSMSIZE * 2] ^ rnd) & (FSMSIZE - 1);
      }
    } else {
      // crossover best FSM with random FSMs using 3/4 of bits from best FSM
      for (i = 0; i < FSMSIZE * 2; i++) {
        rnd = curand(&rndstate[id]) & curand(&rndstate[id]);
        fsm[i] = (fsm[i] & rnd) | (next[i + sbest[blockIdx.x] * FSMSIZE * 2] & ~rnd);
      }
    }
  } while (same[blockIdx.x] < CUTOFF);  // end of loop over generations

  // record best result of this block
  if (sbest[blockIdx.x] == threadIdx.x) {
    id = blockIdx.x;
    myresult = length - misses;
    myresult = (myresult << 32) + id;
    current = *((unsigned long long *)best);
    while (myresult > current) {
      atomicCAS((unsigned long long *)best, current, myresult);
      current = *((unsigned long long *)best);
    }
    for (i = 0; i < FSMSIZE * 2; i++) {
      bfsm[id][i] = fsm[i];
    }
  }
}

__global__
void MaxKernel(int *best)
{
  register int i, id;

  // copy best FSM state assignment over
  id = best[0];
  for (i = 0; i < FSMSIZE * 2; i++) {
    best[i + 3] = bfsm[id][i];
  }
}

int main(int argc, char *argv[])
{
  register int i, j, d, s, bit, length, pc, misses, besthits, generations;
  register unsigned short *data, *gdata;
  unsigned char state[TABSIZE], fsm[FSMSIZE * 2];
  int *gbest, best[FSMSIZE * 2 + 3], trans[FSMSIZE][2];
  register FILE *f;
  curandState *gstate;
  register double runtime;
  struct timeval starttime, endtime;

  if (argc != 2) {fprintf(stderr, "usage: %s trace_file_name\n", argv[0]); exit(-1);}

  assert(sizeof(unsigned short) == 2);
  assert((FSMSIZE & (FSMSIZE - 1)) == 0);
  assert((TABSIZE & (TABSIZE - 1)) == 0);
  assert((0 < FSMSIZE) && (FSMSIZE <= 256));
  assert((0 < TABSIZE) && (TABSIZE <= 32768));
  assert(0 < POPCNT);
  assert((0 < POPSIZE) && (POPSIZE <= 1024));
  assert(0 < CUTOFF);

  f = fopen(argv[1], "rb");
  if (f == NULL) {fprintf(stderr, "error: could not open file %s\n", argv[1]); exit(-1);}
  fseek(f, 0, SEEK_END);
  length = ftell(f);
  fseek(f, 0, SEEK_SET);
  data = (unsigned short *)malloc(length);
  if (data == NULL) {fprintf(stderr, "error: could not allocate %d bytes of memory\n", length); exit(-1);}
  length /= sizeof(unsigned short);
  if (length != fread(data, sizeof(unsigned short), length, f)) {fprintf(stderr, "error: could not read all entries from file\n"); exit(-1);}
  fclose(f);

  printf("%s\t#trace\n", argv[1]);
  printf("%d\t#fsmsize\n", FSMSIZE);
  printf("%d\t#entries\n", length);
  printf("%d\t#tabsize\n", TABSIZE);
  printf("%d\t#blocks\n", POPCNT);
  printf("%d\t#threads\n", POPSIZE);
  printf("%d\t#cutoff\n", CUTOFF);

  //  cudaSetDevice(1);
  cudaFuncSetCacheConfig(FSMKernel, cudaFuncCachePreferEqual);

  if (cudaSuccess != cudaMalloc((void **)&gdata, sizeof(unsigned short) * length)) fprintf(stderr, "could not allocate gdata\n");
  if (cudaSuccess != cudaMalloc((void **)&gbest, sizeof(int) * (FSMSIZE * 2 + 3))) fprintf(stderr, "could not allocate gbest\n");
  if (cudaSuccess != cudaMalloc((void **)&gstate, POPCNT * POPSIZE * sizeof(curandState))) fprintf(stderr, "could not allocate gstate\n");

  gettimeofday(&starttime, NULL);
  if (cudaSuccess != cudaMemcpy(gdata, data, sizeof(unsigned short) * length, cudaMemcpyHostToDevice)) fprintf(stderr, "copying of data to device failed\n");
  if (cudaSuccess != cudaMemset(gbest, 0, sizeof(int) * (FSMSIZE * 2 + 3))) fprintf(stderr, "zeroing out of gbest failed\n");
  FSMKernel<<<POPCNT, POPSIZE>>>(length, gdata, gbest, gstate);
  MaxKernel<<<1, 1>>>(gbest);
  if (cudaSuccess != cudaMemcpy(best, gbest, sizeof(int) * (FSMSIZE * 2 + 3), cudaMemcpyDeviceToHost)) fprintf(stderr, "copying of best from device failed\n");
  gettimeofday(&endtime, NULL);

  besthits = best[1];
  generations = best[2];

  runtime = endtime.tv_sec + endtime.tv_usec / 1000000.0 - starttime.tv_sec - starttime.tv_usec / 1000000.0;
  printf("%.6lf\t#runtime [s]\n", runtime);
  printf("%.6lf\t#throughput [Gtr/s]\n", 0.000000001 * POPSIZE * generations * length / runtime);

  // evaluate saturating up/down counter
  for (i = 0; i < FSMSIZE; i++) {
    fsm[i * 2 + 0] = i - 1;
    fsm[i * 2 + 1] = i + 1;
  }
  fsm[0] = 0;
  fsm[(FSMSIZE - 1) * 2 + 1] = FSMSIZE - 1;
  memset(state, 0, TABSIZE);
  misses = 0;
  for (i = 0; i < length; i++) {
    d = (int)data[i];
    pc = (d >> 1) & (TABSIZE - 1);
    bit = d & 1;
    s = (int)state[pc];
    misses += bit ^ (((s + s) / FSMSIZE) & 1);
    state[pc] = fsm[s + s + bit];
  }
  printf("%d\t#sudcnt hits\n", length-misses);
  printf("%d\t#GAfsm hits\n", besthits);

  printf("%.3lf\t#sudcnt hits [%]\n", 100.0 * (length - misses) / length);
  printf("%.3lf\t#GAfsm hits [%]\n\n", 100.0 * besthits / length);

  // verify result and count transitions
  for (i = 0; i < FSMSIZE; i++) {
    for (j = 0; j < 2; j++) {
      trans[i][j] = 0;
    }
  }
  for (i = 0; i < FSMSIZE * 2; i++) {
    fsm[i] = best[i + 3];
  }
  memset(state, 0, TABSIZE);
  misses = 0;
  for (i = 0; i < length; i++) {
    d = (int)data[i];
    pc = (d >> 1) & (TABSIZE - 1);
    bit = d & 1;
    s = (int)state[pc];
    trans[s][bit]++;
    misses += bit ^ (s & 1);
    state[pc] = (unsigned char)fsm[s + s + bit];
  }
  assert((length - misses) == besthits);

  // print FSM state assignment in R's ncol format
  for (bit = 0; bit < 2; bit++) {
    for (s = 0; s < FSMSIZE; s++) {
      d = fsm[s + s + bit];
      printf("%c%d %c%d %d\n", (s & 1) ? 'P' : 'N', s / 2, (d & 1) ? 'P' : 'N', d / 2, ((bit * 2) - 1) * trans[s][bit]);
    }
  }

  return 0;
}