#ifndef __NEEDLEMAN_WUNSCH_GLOBAL_MATCH_GPU_CUH__
#define __NEEDLEMAN_WUNSCH_GLOBAL_MATCH_GPU_CUH__

#include <vector_types.h>
#include <cuda_runtime.h>

#include "cuda_declarations.h"
#include "data_management.cuh"
#include "thread_manager.h"
#include "sequences.h"
#include "main_cu.h"
#include "exceptions.h"
#include "hi_res_timer.h"

#include "defines_gpu.cuh"



// File-local shorthands used by both kernels below. Every one of them is
// #undef'ed again at the end of this header.

// Number of matrix rows processed per stripe (alias of a project constant).
#define Y_STEPS     ALIGNMENT_MATCH_Y_STEPS
// Row stride of the per-pair index "startPosA" (winSize is declared outside this file).
#define WIN_SIZE    winSize
// Distance, in elements, between consecutive entries that belong to the same
// sequence pair in the AF / back / prePPL buffers (memOffset is declared outside this file).
#define MEM_OFFSET  memOffset
// Second dimension of the per-block shared arrays (alias of a project constant).
#define BLOCK_SIZE  ALIGNMENT_MATCH_BLOCK_SIZE

// Index of the X sequence handled by this thread (global thread index along x).
#define seqXNo      (blockIdx.x * blockDim.x + threadIdx.x)
// Index of the Y sequence handled by this thread (global thread index along y).
#define seqYNo      (blockIdx.y * blockDim.y + threadIdx.y)
// Start offset of this thread's X sequence, looked up in tex1Starts.
#define startX      (tex1Starts[seqXNo])
// Start offset of this thread's Y sequence, looked up in tex2Starts.
#define startY      (tex2Starts[seqYNo])

/*******************************************************************************
 * One 32-bit word of "back" holds 8 elements of 4 bits each.
 *
 * Layout of one 4-bit element (bits are numbered from the MOST significant
 * bit of the element, i.e. "bit 0" has the value 8 and "bit 3" the value 1):
 * -> bits 0 and 1 (direction):
 *      - 00 -> stop
 *      - 01 -> up
 *      - 10 -> left
 *      - 11 -> crosswise
 * -> bit 2:
 *      - 0 -> not continueUp
 *      - 1 -> continueUp
 * -> bit 3:
 *      - 0 -> not continueLeft
 *      - 1 -> continueLeft
 *
 * The 8 elements of a word belong to 8 consecutive rows of one column; the
 * element of the first row of the group is stored in the most significant
 * 4 bits of the word.
 *
 * BACK(x,y) expands to
 *   back[startPosA + ( ( ((y) + 8) / 8) * rowWidth + (x) + 1 ) * MEM_OFFSET]
 * and therefore relies on the names "back", "startPosA" and "rowWidth" being
 * in scope at the point of use.
 *
 * The "+ 8" and "+ 1" shift the addressing so that row -1 and column -1 are
 * valid: BACK(-1,-1) is the first element of the -1 row (and of the -1 column).
 * For y >= 0 all rows y with the same y / 8 map to the same word.
 *******************************************************************************/
#define BACK(x,y)   back[startPosA + ( ( ((y) + 8) / 8) * rowWidth + (x) + 1 ) * MEM_OFFSET]

/*******************************************************************************
 * NeedlemanWunschGlobalMatchKernel
 *
 * Forward (scoring) pass of the global alignment with gap-open (gapOp) and
 * gap-extension (gapEx) penalties. Each thread processes exactly one pair of
 * sequences: X sequence number seqXNo against Y sequence number seqYNo.
 *
 * The matrix is processed in horizontal stripes of Y_STEPS rows. Inside a
 * stripe the columns are visited from left to right and, inside a column, the
 * rows of the stripe from top to bottom.
 *
 * Parameters:
 *   AF       - global scratch buffer; entry x of this thread holds the A value
 *              (.x) and the F value (.y) of column x in the last row processed
 *              so far. It is read as the "up" neighbour of the next stripe and
 *              overwritten at the end of every column.
 *   back     - output; packed backtracking directions, addressed through the
 *              BACK(x,y) macro (8 rows x 4 bits per 32-bit word).
 *   scores   - output; scores[startPosA] receives the A value of the last
 *              computed cell.
 *   rowWidth - row stride used by the BACK(x,y) macro.
 *   border   - when true, threads with seqXNo > seqYNo return immediately
 *              (upper triangle of the pair grid is skipped).
 *
 * Names used but not declared in this file (tex1Starts, tex2Starts, texSeqsX,
 * texSeqsY, substitutionMatrix, lettersCount, gapOp, gapEx, winSize,
 * memOffset, SHORT_MIN, PACK_BYTES) must be provided by the including code.
 *
 * NOTE(review): the shared arrays are indexed with blockThread in their
 * second dimension, so this assumes blockDim.x * blockDim.y <= BLOCK_SIZE -
 * confirm against the launch configuration.
 ******************************************************************************/
__global__ void NeedlemanWunschGlobalMatchKernel(short2* AF, unsigned int* back, int* scores, short rowWidth, bool border = false)
{
    /***************************************************************************
     * |\xxx                                                                   *
     * | \xx    we do not compute x                                            *
     * |  \x                                                                   *
     * |___\                                                                   *
     ***************************************************************************/
    if(border && (seqXNo > seqYNo))
        return;

    //linear index of this thread inside its block: 0...(blockDim.x*blockDim.y-1)
    int blockThread = threadIdx.x + threadIdx.y * blockDim.x;

    //lengths of both sequences: difference of two consecutive start offsets
    short2 lengthXY;
    lengthXY.x = tex1Starts[seqXNo + 1] - startX;
    lengthXY.y = tex2Starts[seqYNo + 1] - startY;

    if((lengthXY.x == 0) || (lengthXY.y == 0))//if there is nothing to do -> quit
        return;

    //startPosA == index of this sequence pair; base offset into AF, back and scores
    int startPosA = seqYNo * WIN_SIZE + seqXNo;

    //initialization of the -1 row in A matrix
    // - 2 bytes for element of A matrix
    // - 2 bytes for element of F matrix
    for(short x = 0; x < lengthXY.x; x++)
    {
        short2 tmp;
        //(x + 1) because the first element should be -gapEx
        tmp.x = -gapEx * (x + 1);
        //F of the -1 row: chosen so that (F - gapEx) evaluates to SHORT_MIN
        tmp.y = SHORT_MIN + gapEx;
        AF[startPosA + x * MEM_OFFSET] = tmp;

        //fill the -1 row of "back" array
        //0000 0000 0000 0000 0000 0000 0000 1001 == 9
        //(lowest element = 1001 -> direction "left" with continueLeft set)
        BACK(x,-1) = 9;
    }

    //fill the -1 column of "back" array (one word per stripe of rows)
    for(short y = 0; y < lengthXY.y; y+=Y_STEPS)
    {
        //0110 0110 0110 0110 0110 0110 0110 0110 = 1717986918
        //(every element = 0110 -> direction "up" with continueUp set)
        BACK(-1,y) = 1717986918;
    }
    BACK(-1,-1) = 0; //stop element

    //one element of AE_shared consist of:
    // - one A element
    // - one E element
    //Row i holds the values of the left neighbour column for row i of the
    //current stripe; each thread reads and writes only column [blockThread].
    __shared__ short2 AE_shared[Y_STEPS][BLOCK_SIZE];
    //elements of Y sequence go to sharedYSeq (4 letters packed per int);
    //again each thread uses only column [blockThread]
    __shared__ int sharedYSeq[Y_STEPS/4][BLOCK_SIZE];


    //A (.x) and F (.y) of the cell computed most recently
    short2 AF_current;
    AF_current.x = 0;

    // |
    // |
    // |
    // V
    //loop over stripes of Y_STEPS rows
    for (short y = 0; y < lengthXY.y; y += Y_STEPS)
    {
        //.x: A value read from AF for the column visited most recently
        //    (before the first column: A of the -1 column in row y-1)
        //.y: A value of the up-left neighbour of the cell being computed
        short2 A_init_upleft;
        A_init_upleft.x = -gapEx * y;

        //initialization of the -1 column in A matrix
        // - one element of A matrix
        // - one element of E matrix
        for (short i = 0; i < Y_STEPS; i++)
        {
            short2 tmp;
            tmp.x = -gapEx * (y + i + 1);
            //E of the -1 column: chosen so that (E - gapEx) evaluates to SHORT_MIN
            tmp.y = SHORT_MIN + gapEx;
            AE_shared[i][blockThread] = tmp;
        }


        //we read elements of the Y sequence: Y_STEPS letters starting at row y,
        //4 letters per int. All Y_STEPS letters are fetched even when fewer
        //rows remain; the inner loop below only uses the first ymin of them.
        for (short i = 0; i < Y_STEPS/4; i++)
        {
            sharedYSeq[i][blockThread] = PACK_BYTES(tex1Dfetch(texSeqsY, startY + y + i*4 + 0),
                                                    tex1Dfetch(texSeqsY, startY + y + i*4 + 1),
                                                    tex1Dfetch(texSeqsY, startY + y + i*4 + 2),
                                                    tex1Dfetch(texSeqsY, startY + y + i*4 + 3));
        }


        //------>
        //loop over the columns of the stripe
        for (short x = 0; x < lengthXY.x; x++)
        {
            //the A value read from global memory for the previous column
            //becomes the up-left neighbour of the first row of this column
            A_init_upleft.y = A_init_upleft.x;

            //.x: letter of the X sequence for this column
            //.y: letter of the Y sequence for the current row
            char2 XYSeq;
            XYSeq.x = tex1Dfetch(texSeqsX, startX + x);

            //read from global memory: A and F of this column in the last row
            //of the previous stripe (or of the -1 row for the first stripe)
            short2 AF_up = AF[startPosA + x * MEM_OFFSET];

            //remember this A value: it is the up-left neighbour for the next column
            A_init_upleft.x = AF_up.x;

            short2 AE_left;         //A (.x) and E (.y) of the left neighbour
            int E_current;          //E of the current cell
            int similarity;         //up-left A + substitution score
            unsigned int back8 = 0; //packed back elements of this column (4 bits per row)
            short ymin = min(Y_STEPS, lengthXY.y - y); //(i < Y_STEPS) && (i + y < lengthY)
            //  |  /|  /|
            //  | / | / |
            //  |/  |/  V
            //  |  /|  /|
            //  | / | / |
            //  |/  |/  V
            for(short i = 0; i < ymin; i++)
            {
                AE_left = AE_shared[i][blockThread];

                //extract letter i of the stripe from its packed word:
                //letter i lives in byte ((15-i)%4) of word i/4, counting bytes
                //from the least significant one
                XYSeq.y = (sharedYSeq[i/4][blockThread] >> (((15-i)%4) * 8)) & 0xFF;

                similarity = substitutionMatrix[XYSeq.y*lettersCount + XYSeq.x];
                similarity += A_init_upleft.y;

                //E: best score ending with a move from the left
                //   (extend: E_left - gapEx, open: A_left - gapOp)
                E_current = max(AE_left.y - gapEx, AE_left.x - gapOp);
                //F: best score ending with a move from above
                //   (extend: F_up - gapEx, open: A_up - gapOp)
                AF_current.y = max(AF_up.y - gapEx, AF_up.x - gapOp);

                //A: maximum of E, F and the crosswise (up-left) score
                AF_current.x = max(E_current, AF_current.y);
                AF_current.x = max(AF_current.x, similarity);

                //"back" array: append one 4-bit element, most significant bit first
                //(left, up, continueUp, continueLeft); left+up together == crosswise
                back8 <<= 1;
                back8 |= ((AF_current.x==E_current) && (AF_current.x!=AF_current.y)) || (AF_current.x==similarity); //if go left
                back8 <<= 1;
                back8 |= (AF_current.x==AF_current.y) || (AF_current.x==similarity); //if go up
                back8 <<= 1;
                back8 |= (AF_current.y == (AF_up.y - gapEx)); //if continue up
                back8 <<= 1;
                back8 |= (E_current == (AE_left.y - gapEx)); //if continue left

                //initialize variables for next iterations
                short2 AE_tmp;
                AE_tmp.x = AF_current.x;
                AE_tmp.y = E_current;
                //this cell is the left neighbour of row i in the next column
                AE_shared[i][blockThread] = AE_tmp;
                //the old left neighbour is the up-left neighbour of the next row
                A_init_upleft.y = AE_left.x;
                //this cell is the up neighbour of the next row
                AF_up = AF_current;

            }

            //we want the last row of back8 to be completed:
            //when the stripe has fewer than Y_STEPS rows, move the elements to
            //the top of the word so that the first row is always in the most
            //significant 4 bits
            back8 <<= 4 * (Y_STEPS - ymin);

            //write variables to global memory for next loop
            //(A and F of the last computed row of this column)
            AF[startPosA + x * MEM_OFFSET] = AF_current;
            BACK(x,y) = back8;

        }
    }

    //here write result (A value of the last computed cell) to global memory
    scores[startPosA] = AF_current.x;
}


/*******************************************************************************
 * If you don't want to use the texture when backtracking then                 *
 * comment the two lines below.                                                *
 *                                                                             *
 * UNFORTUNATELY (OR NOT) WE CAN'T USE TEXTURE FOR THIS PURPOSE                *
 * BECAUSE THE MAXIMUM NUMBER OF TEXTURE ELEMENTS (NOT BYTES) IS 2^27          *
 * WHICH WE CAN EASILY EXCEED.                                                 *
 *******************************************************************************/
//#undef  BACK
//#define BACK(x,y)   tex1Dfetch(texBack, startPosA + ( ( ((y) + 8) / 8) * rowWidth + (x) + 1 ) * MEM_OFFSET)

/*******************************************************************************
 * One 32-bit word of "back" holds 8 elements of 4 bits each.
 *
 * Layout of one 4-bit element (bits are numbered from the MOST significant
 * bit of the element, i.e. "bit 0" has the value 8 and "bit 3" the value 1):
 * -> bits 0 and 1 (direction):
 *      - 00 -> stop
 *      - 01 -> up
 *      - 10 -> left
 *      - 11 -> crosswise
 * -> bit 2:
 *      - 0 -> not continueUp
 *      - 1 -> continueUp
 * -> bit 3:
 *      - 0 -> not continueLeft
 *      - 1 -> continueLeft
 *
 * The constants below are the corresponding values / masks for one element
 * that has been moved to the lowest 4 bits of a word.
 *******************************************************************************/

// direction bits == 00 (only referenced in comments of the kernel below)
#define STOP         0
// direction bits == 01 (binary 0100)
#define UP           4
// direction bits == 10 (binary 1000)
#define LEFT         8
// direction bits == 11 (binary 1100), i.e. UP and LEFT together
#define CROSSWISE   12
// mask selecting the two direction bits of an element
#define DIRECTION   12
// mask of the continueUp bit (binary 0010)
#define CONTIN_UP    2
// mask of the continueLeft bit (binary 0001)
#define CONTIN_LEFT  1
// mask selecting one whole 4-bit element
#define ELEMENT     15

/*******************************************************************************
 * NeedlemanWunschGlobalBackKernel
 *
 * Backtracking pass: walks the packed "back" array written by the match
 * kernel from the cell (lengthX-1, lengthY-1) until a STOP element is read,
 * and records the visited path. Each thread processes exactly one pair of
 * sequences: X sequence number seqXNo against Y sequence number seqYNo.
 *
 * Parameters:
 *   back           - packed directions, addressed through BACK(x,y).
 *   rowWidth       - row stride used by the BACK(x,y) macro.
 *   prePPL         - output; one 32-bit element per step of the path, stored
 *                    at prePPL[startPosA + MEM_OFFSET*step]:
 *                      bits 31..20 : max(lengthX - indexX - 2, 0)
 *                      bits 19.. 8 : max(lengthY - indexY - 2, 0)
 *                                    (must fit into 12 bits)
 *                      bits  7.. 0 : 255 for an up/left step ("spare"
 *                                    element), otherwise the identity
 *                                    percentage of the whole alignment
 *                    The list is terminated by 0xFFFFFFFF.
 *   identityMatrix - output; .x = number of crosswise steps, .y = number of
 *                    crosswise steps whose two letters are equal.
 *   scores         - input; scores[startPosA] is compared against nwcutoff.
 *   nwcutoff       - pairs with score <= nwcutoff are not backtracked: their
 *                    prePPL list is just the terminator and identityMatrix
 *                    is set to (1, 0).
 *   border         - when true, threads with seqXNo > seqYNo return
 *                    immediately.
 *
 * Names used but not declared in this file (tex1Starts, tex2Starts, texSeqsX,
 * texSeqsY, winSize, memOffset) must be provided by the including code.
 ******************************************************************************/
__global__ void NeedlemanWunschGlobalBackKernel(unsigned int* back,
                                                short rowWidth,
                                                unsigned int* prePPL,
                                                short2* identityMatrix,
                                                int* scores,
                                                int nwcutoff,
                                                bool border = false)
{
    if(border && (seqXNo > seqYNo))
        return;

    //lengths of both sequences: difference of two consecutive start offsets
    short2 lengthXY;
    lengthXY.x = tex1Starts[seqXNo + 1] - startX;
    lengthXY.y = tex2Starts[seqYNo + 1] - startY;

    if((lengthXY.x == 0) || (lengthXY.y == 0))//if there is nothing to do -> quit
        return;

    //startPosA == index of this sequence pair; base offset into all buffers
    int startPosA = seqYNo * WIN_SIZE + seqXNo;

    short2 length_identical;
    length_identical.x = 0; //total length of alignment (without spaces)
    length_identical.y = 0; //number of identical characters

    //CUT-OFFS on score
    if (scores[startPosA] <= nwcutoff)
    {
        prePPL[startPosA] = 0xFFFFFFFF;
        length_identical.x = 1;
        identityMatrix[startPosA] = length_identical;
        return;
    }

    short2 indexXY;
    indexXY.x = lengthXY.x - 1; //lengthX (-1 because of addressing in BACK(x,y))
    indexXY.y = lengthXY.y - 1; //lengthY


    short carret = 0;       //number of prePPL elements written so far
    unsigned int element;

    unsigned int back8 = BACK(indexXY.x, indexXY.y);

    unsigned char prevDirection = CROSSWISE;// 1100 == 12 =>crosswise
    unsigned char back1; //current element of back array
    unsigned char todo;  //direction of the step taken in this iteration

    //move the element of row indexXY.y to the lowest 4 bits: the first row of
    //an 8-row group sits in the most significant 4 bits of the word
    back8 >>= ((8 - ((indexXY.y + 1) % 8)) % 8) * 4;

    back1 = back8 & ELEMENT;
    back8 >>= 4;


    unsigned char tmpIdentity = 255;


    while(back1 & DIRECTION) //while(direction != STOP)
    {
        tmpIdentity = 255;

        //A gap that was entered in the previous step and is marked as
        //"continue" in the cell we came from has priority over the direction
        //stored in the current cell.
        if( ((prevDirection & DIRECTION) == UP) && (prevDirection & CONTIN_UP) )
        {
            todo = UP;
        }
        else if( ((prevDirection & DIRECTION) == LEFT) && (prevDirection & CONTIN_LEFT) )
        {
            todo = LEFT;
        }
        else if ((back1 & DIRECTION) == UP)
        {
            todo = UP;
        }
        else if ((back1 & DIRECTION) == LEFT)
        {
            todo = LEFT;
        }
        else //if (back1 & DIRECTION == CROSSWISE)
        {
            todo = CROSSWISE;
        }


        if (todo == LEFT)
        {
            indexXY.x--;
            back8 = BACK(indexXY.x, indexXY.y);
            back8 >>= ((8 - ((indexXY.y + 1) % 8)) % 8) * 4; //because of the last row of back array
        }
        else if (todo == UP)
        {
            indexXY.y--;
            //A new word is needed when the new row is the last row of an
            //8-row group (7, 15, ...) or the -1 row. The test is written with
            //(y + 1) because in C++ (-1 % 8) is -1, not 7: a plain
            //"(y % 8) == 7" never reloads the word for row -1 and the
            //traceback would stop at (x,-1) instead of following the -1 row
            //to the left. No shift is needed: for these rows the wanted
            //element is already in the lowest 4 bits.
            if(((indexXY.y + 1) % 8) == 0)
                back8 = BACK(indexXY.x, indexXY.y);
        }
        else //if (todo == CROSSWISE)
        {
            length_identical.x++;
            if(tex1Dfetch(texSeqsX, startX + indexXY.x) == tex1Dfetch(texSeqsY, startY + indexXY.y))
                length_identical.y++;

            tmpIdentity = 0;

            indexXY.x--;
            indexXY.y--;

            back8 = BACK(indexXY.x, indexXY.y);
            back8 >>= ((8 - ((indexXY.y + 1) % 8)) % 8) * 4; //because of the last row of back array
        }


        //remember the step just taken together with the continue bits of the
        //cell it started from
        prevDirection = todo | (back1 & (CONTIN_UP | CONTIN_LEFT));
        back1 = back8 & ELEMENT;
        back8 >>= 4;


        //if tmpIdentity == 255 then the element is spare
        //(needed only to keep memory writes aligned)


        // Why max? Because if algorithm starts with going up or left
        // (not diagonally) then element would be 4095 instead of -1.
        // We do not need to remember "-1" value, since it has
        // identity value == 255.
        element = max(lengthXY.x - indexXY.x - 2, 0);
        element <<= 12;
        element |= max(lengthXY.y - indexXY.y - 2, 0);
        element <<= 8;
        element |= tmpIdentity;
        prePPL[startPosA + MEM_OFFSET*carret] = element;
        carret++;

    }

    // prePPL elements:
    // - if prePPL element == 0xFFFFFFFF then this is the last element
    // - if prePPL ends with 0xFF then this element is spare
    //
    prePPL[startPosA + MEM_OFFSET*carret] = 0xFFFFFFFF;

    //calculating % of identity
    //A path without any crosswise step has length_identical.x == 0; report 0%
    //for it instead of dividing by zero.
    //NOTE(review): identityMatrix still receives .x == 0 in that case - confirm
    //that the consumer of identityMatrix does not divide by it.
    unsigned char identity = 0;
    if (length_identical.x != 0)
        identity = (unsigned char)((length_identical.y * 100) / length_identical.x);

    //store the identity in the low byte of every element written above; spare
    //elements keep 255 because OR-ing into 0xFF does not change it
    for(short i=0; i<carret; i++)
    {
        element = prePPL[startPosA + MEM_OFFSET*i];
        element |= identity;
        prePPL[startPosA + MEM_OFFSET*i] = element;
    }

    identityMatrix[startPosA] = length_identical;

}

// Remove the backtracking masks defined for NeedlemanWunschGlobalBackKernel
// so that they do not leak into files including this header.
#undef STOP
#undef UP
#undef LEFT
#undef CROSSWISE
#undef DIRECTION
#undef CONTIN_UP
#undef CONTIN_LEFT
#undef ELEMENT

// Remove the file-local shorthands defined at the top of this header.
#undef Y_STEPS
#undef WIN_SIZE
#undef MEM_OFFSET
#undef BLOCK_SIZE
#undef seqXNo
#undef seqYNo
#undef startX
#undef startY

// Remove the "back" addressing macro.
#undef BACK




#endif
