#ifndef __EXTENDED_LIBRARY_BUILDER_GPU_CUH__
#define __EXTENDED_LIBRARY_BUILDER_GPU_CUH__

#include <vector_types.h>
#include <cuda_runtime.h>

#include "cuda_declarations.h"
#include "data_management.cuh"
#include "thread_manager.h"
#include "sequences.h"
#include "main_cu.h"
#include "exceptions.h"
#include "hi_res_timer.h"

#include "defines_gpu.cuh"

/*******************************************************************************
 * initExtendedLibrary - initialization of the extended library.               *
 * Each entry starts out as the weight taken from the primary library (PL),    *
 * because its final value is: PL weight + reinforcement contributions.        *
 *******************************************************************************/

__global__ void initExtendedLibrary(unsigned int* baseWin, unsigned int* results, int elementsCount)
{
    // One thread per element; the original notes state a block shape of 1 x 256.
    const int idx = threadIdx.x + blockIdx.x*blockDim.x;

    // Threads past the end of the data have nothing to do.
    if(idx >= elementsCount)
        return;

    // Only the low byte (the weight) of the primary-library entry is kept.
    const unsigned int packedEntry = baseWin[idx];
    results[idx] = packedEntry & 0xFF;
}


// Number of threads per block assumed by extLibBuilder.
// There is no separate SEQ3_SHM_LEN: the shared buffer for the third
// sequence holds exactly BLOCK_WIDTH elements (one per thread).
#define BLOCK_WIDTH  256

// Original sizing rationale (kept for reference):
// 16K                    - 1K                             = 15K
// (entire shared memory)   (memory for the third sequence)
// 15K / 3 = 5K
// (the remainder is split into three identical parts:
//   1) for the base sequence
//   2) for the results
//   3) for the second sequence - the first reinforcing one)
// 5K / 4 = 1280
// (expressed as a count of 4-byte integers)
// 1280 - 256 = 1024
// (to use a little less shared memory than is available)
//#define SEQ1_SHM_LEN 1280
//#define SEQ2_SHM_LEN 1024

// However, tests showed that a size of 512
// for both buffers performs best.

// Buffer size (in elements) for the base sequence and for its results.
#define SEQ1_SHM_LEN 512

// Buffer size (in elements) for the second sequence.
#define SEQ2_SHM_LEN 512

/*
 * extLibBuilder - adds reinforcement weights to the extended library.
 *
 * Every block handles one base alignment (e.g. AB) and, for every other
 * sequence C, looks for matching elements in the two reinforcing alignments
 * (read below as "seq2" from baseWin and "seq3" from reinWin). For each match
 * the smaller of the two reinforcing weights is added to the base element's
 * result, which starts out as the base element's own weight.
 *
 * Launch layout: blockIdx.x / blockIdx.y select the alignment inside the
 * current window pair; threads are laid out along x. The kernel strides by
 * BLOCK_WIDTH and indexes a BLOCK_WIDTH-sized buffer with threadIdx.x, so it
 * must be launched with blockDim.x == BLOCK_WIDTH (256).
 *
 * Element encoding (as decoded below): bits 20 and up hold the first
 * coordinate, bits 8..19 the second coordinate, bits 0..7 the weight.
 * A weight of 255 marks the end of a sequence.
 *
 * Parameters:
 *   baseWin        - elements of the base window, indexed relative to the
 *                    window's first element
 *   reinWin        - elements of the reinforcing window, indexed likewise
 *   starts         - start offsets; alignment (row, col) occupies
 *                    [starts[row*sequenceNumber + col],
 *                     starts[row*sequenceNumber + col + 1])
 *   results        - output, same indexing as baseWin
 *   sequenceNumber - number of sequences (row length of the starts table)
 *   winHeight      - window height
 *   windowX        - index of the reinforcing window
 *   windowY        - index of the base window
 *
 * Shared memory: (2*SEQ1_SHM_LEN + SEQ2_SHM_LEN + BLOCK_WIDTH) unsigned ints,
 * statically allocated.
 */
__global__ void extLibBuilder(unsigned int* baseWin,
                              unsigned int* reinWin,
                              unsigned int* starts,
                              unsigned int* results,
                              unsigned short sequenceNumber,      // window width
                              unsigned short winHeight,           // window height
                              unsigned short windowX,             // window X (REIN. WIN NO.)
                              unsigned short windowY)             // window Y (BASE WIN NO.)
{
    // Block shape: 1 x 256
    // each block processes one alignment (e.g. AB with 'C')

    // On a diagonal window only the elements on one side of the diagonal
    // are computed. The test depends on blockIdx only, so a whole block
    // leaves together and no barrier is skipped by part of a block.
    if((windowX == windowY) && (blockIdx.x >= blockIdx.y))
        return;

    // Global sequence numbers: the window origin plus the in-window
    // coordinate given by the block index. Kept in 32 bits so that neither
    // the sum nor the later products with sequenceNumber get truncated.
    const unsigned int seqCount = sequenceNumber;
    const unsigned int globalX  = (unsigned int)windowX*winHeight + blockIdx.x;
    const unsigned int globalY  = (unsigned int)windowY*winHeight + blockIdx.y;

    if(globalY >= seqCount)
        return;
    if(globalX >= seqCount)
        return;

    // 1 seq = base sequence                e.g. AB
    // 2 seq = reinforcing sequence no. 1   e.g. AC
    // 3 seq = reinforcing sequence no. 2   e.g. BC
    // shared memory for the 1st, 2nd and 3rd sequences and the results
    __shared__ unsigned int shm1seq[SEQ1_SHM_LEN];
    __shared__ unsigned int shmResults[SEQ1_SHM_LEN];
    __shared__ unsigned int shm2seq[SEQ2_SHM_LEN];
    __shared__ unsigned int shm3seq[BLOCK_WIDTH];

    const int tid = (int)threadIdx.x;

    // Offsets of the first element of each window; window data is addressed
    // relative to these.
    const unsigned int startBaseWin = starts[(unsigned int)windowY*winHeight*seqCount];
    const unsigned int startReinWin = starts[(unsigned int)windowX*winHeight*seqCount];

    const unsigned int rowY = globalY*seqCount;   // row of the starts table for the base window
    const unsigned int rowX = globalX*seqCount;   // row of the starts table for the reinforcing window

    const unsigned int seq1start = starts[rowY + globalX];
    const unsigned int seq1stop  = starts[rowY + globalX + 1];
    // int (not short): lengths of 32768 or more must not be truncated
    const int seq1length = (int)(seq1stop - seq1start);

    // The base alignment is processed in parts of SEQ1_SHM_LEN elements.
    for(int seq1offset=0; seq1offset<seq1length; seq1offset+=SEQ1_SHM_LEN)
    {
        // Fetch a part of the base sequence. Each thread later reads back
        // only the slots it wrote itself, so no barrier is needed here.
        for(int i=0; (i<SEQ1_SHM_LEN) && (i + seq1offset < seq1length); i+=BLOCK_WIDTH)
        {
            if(tid + i + seq1offset < seq1length)
            {
                shm1seq[i + tid] = baseWin[tid + i + seq1offset + seq1start - startBaseWin];
                // The result starts as the base weight, so the results array
                // does not have to be initialised beforehand.
                shmResults[i + tid] = shm1seq[i + tid] & 0xFF;
            }
        }

        // Iterate through all 'letters'
        // (e.g. AB with C, D, E, F..., omitting A and B themselves).
        for(unsigned int seqNo=0; seqNo<seqCount; seqNo++)
        {
            if((seqNo==globalX)||(seqNo==globalY))
                continue;

            // e.g. 1 2 with 3
            // 1 2 - seq1
            // 3 2 - seq2
            // 3 1 - seq3 (from the reinforcement window)

            // Determine the extents of the 2nd and 3rd sequences.
            const unsigned int seq2start = starts[rowY + seqNo];
            const unsigned int seq2stop  = starts[rowY + seqNo + 1];
            const int seq2length = (int)(seq2stop - seq2start);

            const unsigned int seq3start = starts[rowX + seqNo];
            const unsigned int seq3stop  = starts[rowX + seqNo + 1];
            const int seq3length = (int)(seq3stop - seq3start);

            // Iterate through the 2nd sequence in parts of SEQ2_SHM_LEN.
            for(int seq2offset=0; seq2offset<seq2length; seq2offset+=SEQ2_SHM_LEN)
            {
                // Fetch a part of the 2nd sequence. The previous readers of
                // shm2seq finished before the barrier that ends the seq3 loop.
                for(int i=0; (i<SEQ2_SHM_LEN) && (i + seq2offset < seq2length); i+=BLOCK_WIDTH)
                {
                    if(tid + i + seq2offset < seq2length)
                    {
                        shm2seq[i + tid] = baseWin[tid + i + seq2offset + seq2start - startBaseWin];
                    }
                }

                // Number of shm2seq slots that hold data of this part; slots
                // beyond it contain stale values and must not be scanned.
                const int seq2valid = min(SEQ2_SHM_LEN, seq2length - seq2offset);

                // Iterate through the 3rd sequence in parts of BLOCK_WIDTH.
                for(int seq3offset=0; seq3offset < seq3length; seq3offset+=BLOCK_WIDTH)
                {
                    // Fetch a part of the 3rd sequence (one element per thread).
                    if(tid + seq3offset < seq3length)
                    {
                        shm3seq[tid] = reinWin[tid + seq3offset + seq3start - startReinWin];
                    }

                    // Number of shm3seq slots that hold data of this part.
                    const int seq3valid = min(BLOCK_WIDTH, seq3length - seq3offset);

                    __syncthreads();// make the shared buffers visible to all threads

                    // All data is fetched; check the reinforcement.
                    // The 1st-sequence buffer is larger than BLOCK_WIDTH,
                    // so every thread may own several of its elements.
                    for(int i1seq=0; i1seq<SEQ1_SHM_LEN; i1seq+=BLOCK_WIDTH)
                    {
                        if(i1seq+tid+seq1offset < seq1length)
                        // only threads that have something to do proceed
                        {
                            const unsigned int elem1 = shm1seq[i1seq+tid];
                            const int seq1x = (int)(elem1>>20);
                            const int seq1y = (int)((elem1>>8 )&0xFFF);
                            const int seq1w = (int)((elem1    )&0xFF);// weight

                            // End of sequence: leaves only the i1seq loop,
                            // the barrier below is still reached.
                            if(seq1w==255)
                                break;

                            // Each thread scans the loaded part of the 2nd sequence.
                            for(int i2seq=0; i2seq<seq2valid; i2seq++)
                            {
                                const unsigned int elem2 = shm2seq[i2seq];
                                const int seq2x = (int)(elem2>>20);
                                const int seq2y = (int)((elem2>>8 )&0xFFF);
                                const int seq2w = (int)((elem2    )&0xFF);// weight

                                // Stop once the 2nd sequence has passed the
                                // wanted coordinate or has ended.
                                if((seq1y<seq2y) || (seq2w==255))
                                    break;

                                // Scan the 3rd sequence only when the 2nd
                                // sequence element matches.
                                if(seq1y==seq2y)
                                    for(int i3seq=0; i3seq<seq3valid; i3seq++)
                                    {
                                        const unsigned int elem3 = shm3seq[i3seq];
                                        const int seq3x = (int)(elem3>>20);
                                        const int seq3y = (int)((elem3>>8 )&0xFFF);
                                        const int seq3w = (int)((elem3    )&0xFF);// weight

                                        if((seq1x<seq3y) || (seq3w==255))
                                            break;

                                        if((seq1x==seq3y)&&(seq2x==seq3x))
                                        {
                                            // Each thread updates only its own result slot.
                                            shmResults[i1seq+tid] += (unsigned int)min(seq2w, seq3w);
                                        }
                                    }

                            }
                        }
                    }

                    __syncthreads();// keep the input buffers unchanged until all threads of the block finish

                }
            }
        }

        __syncthreads();
        // Save the results of this part of the base sequence.
        for(int i=0; (i<SEQ1_SHM_LEN) && (i + seq1offset < seq1length); i+=BLOCK_WIDTH)
        {
            if(tid + i + seq1offset < seq1length)
            {
                results[tid + i + seq1offset + seq1start - startBaseWin] = shmResults[i + tid];
            }
        }
    }
}


// Remove the kernel-tuning macros so they do not remain defined
// after this header has been included.
#undef BLOCK_WIDTH
#undef SEQ1_SHM_LEN
#undef SEQ2_SHM_LEN


#endif
